# Datetime

# Pandas and `datetime`

![](dtime.jpg)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller

plt.style.use('seaborn-deep')

import warnings
warnings.filterwarnings('ignore')

In [None]:
print(plt.style.available)

## Dates as types

<center>
    <img src = kick.jpg width = 500 height = 300 />
    </center>

In [None]:
kickstart = pd.read_csv('data/kickstart2018.csv')

In [None]:
kickstart.head(2)

In [None]:
kickstart[['launched', 'deadline']].info()

In [None]:
#convert to datetime
kickstart['launched'] = pd.to_datetime(kickstart['launched'])
kickstart['deadline'] = pd.to_datetime(kickstart['deadline'])

In [None]:
kickstart[['launched', 'deadline']].info()

### Using `datetime` objects

In [None]:
time_1 = kickstart.loc[4, 'launched']

In [None]:
time_1

In [None]:
time_1.hour

In [None]:
time_1.year

### Building Features with date attributes

In [None]:
kickstart['month_launched'] = kickstart['launched'].dt.month
kickstart['month_due'] = kickstart['deadline'].dt.month

In [None]:
kickstart

## Time Series

> *A time series is a series of data points indexed (or listed or graphed) in time order. Most commonly, a time series is a sequence taken at successive equally spaced points in time. Thus it is a sequence of discrete-time data. Examples of time series are heights of ocean tides, counts of sunspots, and the daily closing value of the Dow Jones Industrial Average.* -- [Wikipedia](https://en.wikipedia.org/wiki/Time_series)

In [None]:
ge = pd.read_csv('data/GE.csv')

In [None]:
ge.head(2)

In [None]:
ge['Adj Close'].plot( figsize = (15, 5))

## `.to_datetime()`

In [None]:
ge.info()

In [None]:
ge['Date'] = pd.to_datetime(ge.Date)

In [None]:
ge.info()

## Set the index

In [None]:
ge.set_index('Date', inplace = True)
ge.head(2)

In [None]:
ge['Adj Close'].plot( figsize = (15, 5))

## `resample`

Convenience method for frequency conversion and resampling of time series.  -- [Pandas Documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html)



In [None]:
ge.resample('W')['Volume'].sum().head()  # similar to group by 

In [None]:
ge.resample('Q')['Adj Close'].mean().head()

In [None]:
ge.resample('Y')['Adj Close'].mean().head()

# Moving Average

In [None]:
import quandl

## Quandl and Financial Data

In [None]:
#!pip install quandl
# quandl.ApiConfig.api_key = 'your_api_key'

In [None]:
import os

# Load the Quandl API key from the environment rather than committing a credential
# with the notebook. Export QUANDL_API_KEY before starting the kernel; if it is not
# set, this assigns None.
# NOTE(review): a key that was previously committed in this cell should be treated
# as exposed and rotated.
quandl.ApiConfig.api_key = os.environ.get('QUANDL_API_KEY')

In [None]:
aapl = quandl.get('WIKI/AAPL')

In [None]:
aapl.head()

In [None]:
aapl['Adj. Close'].plot( figsize = (15, 5))

## Mean

In [None]:
aapl['Adj. Close'].mean()

In [None]:
aapl['Adj. Close'].max()

In [None]:
aapl['Adj. Close'].min()

In [None]:
aapl['Adj. Close'].plot( figsize = (15, 5))

### `.rolling()`

Pandas built-in rolling method

In [None]:
aapl['Adj. Close'].rolling(10)  # 10-row rolling window; chain .mean() to get the 10-day moving average

In [None]:
aapl['Adj. Close'].rolling(10).mean().head()

In [None]:
aapl['Adj. Close'].rolling(10).mean().tail()

In [None]:
aapl['Adj. Close'].rolling(50).mean().plot( figsize = (15, 5))

### Volatility

In [None]:
aapl['Adj. Close'].rolling(10).std().plot( figsize = (15, 5))  # rolling the STD of the Data

### Stationarity

> *In mathematics and statistics, a stationary process (a.k.a. a strict/strictly stationary process or strong/strongly stationary process) is a stochastic process whose unconditional joint probability distribution does not change when shifted in time. Consequently, parameters such as mean and variance also do not change over time. -- [Wikipedia](https://en.wikipedia.org/wiki/Stationary_process)*

In [None]:
# Example of a trigonometric function (cosine) with some random noise added to it

x = np.linspace(0, 6*np.pi, 200)
y = np.cos(x) + np.random.random(200)

In [None]:
# Side-by-side comparison: noisy cosine (left) vs. AAPL adjusted close (right)
plt.figure(figsize = (15, 6))
plt.subplot(121)
plt.plot(x, y, 'o')
# Raw string so the LaTeX backslashes are not parsed as (invalid) Python escape sequences
plt.title(r'$f(x) = \cos{x} + \epsilon$')
plt.subplot(122)
plt.plot(aapl['Adj. Close'])
plt.title('AAPL Adj. Close');

### `.diff()`

$$\text{today's adjusted closing price} - \text{yesterday's adjusted closing price}$$

In [None]:
aapl['Adj. Close'].diff(1).head()

In [None]:
aapl['Adj. Close'].diff(1).plot( figsize = (15, 5))

In [None]:
aapl['Adj. Close'].diff(2).plot( figsize = (15, 5))

In [None]:
aapl['Adj. Close'].diff(10).plot( figsize = (15, 5))

### Testing for Stationarity

> *In statistics and econometrics, an augmented Dickey–Fuller test (ADF) tests the null hypothesis that a unit root is present in a time series sample. The alternative hypothesis is different depending on which version of the test is used, but is usually stationarity or trend-stationarity. It is an augmented version of the Dickey–Fuller test for a larger and more complicated set of time series models. -- [Wikipedia](https://en.wikipedia.org/wiki/Augmented_Dickey%E2%80%93Fuller_test)*

In [None]:
from statsmodels.tsa.stattools import adfuller
# ADF null hypothesis: the series has a unit root (i.e. it is non-stationary).
# A small p-value lets us reject the null (evidence of stationarity); otherwise we fail to reject it.

In [None]:
recent_data = aapl['2017':]['Adj. Close']

In [None]:
#testing original series
adfuller(recent_data)[1]

In [None]:
#testing first difference of series
adfuller(recent_data.diff(1).dropna())[1]

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize = (15, 5))
ax[0].plot(recent_data)
ax[0].set_title('Original Series')
ax[1].plot(recent_data.diff(1).dropna())
ax[1].set_title('Diff = 1')

# Autocorrelation and Partial Autocorrelation

### Autocorrelation

$$r_1 = \displaystyle \frac{\sum_{t = 1}^{N-1}(x_t - \bar{x}_{(1)})(x_{t+1} - \bar{x}_{(2)})}{\sqrt{\sum_{t=1}^{N-1}(x_t - \bar{x}_{(1)})^2}\,\sqrt{\sum_{t=1}^{N-1}(x_{t+1} - \bar{x}_{(2)})^2}}$$

In [None]:
from pandas.plotting import autocorrelation_plot

In [None]:
# Finding correlation between successive observations in a time series
aapl['Adj. Close'].tail()

In [None]:
autocorrelation_plot(recent_data)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

In [None]:
plot_acf(recent_data, lags=300);

In [None]:
# Compare the ACF of the series (from observation 250 onward) with the ACF of its first difference
print(recent_data[250:].head(2))
# plt.subplots() already creates the figure the axes live on; a separate plt.figure()
# call would only open an additional, empty figure
fig, ax = plt.subplots(1, 2, figsize = (15, 6))
plot_acf(recent_data[250:], ax = ax[0]);
plot_acf(recent_data[250:].diff().dropna(), ax = ax[1]);

### Partial Autocorrelation

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf

In [None]:
# Compare the PACF of the series (from observation 250 onward) with the PACF of its first difference
print(recent_data[250:].head(2))
# plt.subplots() already creates the figure the axes live on; a separate plt.figure()
# call would only open an additional, empty figure
fig, ax = plt.subplots(1, 2, figsize = (15, 6))
plot_pacf(recent_data[250:], ax = ax[0]);
plot_pacf(recent_data[250:].diff().dropna(), ax = ax[1]);

### Airbnb Example

<center>
    <img src = 'airbnb.jpg' height = 400 width = 500 />
    </center>


In [None]:
bnb = pd.read_csv('data/listings.csv')

In [None]:
bnb.head(5)

In [None]:
revs = bnb.groupby('last_review')['id'].count()

In [None]:
revs.index = pd.to_datetime(revs.index)

In [None]:
revs = revs.sort_index(ascending = False)
revs.fillna(value = 0, inplace = True)

In [None]:
revs.head()

In [None]:
import seaborn as sns
plt.figure(figsize = (10, 5))
# Pass x and y by keyword: recent seaborn releases no longer accept them positionally
sns.barplot(x = revs.index.month, y = revs)  # change to time series object
# NOTE(review): barplot aggregates with the mean by default, so each bar is the average
# per-date count for that calendar month, not a monthly total -- confirm this matches the title
plt.title('Monthly Review Count', fontsize = 16, loc = 'left');

In [None]:
#plot one year of reviews
fig, ax = plt.subplots(nrows = 2, ncols = 1)
revs.head(365).plot(figsize = (15, 6), ax = ax[0])
revs.rolling(10).mean().head(365).plot(ax = ax[0])

#two years of reviews
revs.head(730).plot(figsize = (15, 6), ax = ax[1])
revs.rolling(10).mean().head(730).plot(ax = ax[1])

In [None]:
#2018 onward stationary?
# revs is sorted newest-first, so restore chronological order before slicing by date;
# otherwise the slice starts at 2018 and runs back to the oldest entry, in reverse time order
adfuller(revs.sort_index().loc["2018":])[1]

In [None]:
#most recent 365 entries stationary?
# revs is sorted newest-first, so revs[-365:] would select the OLDEST 365 entries;
# sort chronologically first so the tail really is the most recent data
adfuller(revs.sort_index().iloc[-365:])[1]

In [None]:
#First diff test on the most recent 365 entries
# revs is sorted newest-first; sort chronologically so the tail is the most recent data
# and the difference is taken forward in time
adfuller(revs.sort_index().iloc[-365:].diff().dropna())[1]

In [None]:
# ACF and PACF of the most recent 365 entries.
# revs is sorted newest-first, so take the tail of the chronologically sorted series
# (revs[-365:] would be the oldest entries, in reverse time order)
revs_recent = revs.sort_index().iloc[-365:]

fig, ax = plt.subplots(nrows=1, ncols = 2, figsize = (15, 6))
plot_acf(revs_recent, lags = 100, ax = ax[0], title = 'ACF');

plot_pacf(revs_recent, lags = 100, ax = ax[1], title = 'Pacf');

In [None]:
# ACF of the most recent 365 entries, before and after first differencing.
# revs is sorted newest-first, so take the tail of the chronologically sorted series
# (revs[-365:] would be the oldest entries, in reverse time order)
revs_recent = revs.sort_index().iloc[-365:]

fig, ax = plt.subplots(nrows=1, ncols = 2, figsize = (15, 6))
plot_acf(revs_recent, lags = 100, ax = ax[0], title = 'Original Series');

plot_acf(revs_recent.diff(1).dropna(), lags = 100, ax = ax[1], title = 'First Difference');

# Autocorrelation and Autoregression

In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# plt.style.use('seaborn-darkgrid')

In [None]:
shampoo = pd.read_excel('data/ShampooSales.xls')

In [None]:
shampoo.head()

In [None]:
shampoo.info()

In [None]:
shampoo.set_index('Month', inplace = True)

In [None]:
shampoo.plot(figsize = (14, 5))

### Is it stationary?

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
adfuller(shampoo['Shampoo Sales'])[1]

In [None]:
adfuller(shampoo['Shampoo Sales'].diff(1).dropna())[1]

In [None]:
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 5))
plot_acf(shampoo['Shampoo Sales'], ax = ax[0]);
plot_acf(shampoo['Shampoo Sales'].diff(1).dropna(), ax = ax[1]);

### Autoregression

$$X_{t}=c+\varphi X_{{t-1}}+\varepsilon _{t}$$

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from statsmodels.tsa.arima_model import ARIMA

### Train/Test Split

![](ttest.png)

In [None]:
shampoo.plot(figsize = (15, 6))
plt.axvline('1997-07-01', color = 'black')
plt.text('1995-06-01', 400, 'Training Data', fontsize = 20)
plt.text('1997-08-01', 400, 'Test Data', fontsize = 20)
plt.title('Train/Test split in time series', loc = 'left')
plt.savefig('ttest.png')

In [None]:
#split data by time
train = shampoo[:-5]
test = shampoo[-5:]

In [None]:
#build AR model on dataset diffed by 1, using one previous term
#in autoregression specified by order
ar = ARIMA(train, order=(1, 1, 0))

# The first number (p) is the number of lagged terms to use in the autoregression
# The second number (d) is the order of differencing -- how many diffs it takes to make the data stationary
# The third number (q) is the number of moving-average terms

# Since the moving-average order is 0, we get a pure autoregressive model
# If we change the first number to 0 and set the last number to non-zero, we get a moving-average model

In [None]:
model = ar.fit()

In [None]:
#predict the held-out test period (the rows immediately after the training data)
# The model was fit with d=1, so ask for typ='levels' to get predictions on the original
# sales scale; the default typ='linear' returns predictions of the differenced series
preds = model.predict(start = len(train), end=len(train) + len(test) - 1, typ = 'levels')

In [None]:
preds

In [None]:
#predict 5 steps (months) ahead
forecast = model.forecast(steps = 5)

In [None]:
#get predictions
preds = forecast[0]

In [None]:
forecast
#The forecast actually returns several pieces of information, including confidence intervals around these values

In [None]:
forecast[0]

In [None]:
fdf = pd.DataFrame(preds, index = shampoo.tail().index)

In [None]:
fdf

In [None]:
shampoo['predictions'] = fdf

In [None]:
# fdf = shampoo['predictions']
# fdf

In [None]:
shampoo.plot(figsize = (12, 5))

In [None]:
shampoo.tail(8)

### Evaluating Results

In [None]:
model.summary()

In [None]:
#insample predictions
fig, ax = plt.subplots(1, 1, figsize = (12, 5))
model.plot_predict(ax = ax);

### MSE and RMSE

In [None]:
shampoo.tail()

In [None]:
#keep rows with both observations
error_df = shampoo.dropna()

In [None]:
#add a new error squared column
error_df['squared_error'] = (shampoo['Shampoo Sales'] - shampoo['predictions'])**2

In [None]:
error_df

In [None]:
#find mse and rmse
mse = np.mean(error_df['squared_error'])
rmse = np.sqrt(mse)
print(f'The MSE is {mse}\nThe RMSE is {rmse}')

In [None]:
def arima_summary(order_tup):
    """Fit an ARIMA model on ``train`` and score its forecast for the last 5 rows.

    The model is fit on the notebook-level ``train`` frame, a 5-step forecast is
    produced, and the forecast is compared with the final 5 rows of the shampoo
    sales sheet (re-read from disk so the notebook-level ``shampoo`` frame is
    left untouched). The MSE and RMSE are printed and the actual vs. predicted
    series are plotted.

    Parameters
    ----------
    order_tup : tuple
        The ``(p, d, q)`` order passed to ``ARIMA``.

    Returns
    -------
    pandas.DataFrame
        The last 5 rows, indexed by ``Month``, with columns ``Shampoo Sales``,
        ``predictions`` and ``squared_error``.
    """
    # Forecast horizon; matches the 5-row hold-out used to build `train` (shampoo[:-5])
    horizon = 5

    fitted = ARIMA(train, order=order_tup).fit()

    # The forecast result is a tuple-like object; its first element holds the point forecasts
    preds = fitted.forecast(steps=horizon)[0]

    # Reload the raw data and attach the forecast to the final `horizon` rows
    res_df = pd.read_excel('data/ShampooSales.xls')
    res_df.set_index('Month', inplace=True)
    res_df['predictions'] = np.nan
    res_df.iloc[-horizon:, res_df.columns.get_loc('predictions')] = preds

    # Work on an explicit copy so adding a column does not write into a slice of res_df
    error_df = res_df.tail(horizon).copy()

    #compute and print errors
    error_df['squared_error'] = (error_df['Shampoo Sales'] - error_df['predictions'])**2
    mse = np.mean(error_df['squared_error'])
    rmse = np.sqrt(mse)
    print(f'The MSE is {mse}\nThe RMSE is {rmse}')

    res_df.plot(figsize=(15, 5))
    return error_df

In [None]:
arima_summary((1, 1, 0))

In [None]:
arima_summary((2, 1, 0))

In [None]:
arima_summary((3, 1, 0))

# ARIMA Model

---------

**A**uto **R**egressive **I**ntegrated **M**oving **A**verage



> *The AR part of ARIMA indicates that the evolving variable of interest is regressed on its own lagged (i.e., prior) values. The MA part indicates that the regression error is actually a linear combination of error terms whose values occurred contemporaneously and at various times in the past. The I (for "integrated") indicates that the data values have been replaced with the difference between their values and the previous values (and this differencing process may have been performed more than once). The purpose of each of these features is to make the model fit the data as well as possible*. -- [Wikipedia](https://en.wikipedia.org/wiki/Autoregressive_integrated_moving_average)

In [None]:

# import pandas as pd
# plt.style.use('seaborn-darkgrid')
# from statsmodels.tsa.arima_model import ARIMA
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# from statsmodels.tsa.stattools import adfuller
# import warnings
# warnings.filterwarnings('ignore')

In [None]:
shampoo = pd.read_excel('data/ShampooSales.xls')
shampoo.set_index('Month', inplace = True)

In [None]:
shampoo_diff = shampoo.diff().dropna()
train = shampoo_diff[:-5]
test = shampoo_diff[-5:]

### The Autoregressive Model

In [None]:
# NOTE(review): `train` here is already first-differenced (it is a slice of shampoo.diff()),
# so d=1 differences the data a second time -- confirm whether order=(1, 0, 0) was intended
ar = ARIMA(train, order = (1, 1, 0))

In [None]:
model = ar.fit()

In [None]:
forecast = model.forecast(steps = 5)[0]

In [None]:
plt.figure(figsize = (15, 5))
plt.plot(test.index, forecast)
plt.plot(test)
plt.xticks(rotation = 80);

### ARIMA Model Parameters

```
order(p, d, q)
```

- d: The Augmented Dickey–Fuller test helps determine the amount of differencing needed.

- p: The lag beyond which the PACF cuts off is the indicated number of AR terms.

- q: The lag beyond which the ACF cuts off is the indicated number of MA terms.

In [None]:
adfuller(shampoo['Shampoo Sales'])[1]

In [None]:
adfuller(shampoo['Shampoo Sales'].diff().dropna())[1]

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (15, 5))
plot_acf(shampoo['Shampoo Sales'].diff().dropna(), ax = ax[0])

plot_pacf(shampoo['Shampoo Sales'].diff().dropna(), ax = ax[1]);

In [None]:
arima = ARIMA(train, order=(1, 1, 1))

In [None]:
model = arima.fit()

In [None]:
model.plot_predict();

In [None]:
model.summary()

In [None]:
def arima_summary(order_tup):
    """Fit an ARIMA model on the differenced ``train`` data and score it on sales levels.

    The model is fit on the notebook-level ``train`` frame, which holds first
    differences of the sales series. Its 5-step forecast is therefore a forecast
    of differences. The forecast is converted back to sales levels by adding the
    cumulative sum of the predicted differences to the last sales value that
    precedes the 5-row hold-out. The MSE and RMSE against the actual sales are
    printed and the actual vs. predicted series are plotted.

    NOTE(review): because ``train`` is already differenced, a non-zero ``d`` in
    ``order_tup`` differences the data again -- confirm that is intended.

    Parameters
    ----------
    order_tup : tuple
        The ``(p, d, q)`` order passed to ``ARIMA``.

    Returns
    -------
    pandas.DataFrame
        The last 5 rows, indexed by ``Month``, with columns ``Shampoo Sales``
        and ``predictions``.
    """
    # Forecast horizon; matches the 5-row hold-out used to build `train` (shampoo_diff[:-5])
    horizon = 5

    fitted = ARIMA(train, order=order_tup).fit()

    # The forecast result is a tuple-like object; its first element holds the point forecasts
    diff_preds = np.asarray(fitted.forecast(steps=horizon)[0])

    res_df = pd.read_excel('data/ShampooSales.xls')
    res_df.set_index('Month', inplace=True)

    # Undo the differencing using only information available at forecast time:
    # start from the last observed level before the hold-out and accumulate the
    # predicted changes. (Adding each predicted change to the actual value of the
    # same row would leak the value being predicted into the prediction.)
    last_known_level = res_df['Shampoo Sales'].iloc[-horizon - 1]
    res_df['predictions'] = np.nan
    res_df.iloc[-horizon:, res_df.columns.get_loc('predictions')] = (
        last_known_level + np.cumsum(diff_preds)
    )

    # Work on an explicit copy so adding a column does not write into a slice of res_df
    error_df = res_df.tail(horizon).copy()
    error_df['squared_error'] = (error_df['Shampoo Sales'] - error_df['predictions'])**2

    res_df.plot(figsize=(15, 5))

    #compute and print errors
    mse = np.mean(error_df['squared_error'])
    rmse = np.sqrt(mse)
    print(f'The MSE is {mse}\nThe RMSE is {rmse}')
    return res_df.tail()

In [None]:
arima_summary((1, 1, 1))

### `pmdarima`

![](pmdarima.png)

In [None]:
#pip install pmdarima
import pmdarima as pm

In [None]:
# fit stepwise auto-ARIMA (non-seasonal search over p and q, with d fixed at 1)
# seasonal=False, so the seasonal settings (m, D, start_P) are omitted:
# they only apply to seasonal models
stepwise_fit = pm.auto_arima(shampoo, start_p=1, start_q=1,
                             max_p=3, max_q=3,
                             seasonal=False,
                             d=1, trace=True,
                             error_action='ignore',  # don't want to know if an order does not work
                             suppress_warnings=True,  # don't want convergence warnings
                             stepwise=True)

In [None]:
stepwise_fit.summary()

In [None]:
stepwise_fit.plot_diagnostics(figsize = (15, 10));