# Time Series Model Building

## Steps to be followed in Data Ingestion
- Import the required libraries: pandas, numpy, matplotlib, seaborn, etc.
- Load the data
- Load the time series data into pandas dataframe
- Set the datetime column as the index of the dataframe
- Check the datatype of the index and convert it to datetime if necessary.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys

In [None]:
data=pd.read_csv('TSLA.CSV')

In [None]:
data.head()

In [None]:
## Performing univariate analysis
stock_data=data[["Date","Close"]]

In [None]:
stock_data.info()

In [None]:
stock_data.iloc[:,0]=pd.to_datetime(stock_data.Date)

In [None]:
stock_data.info()

In [None]:
stock_data=stock_data.set_index("Date")

In [None]:
stock_data

## Performing EDA Of the Data

In [None]:
stock_data.describe()

In [None]:
plt.figure(figsize=(14,8))
plt.plot(stock_data.Close)


In [None]:
plt.hist(stock_data.Close)

In [None]:
import seaborn as sns

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# scaling draws the equivalent histogram-plus-density figure.
sns.histplot(data=stock_data, x="Close", kde=True, stat="density")

In [None]:
## Line chart of the closing price over time
plt.style.use('ggplot')
plt.figure(figsize=(18, 8))
plt.grid(True)

# Axis titles
plt.xlabel('Dates', fontsize=20)
plt.ylabel('Close Price', fontsize=20)

# Tick label sizes
plt.xticks(fontsize=12)
plt.yticks(fontsize=15)

plt.plot(stock_data.Close, color='blue', linewidth=3)
plt.title('Tesla Stock Closing Price', fontsize=30)
plt.show()

In [None]:
## Histogram of the closing price
plt.style.use('ggplot')
plt.figure(figsize=(18,8))
plt.grid(True)
# A histogram has the observed values on the x axis and bin counts on the
# y axis, so label the axes accordingly (not 'Dates' / 'Close Price').
plt.xlabel('Close Price',fontsize=20)
plt.xticks(fontsize=12)
plt.ylabel('Frequency', fontsize = 20)
plt.yticks(fontsize=15)
plt.hist(stock_data['Close'],linewidth=3,color='blue')
plt.title('Tesla Stock Closing Price', fontsize=30)
plt.show()

In [None]:
## Kernel density estimate of the closing price
df_close=stock_data['Close']
df_close.plot(kind='kde',figsize=(18,8),linewidth=3)
# The first argument of plt.grid is the on/off flag, so pass a boolean
# rather than the string "both".
plt.grid(True)
plt.xlabel('Close Price', fontsize = 20)
plt.xticks(fontsize=12)
plt.ylabel('Density', fontsize = 20)
plt.yticks(fontsize=15)
plt.show()

In [None]:
rolmean=stock_data['Close'].rolling(12).mean()

In [None]:
rolmean

In [None]:
rolstd=stock_data['Close'].rolling(12).std()

In [None]:
rolstd

In [None]:

plt.plot(stock_data['Close'])

plt.plot(rolmean)
plt.plot(rolstd)

In [None]:
from statsmodels.tsa.stattools import adfuller
adft=adfuller(stock_data.Close)

In [None]:
pd.Series(adft[0:4],index=['test stats','p value','lag','data points'])

In [None]:
# test for stationary
def test_stationarity(timeseries, window=48):
    """Plot rolling statistics of a series and print its Dickey-Fuller test results.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to inspect. It is used for the rolling statistics (via
        ``.rolling``) and passed unchanged to ``adfuller``.
    window : int, default 48
        Number of observations in each rolling window. The default keeps the
        value that was previously hard-coded.

    Returns
    -------
    None
        The figure is shown and the labelled test output is printed.
    """
    # Determining rolling statistics
    rolmean = timeseries.rolling(window).mean()  # rolling mean
    rolstd = timeseries.rolling(window).std()  # rolling standard deviation

    # Plot rolling statistics:
    plt.figure(figsize=(18,8))
    # The first argument of plt.grid is the on/off flag, so pass a boolean
    # rather than the string 'both'.
    plt.grid(True)
    plt.plot(timeseries,color='blue',label='Original',linewidth=3)
    plt.plot(rolmean,color='red',label='Rolling Mean',linewidth=3)
    plt.plot(rolstd,color='black',label='Rolling Std',linewidth=4)
    plt.legend(loc='best',fontsize=20,shadow=True,facecolor='lightpink',edgecolor='k')
    plt.title('Rolling Mean and Standard Deviation',fontsize=25)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.show()

    print("Results of Dickey Fuller Test")
    adft = adfuller(timeseries,autolag='AIC')
    # adfuller returns an unlabelled tuple, so name the first four entries
    # and then add one row per critical value from the mapping at index 4.
    output = pd.Series(adft[0:4],index=['Test Statistics','P_Value','No_Of_Lags','Number of Data Point'])
    for key,values in adft[4].items():
        output['critical value (%s)'%key] = values
    print(output)
    
    

In [None]:
# Stationarity check on the raw (undifferenced) closing prices.
test_stationarity(stock_data["Close"])

In [None]:
## To check outliers

In [None]:
# Pass the series by keyword: a positional argument is interpreted
# differently across seaborn versions, while `x=` always places the values on
# the x axis.
sns.boxplot(x=stock_data.Close)

In [None]:
## Time series Decomposition

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
result=seasonal_decompose(stock_data[['Close']],period=12)

In [None]:
result.plot()

In [None]:
result.seasonal

In [None]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

In [None]:
plot_acf(stock_data.Close)

In [None]:
plot_pacf(stock_data.Close)

In [None]:
stock_data.isnull().sum()

In [None]:
## convert the data into stationary time series

In [None]:
df_close=stock_data['Close']

In [None]:
df_close

In [None]:
df_close=df_close.diff()
df_close=df_close.dropna()

In [None]:
test_stationarity(df_close)

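Since the p-value is less than 0.05, we reject the null hypothesis of the Dickey-Fuller test (that the series has a unit root, i.e. is non-stationary).
Hence the differenced series can be treated as stationary: the conversion from non-stationary to stationary was successful.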

In [None]:
df_close[0:-60]

In [None]:
# split the data into train and test 
train_data=df_close[0:-60]
test_data=df_close[-60:]
plt.figure(figsize=(18,8))
plt.grid(True)
plt.xlabel('Dates',fontsize=20)
plt.ylabel('Closing Prices',fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.plot(train_data,'green',label='Train Data',linewidth=5)
plt.plot(test_data,'blue',label="Test Data",linewidth=5)
plt.legend(fontsize=20,shadow=True,facecolor='lightpink',edgecolor='k')

In [None]:
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

In [None]:
history = [x for x in train_data]

In [None]:
history

In [None]:
model=ARIMA(history,order=(1,1,1))

In [None]:
model=model.fit()

In [None]:
model.summary()

In [None]:
model.forecast()

In [None]:
mse =mean_squared_error([test_data[0]],model.forecast())

In [None]:
mse

In [None]:
np.sqrt(mse)

In [None]:
def train_arima_model(X,y,arima_order):
    """Evaluate one ARIMA order by refitting before every test observation.

    For each value in ``y`` a model is fitted on the history seen so far, its
    one-step forecast is recorded, and the true value is then appended to the
    history.

    Parameters
    ----------
    X : iterable of float
        Training observations used to seed the history.
    y : iterable of float
        Held-out observations, consumed in order.
    arima_order : tuple
        ``(p, d, q)`` order passed to ``ARIMA``.

    Returns
    -------
    float
        Root mean squared error between ``y`` and the one-step forecasts.
    """
    history = list(X)
    # Materialise the test values once and walk them by position. Indexing a
    # date-indexed Series with ``y[t]`` depends on an integer-key fallback.
    actuals = list(y)
    predictions = []
    for actual in actuals:
        model_fit = ARIMA(history, order=arima_order).fit()
        predictions.append(model_fit.forecast()[0])
        # Reveal the true value only after forecasting it.
        history.append(actual)
    # Out-of-sample error over the whole test span.
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    return rmse

In [None]:
#evaluate different combinations of p,d,q values for an ARIMA model to get the best order for ARIMA model
def evaluate_models(dataset,test,p_values,d_values,q_values):
    """Grid-search ARIMA orders, printing each RMSE and then the best one.

    Parameters
    ----------
    dataset : array-like with ``astype``
        Training observations; cast to ``float32`` before fitting.
    test : iterable of float
        Held-out observations forwarded to ``train_arima_model``.
    p_values, d_values, q_values : iterable of int
        Candidate values for each component of the ``(p, d, q)`` order.

    Returns
    -------
    None
        Results are reported with ``print`` only.
    """
    dataset = dataset.astype('float32')
    best_score, best_cfg = float('inf'), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    rmse = train_arima_model(dataset, test, order)
                except Exception as exc:
                    # Keep the search best-effort, but report why an order was
                    # skipped. A bare ``except`` would also swallow
                    # KeyboardInterrupt and hide genuine bugs.
                    print('ARIMA%s skipped: %s' % (order, exc))
                    continue
                if rmse < best_score:
                    best_score, best_cfg = rmse, order
                print('ARIMA%s RMSE=%.3f' % (order, rmse))
    print('Best ARIMA%s RMSE=%.3f' % (best_cfg, best_score))

In [None]:
import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Candidate values 0, 1 and 2 for each of p, d and q.
p_values = range(3)
d_values = range(3)
q_values = range(3)
evaluate_models(train_data, test_data, p_values, d_values, q_values)

In [None]:
# Walk-forward forecast with ARIMA order (2, 0, 0): refit before each test
# observation, record the one-step forecast, then reveal the true value.
history = list(train_data)
predictions = []
for actual in test_data:
    model = ARIMA(history, order=(2, 0, 0))
    model = model.fit()
    # forecast() returns a length-1 array here; keep the scalar so that
    # `predictions` is a flat list of floats.
    fc = model.forecast()[0]
    predictions.append(fc)
    # Iterating over the Series yields values in order, which avoids integer
    # keys such as `test_data[i]` on a date-indexed Series.
    history.append(actual)
print(f"my RMSE {np.sqrt(mean_squared_error(test_data,predictions))}")

In [None]:
# Compare held-out values and forecasts on a shared positional x axis.
plt.figure(figsize=(18, 8))
plt.grid(True)

positions_true = range(len(test_data))
positions_pred = range(len(predictions))
plt.plot(positions_true, test_data, linewidth=5, label="True Test Close Value")
plt.plot(positions_pred, predictions, linewidth=5, label="Predictions on Test Data")

plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize=20, shadow=True, facecolor='lightpink', edgecolor='k')
plt.show()

In [None]:
# Flatten first: `predictions` may hold length-1 arrays rather than scalars,
# and the Series should contain plain floats aligned with the test dates.
fc_series = pd.Series(np.ravel(predictions), index=test_data.index)

In [None]:
# Plot the training data, test data and forecast on the date axis.
plt.figure(figsize=(18,8),dpi=200)
plt.plot(train_data,label="Training",color='blue')
plt.plot(test_data,label="Test",color='green',linewidth=3)
plt.plot(fc_series,label="Forecast",color='red')
plt.title('Forecast vs Actuals on Test Data')
# The labels above are only shown once a legend is drawn.
plt.legend()
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_predict
fig=plt.figure(figsize=(18,8))
ax1=fig.add_subplot(111)
# `result=` had no value, which is a syntax error. Pass the fitted results
# object; `model` is rebound to the output of `.fit()` in the earlier cells.
plot_predict(result=model,start=1,end=len(df_close)+60,ax=ax1)
# The first argument of plt.grid is the on/off flag, so pass a boolean.
plt.grid(True)
# plot_predict draws the forecast line and the confidence band only, so two
# labels are supplied; a third 'Close' label would be attached to the wrong
# artist.
plt.legend(['Forecast','95% Confidence Interval'],fontsize=20, shadow=True, facecolor='lightblue',edgecolor='k')
plt.show()