In [1]:
pip install pmdarima

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

#Used for Google Colab
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

# Read Data

In [None]:
# Dataset merging and cleaning
# Google Colab alternative for the two reads below:
# data = pd.read_excel('/content/drive/My Drive/MOPAC/Crime/Borough Daily 2018-20.xlsx')
# data2021 = pd.read_excel('/content/drive/My Drive/MOPAC/Crime/Borough Daily Data 2021.xlsx')

data = pd.read_excel('../Crime/Borough Daily 2018-20.xlsx')
data2021 = pd.read_excel('../Crime/Borough Daily Data 2021.xlsx')

# Stack the 2018-20 extract and the 2021 extract into one frame.
df = pd.concat([data, data2021])
data = df

# Fill gaps with each column's mean. numeric_only=True restricts the mean to
# numeric columns: without it pandas >= 2.0 raises TypeError as soon as the
# sheet contains a text column.
data = data.fillna(data.mean(numeric_only=True))
data = data.rename(columns={'Date - Daily Data': 'date'})
# data = data.rename(columns={'Month-Year': 'date'})

# One row per date: sum the numeric columns over all rows sharing that date.
# numeric_only=True keeps text columns from being concatenated (or failing).
data = data.groupby(by="date").sum(numeric_only=True)

# Keep only the modelled series, as a single-column DataFrame.
df = data["TNO Offs"].to_frame()
df.size

# Plot Your Data

In [None]:
# Line chart of the full 'TNO Offs' series on a wide canvas.
df['TNO Offs'].plot(kind='line', figsize=(12, 5))

In [None]:
# Show the prepared single-column frame as this cell's output.
df

# Check For Stationarity

In [None]:
from statsmodels.tsa.stattools import adfuller

def adf_test(dataset):
    """Run an Augmented Dickey-Fuller test on ``dataset`` and print a report.

    The data is handed to ``adfuller`` with ``autolag='AIC'``. The report lists
    the first four entries of the result (statistic, p-value, lags, number of
    observations), then every critical value, and ends with a verdict: the
    null hypothesis is rejected when the test statistic is below the 5%
    critical value.

    Parameters
    ----------
    dataset : array-like
        Values passed unchanged to ``adfuller``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    result = adfuller(dataset, autolag='AIC')

    # (label, position in the adfuller result). Labels ending in a space
    # produce a double space before the value, exactly as printed before.
    headline_rows = (
        ("1. ADF : ", 0),
        ("2. P-Value : ", 1),
        ("3. Num Of Lags : ", 2),
        ("4. Num Of Observations Used For ADF Regression and Critical Values Calculation :", 3),
    )
    for label, position in headline_rows:
        print(label, result[position])

    print("5. Critical Values :")
    critical_values = result[4]
    for level in critical_values:
        print("\t", level, ": ", critical_values[level])

    # Decision rule: compare the statistic against the 5% critical value.
    is_stationary = result[0] < critical_values["5%"]
    verdict = (
        "Reject Ho - Time Series is Stationary"
        if is_stationary
        else "Failed to Reject Ho - Time Series is Non-Stationary"
    )
    print(verdict)

In [None]:
# Print the stationarity report for the date-aggregated 'TNO Offs' series.
adf_test(df['TNO Offs'])

# Figure Out Order for ARIMA Model

In [None]:
from pmdarima import auto_arima
# NOTE(review): filterwarnings("ignore") with no category silences *every*
# Python warning from this point on, not only harmless ones. Library
# deprecation and model-fitting warnings raised by later cells are hidden too.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Let pmdarima search for an ARIMA order on the whole series; the summary of
# the selected model is this cell's output.
stepwise_fit = auto_arima(y=df['TNO Offs'], suppress_warnings=True)

stepwise_fit.summary()

In [None]:
from statsmodels.tsa.arima_model import ARIMA

# Split Data into Training and Testing

In [None]:
# Hold out the final 200 rows for testing; every earlier row is training data.
print(df.shape)
train, test = df.iloc[:-200], df.iloc[-200:]
print(train.shape, test.shape)
# First and last rows of the hold-out.
print(test.iloc[0], test.iloc[-1])

## Train the Model

In [None]:
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Build an order-(2, 1, 2) SARIMAX with a constant trend term on the training
# series and fit it in one step; `model` ends up holding the fitted results.
model = SARIMAX(endog=train['TNO Offs'], order=(2, 1, 2), trend='c').fit()
model.summary()

# Make Predictions on Test Set

In [None]:
import matplotlib.pyplot as plt

# Positional bounds of the hold-out window, counted from the start of `train`.
start = len(train)
end = len(train) + len(test) - 1
print(test)

# `typ` is not an argument of the SARIMAX results' predict(); it comes from the
# legacy ARIMA API. SARIMAX predictions are already on the original (level)
# scale, so the keyword is dropped rather than passed as an unknown kwarg.
pred = model.predict(start=start, end=end).rename('ARIMA predictions')

# Label the predictions with the hold-out's own index instead of a hard-coded
# date range, so both curves share an x-axis whatever dates the data covers.
pred.index = test.index
pred.plot(legend=True)
test['TNO Offs'].plot(legend=True)

In [None]:
# `legend` is a boolean switch; the legend label comes from the Series name.
# The string passed before only worked because it was truthy.
pred.plot(legend=True)
test['TNO Offs'].plot(legend=True)

In [None]:
# Mean of the hold-out series — presumably a scale reference for the RMSE
# computed below; confirm intent.
test['TNO Offs'].mean()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# scikit-learn metrics take (y_true, y_pred). Both calls now pass the observed
# series first, so the RMSE call agrees with the r2_score call.
rmse = sqrt(mean_squared_error(test['TNO Offs'], pred))
print(rmse)
print(r2_score(test["TNO Offs"], pred))


In [None]:
# Refit the same order-(2, 1, 2), constant-trend specification on the complete
# series; `model2` holds the fitted results.
model2 = SARIMAX(endog=df['TNO Offs'], order=(2, 1, 2), trend='c').fit()
df.tail()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# NOTE(review): this cell scores `pred` against `test` with the same formulas
# as the earlier metrics cell. `pred` has not been reassigned since the
# hold-out prediction, so `model2` is not evaluated here.
# scikit-learn metrics take (y_true, y_pred); the observed series goes first.
rmse = sqrt(mean_squared_error(test['TNO Offs'], pred))
print(rmse)
print(r2_score(test["TNO Offs"], pred))

# For Future Dates

In [None]:
# Calendar days to forecast, immediately after the end of the fitted data.
index_future_dates = pd.date_range(start='2021-07-01', end='2021-07-31')

# Out-of-sample window: starts one step past the last observation and spans
# exactly as many steps as there are future dates, so the forecast and its
# index cannot drift apart if the date range is edited.
# `typ` is not a SARIMAX predict() argument (legacy ARIMA API); predictions are
# already on the original scale, so it is omitted.
pred = model2.predict(
    start=len(df),
    end=len(df) + len(index_future_dates) - 1,
).rename('ARIMA Predictions')
pred.index = index_future_dates
print(pred)

In [None]:
# Chart the forecast alone, labelled by the Series name.
pred.plot(legend=True, figsize=(12, 5))

In [None]:
# Forecast first (this call sets the figure size), then the hold-out series
# drawn on the same axes.
pred.plot(legend=True, figsize=(12, 5))
test['TNO Offs'].plot(legend=True)