# Implementation of an ARIMA Model

## Imports

In [None]:
import pandas as pd
import csv
from matplotlib import pyplot as plt
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import datetime

## Data Import and Formatting

First, we load our dataset from a CSV file. For now, we will use univariate data for ARIMA (just soybean futures prices).

In [None]:
# Load the futures price table and index it by the "Date" column, parsed as
# datetimes.
raw_data = pd.read_csv("DATA/yahoo_finance_futures/yf_futures.csv").set_index("Date")
raw_data.index = pd.to_datetime(raw_data.index)

# Label for this dataset; reused in the training log and plot titles.
dataset_name = "assorted_futures"

# Univariate series for ARIMA. dropna() returns a new Series, so the column
# selected from raw_data is never mutated in place (which would be a chained
# in-place operation and trigger pandas' chained-assignment warnings).
soybean_futures = raw_data["Soybean"].dropna()

# Re-label the index with monthly periods.
# NOTE(review): if the CSV holds more than one row per month, this yields
# repeated index labels -- confirm that this is intended.
soybean_futures.index = soybean_futures.index.to_period('M')

# Plot the cleaned series.
soybean_futures.plot()

Let's calculate the autocorrelation of the series and look at an autocorrelation plot to estimate a suitable value of p (the autoregressive order).

In [None]:
# Autocorrelation of the price series at every lag from 0 through 5499.
autocorrelation = list(map(soybean_futures.autocorr, range(5500)))

# pandas' ready-made autocorrelation plot for the same series.
ac_plot = autocorrelation_plot(soybean_futures)

# The plotted lags and values can be read back off the axes if needed:
# lags = ac_plot.lines[-1].get_xdata()
# autocorrs = ac_plot.lines[-1].get_ydata()


We can experiment with different orders of differencing (the `d` parameter of ARIMA).

In [None]:
# Order of differencing to apply.
d = 2

print(soybean_futures)

# Start from the original series under a name that records the differencing
# order, then difference it d times in succession.
diffed_sf = pd.Series(soybean_futures).rename(f"diff_{d}")
for _ in range(d):
    diffed_sf = diffed_sf.diff()

# Draw the differenced series in green, then the original series.
diffed_sf.plot(color="green")
soybean_futures.plot()



Let us finally perform a train/test split.

In [None]:
# Hold out 20% of the observations as the test set. shuffle=False keeps the
# rows in their original order instead of sampling them at random.
train, test = train_test_split(
    soybean_futures,
    shuffle=False,
    test_size=0.2,
)

## Model Implementation, Testing, and Evaluation

Below, we iterate through every test timestep; for each one we build and fit an ARIMA model on all the data that precedes it and forecast one step ahead. We log each prediction and then compare the predictions to the true values.

In [None]:
# One-step-ahead forecasts, in test order.
pred = []
# Observations available to the model so far; starts as the training set and
# grows by one true value after every forecast.
history = list(train)

# Limit the evaluation to the first 100 test observations. .iloc makes the
# positional selection explicit.
test = test.iloc[0:100]

print("Starting {lent} predictions".format(lent=len(test)))

# Walk-forward evaluation: refit on everything seen so far, forecast the next
# step, then reveal the true value. Iterating over the values directly avoids
# integer lookups on a period-indexed Series (`test[ts]`), which rely on the
# deprecated positional fallback of Series.__getitem__.
for ts, actual in enumerate(test):
    print("Fitting and testing on {ts}".format(ts=ts), end='\r')
    model = ARIMA(history, order=(5, 2, 5))
    model_fit = model.fit(method_kwargs={'maxiter': 500})
    output = model_fit.forecast()[0]
    pred.append(output)
    history.append(actual)

# Mean absolute error over the evaluated window.
mae = mean_absolute_error(test, pred)

# Write to training log.
# NOTE(review): the three "NaN" entries are placeholders; the log's column
# meanings are not visible from this cell -- confirm against the log header.
log_entry = [
    dataset_name,
    "ARIMA",
    datetime.now(),
    "NaN",
    "NaN",
    "NaN",
    mae,
]

# Append mode, so earlier log rows are kept.
with open('SAVED_MODELS/training_log.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(log_entry)

# Plot true vs. predicted values against the test timestep number.
plt.title("ARIMA model performance - {ds}".format(ds=dataset_name))
plt.xlabel("Timestep")
plt.ylabel("Price")
plt.plot(list(test), label="True Value")
plt.plot(pred, label="Predicted Value")
plt.legend(loc="upper left")
plt.show()



In [None]:
# Display the mean absolute error of the forecasts against the true values.
mean_absolute_error(y_true=test, y_pred=pred)