In [5]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import rmse

# Load the dataset
data= pd.read_csv("/Users/pranavchundi/Downloads//Datathon_Fall2023_Dataset.csv")




In [6]:
import warnings

# Suppress the warnings
warnings.filterwarnings("ignore")

# Update the metrics dataframe with CV metrics
metrics_df['CV'] = [rmse_cv, mae_cv, mape_cv]

metrics_df

Unnamed: 0,Metric,Train,Test,CV
0,RMSE,0.03984,0.100399,0.082396
1,MAE,0.03096,0.076928,0.060205
2,MAPE,29.453911,7.858106,6.265856


In [7]:
# Preprocess the data
data['Date'] = pd.to_datetime(data['Year'].astype(str), format='%Y%m')
data.set_index('Date', inplace=True)
data.drop('Year', axis=1, inplace=True)

# Define the MAPE function
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Split data into train and test
train = data[:-12]
test = data[-12:]

# Fit ARIMA model to training data
model_train = ARIMA(train, order=(1,1,1))
result_train = model_train.fit()

# Forecast and compute metrics for training set
forecast_train = result_train.predict(start=1, end=len(train))
rmse_train = rmse(train['Anomaly'], forecast_train)
mae_train = mean_absolute_error(train['Anomaly'], forecast_train)
mape_train = mean_absolute_percentage_error(train['Anomaly'], forecast_train)

# Forecast and compute metrics for test set
forecast_test = result_train.forecast(steps=12)
rmse_test = rmse(test['Anomaly'], forecast_test)
mae_test = mean_absolute_error(test['Anomaly'], forecast_test)
mape_test = mean_absolute_percentage_error(test['Anomaly'], forecast_test)

# Time series cross-validation (rolling-forecast origin approach)
cv_predictions = []
for t in range(len(test)):
    train_temp = data.iloc[:len(train)+t]
    model_temp = ARIMA(train_temp, order=(1,1,1))
    result_temp = model_temp.fit()
    forecast_temp = result_temp.forecast(steps=1)
    cv_predictions.append(forecast_temp[0])
rmse_cv = rmse(test['Anomaly'], cv_predictions)
mae_cv = mean_absolute_error(test['Anomaly'], cv_predictions)
mape_cv = mean_absolute_percentage_error(test['Anomaly'], cv_predictions)

# Compile metrics into a dataframe
metrics_df = pd.DataFrame({
    'Metric': ['RMSE', 'MAE', 'MAPE'],
    'Train': [rmse_train, mae_train, mape_train],
    'Test': [rmse_test, mae_test, mape_test],
    'CV': [rmse_cv, mae_cv, mape_cv]
})
