# In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Load the four input tables (same order as they are used below)
train, test, store, feature = [
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'stores.csv', 'features.csv')
]

# Parse the Date column of every table that has one into datetime values
for frame in (train, test, feature):
    frame['Date'] = pd.to_datetime(frame['Date'])

# For train and test: derive calendar features from Date, then move Date into
# the index so each frame can be handled as a time-series dataframe
for frame in (train, test):
    date_parts = frame['Date'].dt
    frame['Month'] = date_parts.month
    frame['Year'] = date_parts.year
    frame['Dayofweek'] = date_parts.dayofweek
    frame.set_index('Date', inplace=True)

# Merge train with the per-store/per-date features, then with the store table.
# At this point 'Date' is the index of `train` but a regular column of
# `feature`; pandas resolves `on` labels against both columns and index level
# names, so the join itself works.
# NOTE(review): confirm that the index of merge_df after this mixed
# index/column join is what downstream code expects.
merge_df = (
    train
    .merge(feature, on=['Store', 'Date', 'IsHoliday'], how='inner')
    .merge(store, on='Store', how='inner')
)

def _report_metrics(model_name, y_true, preds):
    """Print MAE, RMSE and the MAE-based accuracy figure for one model."""
    mae = mean_absolute_error(y_true, preds)
    print(model_name + " - MAE :" + str(mae))
    rmse = np.sqrt(mean_squared_error(y_true, preds))
    print(model_name + " - RMSE :" + str(rmse))
    # "Accuracy" is defined here as 1 - MAE / mean(actual), not a classification accuracy
    accuracy = 1 - mae / np.mean(y_true)
    print(model_name + " - Accuracy : " + str(accuracy * 100) + "%")


def CompareModels(data):
    """Fit a Random Forest and an XGBoost regressor on ``data`` and compare them.

    The frame is split 80/20 with ``train_test_split`` (``random_state=42``),
    both models are fitted on the training part, their MAE / RMSE / accuracy
    on the held-out part are printed, and the actual versus predicted sales
    are plotted.

    The plot shows the mean per x-axis value, sorted by that value. The x-axis
    is the ``Date`` column of ``data`` when it has one, otherwise the index of
    ``data``. Plotting the raw held-out rows against their index would connect
    them in the shuffled order returned by the split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Store``, ``Dept``, ``IsHoliday``,
        ``Temperature``, ``Fuel_Price``, ``CPI``, ``Unemployment`` and
        ``Weekly_Sales``.

    Returns
    -------
    None
        Results are printed and shown in a matplotlib figure.
    """
    features = ['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']

    # Values for the x-axis: prefer a real Date column over the row index
    if 'Date' in data.columns:
        dates = data['Date'].to_numpy()
    else:
        dates = data.index.to_numpy()

    # Passing `dates` as a third array keeps it aligned with the held-out rows;
    # the partition depends only on the row count and random_state.
    X_train, X_test, y_train, y_test, _, dates_test = train_test_split(
        data[features], data['Weekly_Sales'], dates, test_size=0.2, random_state=42)

    # (name used in the printed metrics, legend label, line colour, estimator)
    candidates = (
        ('Random Forest', 'Random Forest Predictions', 'blue',
         RandomForestRegressor(n_estimators=100, random_state=42)),
        ('XGBoost', 'XG Boost Predictions', 'red',
         XGBRegressor(n_estimators=100, random_state=42)),
    )

    for _, _, _, estimator in candidates:
        estimator.fit(X_train, y_train)

    # Collect actuals and predictions side by side so they can be aggregated together
    comparison = {'Date': dates_test, 'Actual Sales': y_test.to_numpy()}
    line_colors = {'Actual Sales': 'black'}
    for model_name, legend_label, color, estimator in candidates:
        preds = estimator.predict(X_test)
        _report_metrics(model_name, y_test, preds)
        comparison[legend_label] = preds
        line_colors[legend_label] = color

    # Mean per date; groupby also sorts by date so the lines run left to right
    per_date = pd.DataFrame(comparison).groupby('Date').mean()

    # Plotting results
    plt.figure(figsize=(14, 7))
    for legend_label, color in line_colors.items():
        plt.plot(per_date.index, per_date[legend_label], label=legend_label, color=color)

    plt.xlabel('Date')
    plt.ylabel('Weekly Sales')
    plt.title('Sales Prediction Comparison')
    plt.legend()
    plt.show()

#print("Comparing models")

#CompareModels(merge_df)

def PredictAndPlot(model, X_test, model_name):
    """Predict with ``model`` on ``X_test`` and plot the mean prediction per date.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X_test)``.
    X_test : pandas.DataFrame
        Feature rows to predict on. Its index supplies the x-axis values
        (labelled "Date" in the plot); rows sharing an index value are averaged.
    model_name : str
        Used in the plot title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    print("Making predictions")
    preds = model.predict(X_test)

    # Take the dates from the data actually being predicted on. The module-level
    # `test` frame is only a fallback for inputs that carry no index (e.g. a
    # plain array), which is what every call used before.
    if isinstance(X_test, (pd.DataFrame, pd.Series)):
        dates = X_test.index
    else:
        dates = test.index

    # Create a DataFrame with test predictions
    predictions_df = pd.DataFrame({
        'Date': dates,
        'Predicted_Weekly_Sales': preds
    })

    # Aggregate predictions by taking the mean for each distinct date
    aggregated_predictions = predictions_df.groupby('Date').mean()

    # Plotting results
    plt.figure(figsize=(14, 7))
    plt.plot(aggregated_predictions.index, aggregated_predictions['Predicted_Weekly_Sales'], label='Predictions',
             color='blue')
    plt.xlabel('Date')
    plt.ylabel('Weekly Sales')
    plt.title(f'{model_name} - Sales Prediction')
    plt.legend()
    plt.show()


# Target is Weekly_Sales; every other column of train is used as a feature
y_train = train["Weekly_Sales"]
X_train = train.drop(columns="Weekly_Sales")

# Both estimators use the same tree count and seed
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, random_state=42)

# Fit each model on the full training frame, then predict on `test` and plot,
# Random Forest first and XGBoost second
for model_label, estimator in (('Random Forest', rf_model), ('XGBoost', xgb_model)):
    estimator.fit(X_train, y_train)
    PredictAndPlot(estimator, test, model_label)