# Sales Forecasting
## Using Machine Learning to predict sales

Kaggle [Dataset](https://www.kaggle.com/competitions/demand-forecasting-kernels-only/data?select=train.csv)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
color_pallet = sns.color_palette()
plt.style.use('fivethirtyeight')

In [None]:
# Load the Kaggle training CSV into a DataFrame.
# NOTE(review): absolute '/content/' path — presumably a Colab upload location; adjust when running elsewhere.
df = pd.read_csv('/content/train.csv')

# EDA

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
# Number of distinct dates. Series.nunique() returns the scalar directly; the previous
# `df[["date"]].nunique()[0]` used an integer key on a label-indexed Series, a positional
# fallback that pandas has deprecated.
f'Total dates: {df["date"].nunique()}'

In [None]:
# Number of distinct stores. Series.nunique() returns the scalar directly; the previous
# `df[["store"]].nunique()[0]` used an integer key on a label-indexed Series, a positional
# fallback that pandas has deprecated.
f'Total store: {df["store"].nunique()}'

In [None]:
# Number of distinct items. Series.nunique() returns the scalar directly; the previous
# `df[["item"]].nunique()[0]` used an integer key on a label-indexed Series, a positional
# fallback that pandas has deprecated.
f'Total items: {df["item"].nunique()}'

## Checking null data

In [None]:
# Show every row that contains at least one missing value.
# `isnull` is an alias of `isna`, so a single mask gives the same result as OR-ing both.
df[df.isna().any(axis=1)]


In [None]:
# Sales Trends Over Time
# Total sales across all stores and items for each calendar day.
sales_trend = df.groupby('date')['sales'].sum()
# 'date' is still text at this point (read_csv did not parse it), so convert the index;
# otherwise matplotlib treats every day as a separate category and the x-axis is unreadable.
sales_trend.index = pd.to_datetime(sales_trend.index)

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(sales_trend.index, sales_trend.values, label='Total Sales', color=color_pallet[0])
ax.set_title('Sales Trend Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Sales')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Distribution of Sales
# Histogram (30 bins) of the per-row sales values with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['sales'], bins=30, kde=True, color='green', ax=ax)
ax.set(title='Distribution of Sales', xlabel='Sales', ylabel='Frequency')
ax.grid(True)
plt.show()


In [None]:
# Sales Boxplot by Store
# One box per store showing the spread of that store's sales values.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='store', y='sales', ax=ax)
ax.set(title='Sales Distribution by Store', xlabel='Store', ylabel='Sales')
ax.grid(True)
plt.show()


## Setting date as index

In [None]:
# Use the date as a DatetimeIndex so the calendar features can be derived from it.
# The guard makes this cell safe to re-run: once 'date' has been moved into the index,
# an unconditional set_index('date') would raise KeyError.
if 'date' in df.columns:
    df = df.set_index('date')
df.index = pd.to_datetime(df.index)

df.head()

# Feature engineering

In [None]:
def create_features(df):
    """
    Derive calendar features from the datetime index of ``df``.

    Returns a new DataFrame (the input is left untouched) with the columns
    dayofweek, quarter, month, year, dayofyear, dayofmonth and weekofyear
    added, or overwritten if they already exist.
    """
    stamps = df.index
    calendar_columns = {
        'dayofweek': stamps.dayofweek,
        'quarter': stamps.quarter,
        'month': stamps.month,
        'year': stamps.year,
        'dayofyear': stamps.dayofyear,
        'dayofmonth': stamps.day,
        # ISO-8601 week number, taken from the isocalendar() frame.
        'weekofyear': stamps.isocalendar().week,
    }
    # assign() works on a copy, so the caller's frame is never mutated.
    return df.assign(**calendar_columns)

# Add the calendar feature columns to the full dataset (create_features returns a new frame).
df = create_features(df)

In [None]:
df.head()

# Train and Test Split

In [None]:
# Chronological split: rows before 2017 are used for training, 2017 onwards for testing.
# The cut-off is defined once as an ISO-formatted Timestamp so it is unambiguous
# (no day/month confusion) and both masks are guaranteed to use the same boundary.
SPLIT_DATE = pd.Timestamp('2017-01-01')
train = df.loc[df.index < SPLIT_DATE]
test = df.loc[df.index >= SPLIT_DATE]

# Creating Model

In [None]:
# Columns fed to every model, and the column being predicted.
FEATURES = ['store', 'item', 'dayofyear', 'dayofweek', 'quarter', 'month', 'year']
TARGET = 'sales'

# create_features returns copies, so train/test are independent frames from here on.
train, test = create_features(train), create_features(test)

X_train, y_train = train[FEATURES], train[TARGET]
X_test, y_test = test[FEATURES], test[TARGET]

# Linear Regression

In [None]:
# Fit an ordinary least-squares model with default settings on the training features.
linreg_model = LinearRegression()
linreg_model.fit(X_train, y_train)

In [None]:
# Store the linear-regression predictions next to the actual sales in the test frame.
test['prediction_lr'] = linreg_model.predict(X_test)

In [None]:
# Score the linear-regression predictions on the test period.
lr_actual = test['sales']
lr_predicted = test['prediction_lr']

linreg_rmse = np.sqrt(mean_squared_error(lr_actual, lr_predicted))
linreg_mae = mean_absolute_error(lr_actual, lr_predicted)
linreg_r2 = r2_score(lr_actual, lr_predicted)

for label, score in (
    ('Linear Regression RMSE: ', linreg_rmse),
    ('Linear Regression MAE: ', linreg_mae),
    ('Linear Regression R2 Score: ', linreg_r2),
):
    print(label, score)

# Random Forest

In [None]:
# Random forest of 100 trees, each limited to depth 40.
# random_state pins the bootstrap samples and feature draws so the reported scores are
# reproducible across runs; n_jobs=-1 builds the trees on all available cores.
rf_model = RandomForestRegressor(n_estimators=100, max_depth=40,
                                 random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

In [None]:
# Store the random-forest predictions next to the actual sales in the test frame.
test['prediction_rf'] = rf_model.predict(X_test)

In [None]:
# Score the random-forest predictions on the test period.
rf_actual = test['sales']
rf_predicted = test['prediction_rf']

rf_rmse = np.sqrt(mean_squared_error(rf_actual, rf_predicted))
rf_mae = mean_absolute_error(rf_actual, rf_predicted)
rf_r2 = r2_score(rf_actual, rf_predicted)

for label, score in (
    ('Random Forest RMSE: ', rf_rmse),
    ('Random Forest MAE: ', rf_mae),
    ('Random Forest R2 Score: ', rf_r2),
):
    print(label, score)

# XGBoost

In [None]:
# Gradient-boosted trees: up to 4000 shallow trees with a small learning rate,
# stopping early once the monitored evaluation set has not improved for 50 rounds.
reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                       n_estimators=4000,
                       early_stopping_rounds=50,
                       # 'reg:linear' is the deprecated name of this squared-error objective.
                       objective='reg:squarederror',
                       max_depth=4,
                       learning_rate=0.01)
# Progress is printed every 100 rounds for both evaluation sets.
# NOTE(review): early stopping monitors the last eval_set entry, i.e. the test set that is
# also used for the final scores — consider a separate validation split to avoid leakage.
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100)

# Feature Importance

In [None]:
# One-column frame of the fitted booster's importance scores, indexed by feature name.
fi = pd.DataFrame(
    {'importance': reg.feature_importances_},
    index=reg.get_booster().feature_names,
)
# Sort ascending so the most important feature is drawn at the top of the horizontal bars.
fi.sort_values(by='importance').plot.barh(title='Feature Importance')
plt.show()

# Forecast on Test

In [None]:
# Store the XGBoost predictions next to the actual sales in the test frame.
test['prediction_xg'] = reg.predict(X_test)
# The predictions are kept only on `test`; they are not merged back into `df`.

# Score (RMSE, MAE, R2)

In [None]:
# Score the XGBoost predictions on the test period.
xg_rmse = np.sqrt(mean_squared_error(test['sales'], test['prediction_xg']))
xg_mae = mean_absolute_error(test['sales'], test['prediction_xg'])
xg_r2 = r2_score(test['sales'], test['prediction_xg'])
# Labels fixed: these are the XGBoost scores, previously printed as "Random Forest".
print('XGBoost RMSE: ', xg_rmse)
print('XGBoost MAE: ', xg_mae)
print('XGBoost R2 Score: ', xg_r2)

# Comparing Linear Regression, Random Forest and XGBoost

In [None]:
# Collect each model's test scores in a fixed order: [RMSE, MAE, R2].
# The comparison plot's x-tick labels rely on this order.
linreg_stats = [linreg_rmse, linreg_mae, linreg_r2]
rf_stats = [rf_rmse, rf_mae, rf_r2]
xgb_stats = [xg_rmse, xg_mae, xg_r2]

In [None]:
# Draw one line per model across the three metrics (positions 0, 1, 2 = RMSE, MAE, R2 Score).
fig, ax = plt.subplots(figsize=(15, 7))
for model_name, model_stats in (
    ("Linear Regression", linreg_stats),
    ("Random Forest", rf_stats),
    ("XG Boost", xgb_stats),
):
    ax.plot(model_stats, label=model_name)
ax.set_title("Model Comparison between Linear Regression, Random Forest and XGboost")
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(['RMSE', 'MAE', 'R2 Score'])
ax.legend()
plt.show()