### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import GridSearchCV

### Importing the data

In [2]:
data = pd.read_csv('Final_data.csv')

In [None]:
# First 5 rows of the data
data.head()

In [3]:
# This will remove "customer" from all the values in 'Customer_Name' column.
data['Customer_Name']=data['Customer_Name'].apply(lambda x:x.lstrip('customer'))

# This will remove "Item_" from all the values in 'Item_Name' column.
data['Item_Name']=data['Item_Name'].apply(lambda x:x.lstrip('Item_'))

In [None]:
data.head()

In [4]:
# # Making a list with all the independent columns used for model building
indep_var=['Customer_Name','Item_Name'
           ,'Price_per_unit','Amount']

In [5]:
# Converting the data type of the 'Customer_Name' and 'Item_Name' columns in the 'data' DataFrame to int
data['Customer_Name'] = data['Customer_Name'].astype('int')
data['Item_Name'] = data['Item_Name'].astype('int')

### Train - Test split of the data

In [6]:
# Assigning the dependent column to y and independent columns to x
y = data.Quantity
x = data[indep_var]

In [7]:
# Train-test split on the data
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3,random_state=123)

In [8]:
# A function to calculate and print out four evaluation metrics,
# namely Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), R-squared (R^2) Score, and Adjusted R-squared (R^2) Score

def model_result(actual,pred):
    rmse = np.sqrt(mean_squared_error(actual,pred))
    mae = mean_absolute_error(actual,pred)
    r2 = r2_score(actual,pred)
    adj_r2 = 1-(((1-r2)*(len(actual)-1))/(len(actual)-4-1))
    
    print("Evaluation Metrics")
    print("**********************************************************")
    print("Root Mean Squared Error (RMSE):", rmse)
    print("Mean Absolute Error (MAE):", mae)
    print("R-squared (R^2) Score:", r2)
    print("Adjusted (R^2) Score:", adj_r2)

    metrics = [rmse, mae, r2, adj_r2]
    return metrics

In [9]:
# Inside the function, the input X and Y data are fitted to the model and predicted values are generated using the testing data. 
# The function then calls on the "model_result" function to calculate and print evaluation metrics
# Finally, the evaluation metrics are appended to a dictionary with the model name as key.

validation_dict = {}
def different_model(model):
    ml = model
    ml.fit(X_train, Y_train)
    pred_y = ml.predict(X_test)
    return_list = model_result(Y_test, pred_y)
    validation_dict[model] = return_list

#### LinearRegression()

In [10]:
different_model(LinearRegression())

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 251.43597839852455
Mean Absolute Error (MAE): 115.39343342928703
R-squared (R^2) Score: 0.8040794301719907
Adjusted (R^2) Score: 0.8027352067083336


#### DecisionTreeRegressor()

In [11]:
different_model(DecisionTreeRegressor(random_state=123))

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 104.36343937280473
Mean Absolute Error (MAE): 28.461955782312927
R-squared (R^2) Score: 0.9662462555436037
Adjusted (R^2) Score: 0.9660146689607124


#### RandomForestRegressor()

In [12]:
different_model(RandomForestRegressor(random_state=123))

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 66.11507848077238
Mean Absolute Error (MAE): 17.568736904761906
R-squared (R^2) Score: 0.9864535272601985
Adjusted (R^2) Score: 0.986360584051006


#### SVR()

In [13]:
different_model(SVR())

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 540.2239610971161
Mean Absolute Error (MAE): 207.06847655460976
R-squared (R^2) Score: 0.0955743352589481
Adjusted (R^2) Score: 0.08936901337393233


#### KNeighborsRegressor()

In [14]:
different_model(KNeighborsRegressor())

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 261.4342442640175
Mean Absolute Error (MAE): 109.84280612244898
R-squared (R^2) Score: 0.7881882058828777
Adjusted (R^2) Score: 0.7867349517208392


#### XGBRegressor()

In [15]:
different_model(XGBRegressor(random_state=123))

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 55.01585592993867
Mean Absolute Error (MAE): 16.53807866500754
R-squared (R^2) Score: 0.9906200410039252
Adjusted (R^2) Score: 0.9905556845099556


In [16]:
# Creating a dataframe with model name and its performance through various evaluation metrics

rmse = []
mae = []
r2_Score = []
adjusted_r2score=[]
model = []
col = ['Algorithm', 'RMSE', 'MAE', 'R2_SCORE', 'ADJUSTED_R2_SCORE']
df_results = pd.DataFrame(columns=col)


for model_name, scores in validation_dict.items():
    model.append(model_name)
    rmse.append(scores[0])
    mae.append(scores[1])
    r2_Score.append(scores[2])
    adjusted_r2score.append(scores[3])

df_results['Algorithm'] = model
df_results['RMSE'] = rmse
df_results['MAE'] = mae
df_results['R2_SCORE'] = r2_Score
df_results['ADJUSTED_R2_SCORE'] = adjusted_r2score

df_results.reset_index(drop=True, inplace=True)

df_results

Unnamed: 0,Algorithm,RMSE,MAE,R2_SCORE,ADJUSTED_R2_SCORE
0,LinearRegression(),251.435978,115.393433,0.804079,0.802735
1,DecisionTreeRegressor(random_state=123),104.363439,28.461956,0.966246,0.966015
2,"(DecisionTreeRegressor(max_features='auto', ra...",66.115078,17.568737,0.986454,0.986361
3,SVR(),540.223961,207.068477,0.095574,0.089369
4,KNeighborsRegressor(),261.434244,109.842806,0.788188,0.786735
5,"XGBRegressor(base_score=None, booster=None, ca...",55.015856,16.538079,0.99062,0.990556


### Final model

In [17]:
final_model=XGBRegressor(random_state=123)
final_model.fit(X_train,Y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=123, ...)

In [18]:
# Predicted demand values are stored in a new column called "Predicted_Demand" which is added to the dataframe.
X_test["Predicted_Demand"] = final_model.predict(X_test)

In [19]:
# Adding the target varible back to the dataframe.
X_test['Actual_Demand']= Y_test

In [None]:
X_test.head()