### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import GridSearchCV

### Importing the data

In [2]:
data = pd.read_csv('Final_data.csv')

In [3]:
# First 5 rows of the data
data.head()

Unnamed: 0,Date,Customer_Name,Item_Name,Vrh_No,Quantity,Price_per_unit,Amount
0,2019-01-04,customer1,Item_1,1,200.0,20.0,4000.0
1,2019-01-04,customer1,Item_2,1,160.0,28.0,4480.0
2,2019-01-04,customer1,Item_3,1,12.0,60.0,720.0
3,2019-01-04,customer1,Item_3,1,15.0,35.0,525.0
4,2019-01-04,customer1,Item_3,1,25.0,25.0,625.0


In [4]:
# This will remove "customer" from all the values in 'Customer_Name' column.
data['Customer_Name']=data['Customer_Name'].apply(lambda x:x.lstrip('customer'))

# This will remove "Item_" from all the values in 'Item_Name' column.
data['Item_Name']=data['Item_Name'].apply(lambda x:x.lstrip('Item_'))

In [5]:
data.head()

Unnamed: 0,Date,Customer_Name,Item_Name,Vrh_No,Quantity,Price_per_unit,Amount
0,2019-01-04,1,1,1,200.0,20.0,4000.0
1,2019-01-04,1,2,1,160.0,28.0,4480.0
2,2019-01-04,1,3,1,12.0,60.0,720.0
3,2019-01-04,1,3,1,15.0,35.0,525.0
4,2019-01-04,1,3,1,25.0,25.0,625.0


In [6]:
# Making a list with all the independent columns used for model building
indep_var = ['Customer_Name','Item_Name','Price_per_unit','Quantity']

In [7]:
# Converting the data type of the 'Customer_Name' and 'Item_Name' columns in the 'data' DataFrame to int
data['Customer_Name'] = data['Customer_Name'].astype('int')
data['Item_Name'] = data['Item_Name'].astype('int')

### Train - Test split of the data

In [8]:
# Assigning the dependent column to y and independent columns to x
y = data.Amount
x = data[indep_var]

In [9]:
# Train-test split on the data
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3,random_state=123)

In [10]:
# A function to calculate and print out four evaluation metrics,
# namely Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), R-squared (R^2) Score, and Adjusted R-squared (R^2) Score

def model_result(actual,pred):
    rmse = np.sqrt(mean_squared_error(actual,pred))
    mae = mean_absolute_error(actual,pred)
    r2 = r2_score(actual,pred)
    adj_r2 = 1-(((1-r2)*(len(actual)-1))/(len(actual)-4-1))
    
    print("Evaluation Metrics")
    print("**********************************************************")
    print("Root Mean Squared Error (RMSE):", rmse)
    print("Mean Absolute Error (MAE):", mae)
    print("R-squared (R^2) Score:", r2)
    print("Adjusted (R^2) Score:", adj_r2)

    metrics = [rmse, mae, r2, adj_r2]
    return metrics

In [11]:
# Inside the function, the input X and Y data are fitted to the model and predicted values are generated using the testing data. 
# The function then calls on the "model_result" function to calculate and print evaluation metrics
# Finally, the evaluation metrics are appended to a dictionary with the model name as key.

validation_dict = {}
def different_model(model):
    ml = model
    ml.fit(X_train, Y_train)
    pred_y = ml.predict(X_test)
    return_list = model_result(Y_test, pred_y)
    validation_dict[model] = return_list

#### LinearRegression()

In [12]:
different_model(LinearRegression())

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 5454.589095775174
Mean Absolute Error (MAE): 2404.012960898909
R-squared (R^2) Score: 0.8002224371692925
Adjusted (R^2) Score: 0.7988517506318605


#### DecisionTreeRegressor()

In [13]:
different_model(DecisionTreeRegressor(random_state=123))

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 2921.0657689230516
Mean Absolute Error (MAE): 809.7311180272109
R-squared (R^2) Score: 0.9427065068726128
Adjusted (R^2) Score: 0.9423134125801436


#### RandomForestRegressor()

In [14]:
different_model(RandomForestRegressor(random_state=123))

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 2278.3449220160537
Mean Absolute Error (MAE): 489.84719501020396
R-squared (R^2) Score: 0.9651452816901696
Adjusted (R^2) Score: 0.9649061412557968


### SVR()

In [15]:
different_model(SVR())

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 13019.067978369025
Mean Absolute Error (MAE): 6025.804450587838
R-squared (R^2) Score: -0.13810523260833074
Adjusted (R^2) Score: -0.14591384483891967


#### KNeighborsRegressor()

In [16]:
different_model(KNeighborsRegressor())

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 4597.179434319154
Mean Absolute Error (MAE): 1621.2554719047619
R-squared (R^2) Score: 0.8580924372395642
Adjusted (R^2) Score: 0.8571188004453245


#### XGBRegressor()

In [17]:
different_model(XGBRegressor(random_state=123))

Evaluation Metrics
**********************************************************
Root Mean Squared Error (RMSE): 2272.6610228469413
Mean Absolute Error (MAE): 530.4211622738677
R-squared (R^2) Score: 0.9653189723234606
Adjusted (R^2) Score: 0.9650810235915461


In [18]:
# Creating a dataframe with model name and its performance through various evaluation metrics

rmse = []
mae = []
r2_Score = []
adjusted_r2score=[]
model = []
col = ['Algorithm', 'RMSE', 'MAE', 'R2_SCORE', 'ADJUSTED_R2_SCORE']
df_results = pd.DataFrame(columns=col)


for model_name, scores in validation_dict.items():
    model.append(model_name)
    rmse.append(scores[0])
    mae.append(scores[1])
    r2_Score.append(scores[2])
    adjusted_r2score.append(scores[3])

df_results['Algorithm'] = model
df_results['RMSE'] = rmse
df_results['MAE'] = mae
df_results['R2_SCORE'] = r2_Score
df_results['ADJUSTED_R2_SCORE'] = adjusted_r2score

df_results.reset_index(drop=True, inplace=True)

df_results

Unnamed: 0,Algorithm,RMSE,MAE,R2_SCORE,ADJUSTED_R2_SCORE
0,LinearRegression(),5454.589096,2404.012961,0.800222,0.798852
1,DecisionTreeRegressor(random_state=123),2921.065769,809.731118,0.942707,0.942313
2,"(DecisionTreeRegressor(max_features='auto', ra...",2278.344922,489.847195,0.965145,0.964906
3,SVR(),13019.067978,6025.804451,-0.138105,-0.145914
4,KNeighborsRegressor(),4597.179434,1621.255472,0.858092,0.857119
5,"XGBRegressor(base_score=None, booster=None, ca...",2272.661023,530.421162,0.965319,0.965081


### Final model

In [19]:
final_model=RandomForestRegressor(random_state=123)
final_model.fit(X_train,Y_train)

RandomForestRegressor(random_state=123)

In [20]:
# Predicted sales values are stored in a new column called "Predicted_Sales" which is added to the dataframe "x".
x["Predicted_Sales"] = final_model.predict(x)

In [21]:
# Adding the target varible back to the dataframe "x".
x['Actual_Sales']= data['Amount']

In [22]:
x.head()

Unnamed: 0,Customer_Name,Item_Name,Price_per_unit,Quantity,Predicted_Sales,Actual_Sales
0,1,1,20.0,200.0,3993.78,4000.0
1,1,2,28.0,160.0,4497.5,4480.0
2,1,3,60.0,12.0,718.92,720.0
3,1,3,35.0,15.0,516.465,525.0
4,1,3,25.0,25.0,628.61,625.0


In [23]:
# Swapping the positions of last two columns
cols = x.columns.tolist()  # Get the list of column names
cols[-2], cols[-1] = cols[-1], cols[-2]  # Swap the positions of the last two columns
x = x[cols]

In [24]:
x.head()

Unnamed: 0,Customer_Name,Item_Name,Price_per_unit,Quantity,Actual_Sales,Predicted_Sales
0,1,1,20.0,200.0,4000.0,3993.78
1,1,2,28.0,160.0,4480.0,4497.5
2,1,3,60.0,12.0,720.0,718.92
3,1,3,35.0,15.0,525.0,516.465
4,1,3,25.0,25.0,625.0,628.61
