# Data Processing

### Cleaning and preparing the data for model training

In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


## Problem Statement

A retail company "ABC Private Limited" wants to understand the customer purchase behaviour (specially, purchase amount) against various products of different categories. They have shared purchase summary of various customers for the selected high volume products from the last month. The dataset aslo contains customer demographics (age, gender, marital status, ciaty_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.\
Now, they want to build a model to predict the purchase amount of customer against the products which will help them to create personalized offer for customers against different products.

In [2]:
#importing the train dataset
df_train = pd.read_csv("Datasets/Black Friday/train.csv")

In [3]:
df_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
#importing the test dataset
df_test = pd.read_csv("Datasets/Black Friday/test.csv")

In [5]:
df_test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [6]:
df_train.shape

(550068, 12)

In [7]:
df_test.shape

(233599, 11)

In [8]:
#merging the test data and train data
df = pd.concat([df_train,df_test])

In [9]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370.0
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422.0
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057.0
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969.0


In [10]:
df.shape

(783667, 12)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     783667 non-null  int64  
 1   Product_ID                  783667 non-null  object 
 2   Gender                      783667 non-null  object 
 3   Age                         783667 non-null  object 
 4   Occupation                  783667 non-null  int64  
 5   City_Category               783667 non-null  object 
 6   Stay_In_Current_City_Years  783667 non-null  object 
 7   Marital_Status              783667 non-null  int64  
 8   Product_Category_1          783667 non-null  int64  
 9   Product_Category_2          537685 non-null  float64
 10  Product_Category_3          237858 non-null  float64
 11  Purchase                    550068 non-null  float64
dtypes: float64(3), int64(4), object(5)
memory usage: 77.7+ MB


In [12]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
User_ID,783667.0,1003029.0,1727.266668,1000001.0,1001519.0,1003075.0,1004478.0,1006040.0
Occupation,783667.0,8.0793,6.522206,0.0,2.0,7.0,14.0,20.0
Marital_Status,783667.0,0.4097774,0.491793,0.0,0.0,0.0,1.0,1.0
Product_Category_1,783667.0,5.366196,3.87816,1.0,1.0,5.0,8.0,20.0
Product_Category_2,537685.0,9.844506,5.089093,2.0,5.0,9.0,15.0,18.0
Product_Category_3,237858.0,12.6686,4.12551,3.0,9.0,14.0,16.0,18.0
Purchase,550068.0,9263.969,5023.065394,12.0,5823.0,8047.0,12054.0,23961.0


In [13]:
#Checking for missing values
(df.isnull().sum()/len(df))*100

User_ID                        0.000000
Product_ID                     0.000000
Gender                         0.000000
Age                            0.000000
Occupation                     0.000000
City_Category                  0.000000
Stay_In_Current_City_Years     0.000000
Marital_Status                 0.000000
Product_Category_1             0.000000
Product_Category_2            31.388587
Product_Category_3            69.648078
Purchase                      29.808452
dtype: float64

In [14]:
#dropping irrelevant features
df.drop(['User_ID','Product_ID'], axis = 1, inplace = True)

In [15]:
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,F,0-17,10,A,2,0,3,,,8370.0
1,F,0-17,10,A,2,0,1,6.0,14.0,15200.0
2,F,0-17,10,A,2,0,12,,,1422.0
3,F,0-17,10,A,2,0,12,14.0,,1057.0
4,M,55+,16,C,4+,0,8,,,7969.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      783667 non-null  object 
 1   Age                         783667 non-null  object 
 2   Occupation                  783667 non-null  int64  
 3   City_Category               783667 non-null  object 
 4   Stay_In_Current_City_Years  783667 non-null  object 
 5   Marital_Status              783667 non-null  int64  
 6   Product_Category_1          783667 non-null  int64  
 7   Product_Category_2          537685 non-null  float64
 8   Product_Category_3          237858 non-null  float64
 9   Purchase                    550068 non-null  float64
dtypes: float64(3), int64(3), object(4)
memory usage: 65.8+ MB


In [17]:
#Handling categorical feature : Stay_In_Current_City
df['Stay_In_Current_City_Years'].unique()


array(['2', '4+', '3', '1', '0'], dtype=object)

In [18]:
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].str.replace('+','')

  df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].str.replace('+','')


In [19]:
df['Stay_In_Current_City_Years'].unique()

array(['2', '4', '3', '1', '0'], dtype=object)

In [20]:
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].astype(int)

In [21]:
# Replacing the missing values in the Product_Categor_3 and _2


In [22]:
df['Product_Category_2'].value_counts()

8.0     91317
14.0    78834
2.0     70498
16.0    61687
15.0    54114
5.0     37165
4.0     36705
6.0     23575
11.0    20230
17.0    19104
13.0    15054
9.0      8177
12.0     7801
10.0     4420
3.0      4123
18.0     4027
7.0       854
Name: Product_Category_2, dtype: int64

In [23]:
df['Product_Category_3'].value_counts()

16.0    46469
15.0    39968
14.0    26283
17.0    23818
5.0     23799
8.0     17861
9.0     16532
12.0    13115
13.0     7849
6.0      6888
18.0     6621
4.0      2691
11.0     2585
10.0     2501
3.0       878
Name: Product_Category_3, dtype: int64

In [24]:
## Replacing the missing value in the category_2 and _3 column by the mode

df['Product_Category_2'] = df['Product_Category_2'].fillna(df['Product_Category_2'].mode()[0])
df['Product_Category_3'] = df['Product_Category_3'].fillna(df['Product_Category_3'].mode()[0])

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      783667 non-null  object 
 1   Age                         783667 non-null  object 
 2   Occupation                  783667 non-null  int64  
 3   City_Category               783667 non-null  object 
 4   Stay_In_Current_City_Years  783667 non-null  int64  
 5   Marital_Status              783667 non-null  int64  
 6   Product_Category_1          783667 non-null  int64  
 7   Product_Category_2          783667 non-null  float64
 8   Product_Category_3          783667 non-null  float64
 9   Purchase                    550068 non-null  float64
dtypes: float64(3), int64(4), object(3)
memory usage: 65.8+ MB


# Feature Encoding

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['Age'] = le.fit_transform(df['Age'])
df['City_Category'] = le.fit_transform(df['City_Category'])


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 10 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      783667 non-null  int64  
 1   Age                         783667 non-null  int64  
 2   Occupation                  783667 non-null  int64  
 3   City_Category               783667 non-null  int64  
 4   Stay_In_Current_City_Years  783667 non-null  int64  
 5   Marital_Status              783667 non-null  int64  
 6   Product_Category_1          783667 non-null  int64  
 7   Product_Category_2          783667 non-null  float64
 8   Product_Category_3          783667 non-null  float64
 9   Purchase                    550068 non-null  float64
dtypes: float64(3), int64(7)
memory usage: 65.8 MB


# Seperating test and train data

In [29]:
df_test = df[df['Purchase'].isnull()]
df_train = df[~df['Purchase'].isnull()]

# Seperating train data into X and y

In [30]:
y = df_train['Purchase']
X = df_train.drop('Purchase',axis = 1)

# Splitting the data using train test split

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [32]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", y_train.shape)
print("Y_test shape:", y_test.shape)

X_train shape: (368545, 9)
X_test shape: (181523, 9)
Y_train shape: (368545,)
Y_test shape: (181523,)


# Model Building

In [33]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

  from pandas import MultiIndex, Int64Index


# Feature Scaling

In [34]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Linear Regression

In [35]:
model =LinearRegression()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("Linear Regression Score on Training data is",model.score(X_train, y_train))#Training Accuracy
print("Linear Regression Score on Test data is",model.score(X_test, y_test))#Testing Accuracy

accuracies = cross_val_score(model, X_train, y_train)
print(accuracies)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

print("Mean Absolute Error:" , mean_absolute_error(y_pred, y_test))

print("Mean Squared Error:" , mean_squared_error(y_pred, y_test))

print('RMSE(Root Mean Squared Error):', np.sqrt(mean_squared_error(y_test, y_pred)))

print('The r2_score is :',r2_score(y_test, y_pred))

Linear Regression Score on Training data is 0.1317873239349333
Linear Regression Score on Test data is 0.12922937601186446
[0.13181721 0.13052848 0.1303529  0.1357904  0.13019465]
Accuracy: 13.17 %
Standard Deviation: 0.21 %
Mean Absolute Error: 3578.375623356969
Mean Squared Error: 21945673.391874712
RMSE(Root Mean Squared Error): 4684.6209443107255
The r2_score is : 0.12922937601186446


## KNN Regression

In [36]:
model =KNeighborsRegressor()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("K Nearest Neighbours Score on Training data is",model.score(X_train, y_train))#Training Accuracy
print("K Nearest Neighbours Score on Test data is",model.score(X_test, y_test))#Testing Accuracy

accuracies = cross_val_score(model, X_train, y_train)
print(accuracies)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

print("Mean Absolute Error:" , mean_absolute_error(y_pred, y_test))

print("Mean Squared Error:" , mean_squared_error(y_pred, y_test))

print('RMSE(Root Mean Squared Error):', np.sqrt(mean_squared_error(y_test, y_pred)))

print('The r2_score is :',r2_score(y_test, y_pred))

K Nearest Neighbours Score on Training data is 0.6473510938849927
K Nearest Neighbours Score on Test data is 0.5066601046556447
[0.49296232 0.49583462 0.49598163 0.48951278 0.49681274]
Accuracy: 49.42 %
Standard Deviation: 0.27 %
Mean Absolute Error: 2562.9169229243676
Mean Squared Error: 12433442.190347008
RMSE(Root Mean Squared Error): 3526.1086469856555
The r2_score is : 0.5066601046556447


## Decision Tree Regression

In [37]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("Decision Tree Regressor Score on Training data is",model.score(X_train, y_train))#Training Accuracy
print("Decision Tree Regressor Score on Test data is",model.score(X_test, y_test))#Testing Accuracy

accuracies = cross_val_score(model, X_train, y_train)
print(accuracies)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

print("Mean Absolute Error:" , mean_absolute_error(y_pred, y_test))

print("Mean Squared Error:" , mean_squared_error(y_pred, y_test))

print('RMSE(Root Mean Squared Error):', np.sqrt(mean_squared_error(y_test, y_pred)))

print('The r2_score is :',r2_score(y_test, y_pred))


Decision Tree Regressor Score on Training data is 0.7983631647792537
Decision Tree Regressor Score on Test data is 0.5584921741689993
[0.54595248 0.54964735 0.54262777 0.54208949 0.55703938]
Accuracy: 54.75 %
Standard Deviation: 0.55 %
Mean Absolute Error: 2361.615738240066
Mean Squared Error: 11127139.890488386
RMSE(Root Mean Squared Error): 3335.736783753836
The r2_score is : 0.5584921741689993


## Random Forest Regressor

In [38]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("Random Forest Score on Training data is",model.score(X_train, y_train))#Training Accuracy
print("Random Forest Score on Test data is",model.score(X_test, y_test))#Testing Accuracy

accuracies = cross_val_score(model, X_train, y_train)
print(accuracies)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

print("Mean Absolute Error:" , mean_absolute_error(y_pred, y_test))

print("Mean Squared Error:" , mean_squared_error(y_pred, y_test))

print('RMSE(Root Mean Squared Error):', np.sqrt(mean_squared_error(y_test, y_pred)))

print('The r2_score is :',r2_score(y_test, y_pred))

Random Forest Score on Training data is 0.787575055166497
Random Forest Score on Test data is 0.6293128935787033
[0.62889153 0.62988368 0.62659654 0.62294302 0.63408508]
Accuracy: 62.85 %
Standard Deviation: 0.37 %
Mean Absolute Error: 2226.014807472367
Mean Squared Error: 9342274.468151698
RMSE(Root Mean Squared Error): 3056.5134496925903
The r2_score is : 0.6293128935787033


# XG Boost Regressor

In [39]:
model = XGBRegressor()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

print("XG Boost Regressor Score on Training data is",model.score(X_train, y_train))#Training Accuracy
print("XG Boost Regressor Score on Test data is",model.score(X_test, y_test))#Testing Accuracy

accuracies = cross_val_score(model, X_train, y_train)
print(accuracies)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

print("Mean Absolute Error:" , mean_absolute_error(y_pred, y_test))

print("Mean Squared Error:" , mean_squared_error(y_pred, y_test))

print('RMSE(Root Mean Squared Error):', np.sqrt(mean_squared_error(y_test, y_pred)))

print('The r2_score is :',r2_score(y_test, y_pred))


XG Boost Regressor Score on Training data is 0.6801109792717592
XG Boost Regressor Score on Test data is 0.6670843166939937
[0.67028032 0.67136253 0.67030149 0.6673104  0.6769282 ]
Accuracy: 67.12 %
Standard Deviation: 0.32 %
Mean Absolute Error: 2164.944350842329
Mean Squared Error: 8390336.848301806
RMSE(Root Mean Squared Error): 2896.6078174826853
The r2_score is : 0.6670843166939937
