In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import math

In [4]:
traindf=pd.read_csv("../input/BlackFriday.csv")

In [5]:
traindf.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [6]:
traindf.shape

(550068, 12)

In [7]:
traindf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [8]:
traindf['City_Category'].value_counts()

B    231173
C    171175
A    147720
Name: City_Category, dtype: int64

In [9]:
traindf['Age'].value_counts()

26-35    219587
36-45    110013
18-25     99660
46-50     45701
51-55     38501
55+       21504
0-17      15102
Name: Age, dtype: int64

In [10]:
traindf.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [11]:
traindf.nunique(axis=0)

User_ID                        5891
Product_ID                     3631
Gender                            2
Age                               7
Occupation                       21
City_Category                     3
Stay_In_Current_City_Years        5
Marital_Status                    2
Product_Category_1               20
Product_Category_2               17
Product_Category_3               15
Purchase                      18105
dtype: int64

In [14]:
# PREPROCESSING

In [10]:
# Filling_the_Null_values

traindf['Product_Category_2'].fillna(value=traindf['Product_Category_2'].mean(),inplace=True);
traindf['Product_Category_3'].fillna(value=traindf['Product_Category_3'].mean(),inplace=True);

In [11]:
traindf.isnull().sum(axis=0)

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [12]:
traindf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,9.842144,12.66984,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,9.842144,12.66984,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,12.66984,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,9.842144,12.66984,7969


In [13]:
traindf=traindf.astype({"Age": str})
traindf['Age'].dtype

dtype('O')

In [14]:
#Setting_UserID_as_index

traindf.set_index("User_ID",inplace=True)
traindf.head()

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000001,P00069042,F,0-17,10,A,2,0,3,9.842144,12.66984,8370
1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
1000001,P00087842,F,0-17,10,A,2,0,12,9.842144,12.66984,1422
1000001,P00085442,F,0-17,10,A,2,0,12,14.0,12.66984,1057
1000002,P00285442,M,55+,16,C,4+,0,8,9.842144,12.66984,7969


In [15]:
# Changing_the_Gender_and_CityCategory_into_Numerical_Data_type_for_training

dic1={'F':1,'M':2}
dic2={'A':1,'B':2,'C':3}
traindf=traindf.replace({'Gender':dic1,'City_Category':dic2})

In [16]:
# Changing_the_Object_into_Numerical_Data_type_for_training

df1=traindf['Stay_In_Current_City_Years'].unique()
df1=df1.tolist()

li_1=list(range(1,6))
dic3=dict(zip(df1,li_1))
traindf=traindf.replace({'Stay_In_Current_City_Years':dic3})

In [17]:
traindf.head()

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000001,P00069042,1,0-17,10,1,1,0,3,9.842144,12.66984,8370
1000001,P00248942,1,0-17,10,1,1,0,1,6.0,14.0,15200
1000001,P00087842,1,0-17,10,1,1,0,12,9.842144,12.66984,1422
1000001,P00085442,1,0-17,10,1,1,0,12,14.0,12.66984,1057
1000002,P00285442,2,55+,16,3,2,0,8,9.842144,12.66984,7969


In [18]:
# Changing_the_Age_into_Numerical_Data_type_for_training

df2=traindf['Age'].unique()
df2=df2.tolist()

li_2=list(range(1,8))
dic4=dict(zip(df2,li_2))
traindf=traindf.replace({'Age':dic4})

In [19]:
traindf.head()

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000001,P00069042,1,1,10,1,1,0,3,9.842144,12.66984,8370
1000001,P00248942,1,1,10,1,1,0,1,6.0,14.0,15200
1000001,P00087842,1,1,10,1,1,0,12,9.842144,12.66984,1422
1000001,P00085442,1,1,10,1,1,0,12,14.0,12.66984,1057
1000002,P00285442,2,2,16,3,2,0,8,9.842144,12.66984,7969


In [20]:
# Using_labelEncoder_on_Product_ID

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
traindf['Product_ID']=le.fit_transform(traindf['Product_ID'])

# Splitting_the_Training_Dataset

Ytr=traindf['Purchase']
Xtr=traindf.drop(columns=['Purchase'])

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(Xtr,Ytr,random_state=0)

In [21]:
# 1_Using_Linear_Regression

In [22]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
Rid=Ridge().fit(X_train,y_train)

In [23]:
y_pred=Rid.predict(X_test)

In [24]:
Rid.score(X_test,y_test)

0.11253796337175792

In [25]:
# 2_Using_polynomial_Regression_with_different_Degrees

In [26]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly=PolynomialFeatures(degree=2)
X_poly=poly.fit_transform(Xtr)
X_train,X_test,y_train,y_test=train_test_split(X_poly,Ytr,random_state=0)
linreg=LinearRegression().fit(X_train,y_train)

In [27]:
y_pred=linreg.predict(X_test)
math.sqrt(mean_squared_error(y_test,y_pred))

4268.372716166508

In [28]:
linreg.score(X_test,y_test)

0.26463627579497684

In [29]:
# 3_Using_Lasso_Regression

In [30]:
from sklearn.linear_model import Lasso
linlasso=Lasso(alpha=1,max_iter=100).fit(X_train,y_train)



In [31]:
y_pred=linlasso.predict(X_test)
math.sqrt(mean_squared_error(y_test,y_pred))

4273.0293906490215

In [32]:
# 4_Using_Boosted_Gradient_Descent

In [33]:
import xgboost as xgb

xgb_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.05,
                max_depth = 10, alpha = 10, n_estimators = 1000)

In [34]:
# Training_the_model

xgb_reg.fit(X_train,y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, importance_type='gain',
       learning_rate=0.05, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [35]:
# Finding_the_RMSE_for_XGB_Model

predictions_xgb=xgb_reg.predict(X_test)
math.sqrt(mean_squared_error(y_test,predictions_xgb))

2586.0451059251827