#                          Black Friday purchase prediction
 To predict the purchase amount of customer against various products. The dataset contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month. 

In [1]:
# import statements  
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from autograd import numpy as np
from sklearn.model_selection import train_test_split

# Import the dataset and data preview

In [2]:
# import the dataset
datapath = 'Data/'
train_csv = datapath + "train.csv"
test_csv = datapath +  "test.csv"
#converting csv to pandas dataframe
train_data = pd.read_csv(train_csv)
test_data= pd.read_csv(test_csv)  
# train_data=train_data[0:10000]
# test_data=test_data[0:10000]
test_user_id = np.array(test_data["User_ID"])
test_product_id = np.array(test_data["Product_ID"])

In [3]:
#preview of data
display(train_data[0:10])

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871


# Implementation of handling null values

In [4]:
# checking null values in data
train_null=train_data.isnull().sum()
test_null=test_data.isnull().sum()
print("\tNull values in Training data")
print(train_null)
print("\n\tNull values in Testing data")
print(test_null)

#Filling the null values with mean in both training and test data
train_data['Product_Category_2'].fillna(round(train_data['Product_Category_2'].mean()),inplace=True)
train_data['Product_Category_3'].fillna(round(train_data['Product_Category_3'].mean()),inplace=True)

test_data['Product_Category_2'].fillna(round(test_data['Product_Category_2'].mean()),inplace=True)
test_data['Product_Category_3'].fillna(round(test_data['Product_Category_3'].mean()),inplace=True)

	Null values in Training data
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

	Null values in Testing data
User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2             72344
Product_Category_3            162562
dtype: int64


# Implementation of converting categorical to numerical values

In [5]:
#converting categorical columns into numerical values
label=LabelEncoder()
#Gender column
train_gen_dumm=pd.get_dummies(train_data['Gender'],prefix='Gender',drop_first=True)
train_data=pd.concat([train_data,train_gen_dumm],axis='columns')
train_data.drop(['Gender'], axis=1,inplace=True)
test_gen_dumm=pd.get_dummies(test_data['Gender'],prefix='Gender',drop_first=True)
test_data=pd.concat([test_data,test_gen_dumm],axis='columns')
test_data.drop(['Gender'], axis=1,inplace=True)

# Age column
train_age_encoded =pd.DataFrame(label.fit_transform(train_data['Age']))
train_data=pd.concat([train_data,train_age_encoded],axis='columns')
train_data.drop(['Age'], axis=1,inplace=True)
train_data.rename(columns = {0:'Age_Encoded'}, inplace = True)
test_age_encoded =pd.DataFrame(label.fit_transform(test_data['Age']))
test_data=pd.concat([test_data,test_age_encoded],axis='columns')
test_data.drop(['Age'], axis=1,inplace=True)
test_data.rename(columns = {0:'Age_Encoded'}, inplace = True)

#City column
train_city_dumm=pd.get_dummies(train_data['City_Category'],prefix='City_Category',drop_first=True)
train_data=pd.concat([train_data,train_city_dumm],axis='columns')
train_data.drop(['City_Category'], axis=1,inplace=True)
test_city_dumm=pd.get_dummies(test_data['City_Category'],prefix='City_Category',drop_first=True)
test_data=pd.concat([test_data,test_city_dumm],axis='columns')
test_data.drop(['City_Category'], axis=1,inplace=True)
                     
#Stay_In_Current_City_Years column
train_stay_encoded =pd.DataFrame(label.fit_transform(train_data['Stay_In_Current_City_Years']))
train_data=pd.concat([train_data,train_stay_encoded],axis='columns')
train_data.drop(['Stay_In_Current_City_Years'], axis=1,inplace=True)
train_data.rename(columns = {0:'Stay_In_Current_City_Encoded'}, inplace = True)
test_stay_encoded =pd.DataFrame(label.fit_transform(test_data['Stay_In_Current_City_Years']))
test_data=pd.concat([test_data,test_stay_encoded],axis='columns')
test_data.drop(['Stay_In_Current_City_Years'], axis=1,inplace=True) 
test_data.rename(columns = {0:'Stay_In_Current_City_Encoded'}, inplace = True)

#combining user and product id before converting to numerical value
for col_name in ["User_ID", "Product_ID"]:
    combined_data = pd.concat((train_data[col_name],test_data[col_name]),axis=0)
    label.fit(combined_data)
    train_data[col_name] =pd.DataFrame(label.transform(train_data[col_name]))
    test_data[col_name] =pd.DataFrame(label.transform(test_data[col_name]))

# Extracting new features from the dataset

In [6]:
#Extracting new features from the dataset and adding back to the dataframe

#To get purchase summary from training data to fit into testing data
def get_purchase_values(col_name):
    min_purchase=[]
    max_purchase=[]
    mean_purchase=[]
    purchase_min = train_data.groupby(col_name)['Purchase'].min()
    purchase_max = train_data.groupby(col_name)['Purchase'].max()
    purchase_mean = train_data.groupby(col_name)['Purchase'].mean()
    for row_index,row in test_data.iterrows():
        value = row[col_name]
        min_purchase.append(purchase_min.get(value,0))
        max_purchase.append(purchase_max.get(value,0))
        mean_purchase.append(purchase_mean.get(value,0))
    return min_purchase,max_purchase,mean_purchase

#To get record count from training data to fit into testing data
def get_rowcount(col_name):
    record_count=[]
    train_count = train_data.groupby(col_name)[col_name].count()
    for row_index,row in test_data.iterrows():
        value = row[col_name]
        record_count.append(train_count.get(value,0))
    return record_count

#Number of rows per age group
train_data['Age_Cnt'] = train_data.groupby('Age_Encoded')['Age_Encoded'].transform('count')
test_data['Age_Cnt'] = get_rowcount('Age_Encoded')

#Number of rows per gender
train_data['Gender_Cnt'] = train_data.groupby('Gender_M')['Gender_M'].transform('count')
test_data['Gender_Cnt'] = get_rowcount('Gender_M')

#Number of rows per Marital status
train_data['Marital_Cnt'] = train_data.groupby('Marital_Status')['Marital_Status'].transform('count')
test_data['Marital_Cnt'] = get_rowcount('Marital_Status')

#Number of rows per Occupation group
train_data['Occupation_Cnt'] = train_data.groupby('Occupation')['Occupation'].transform('count')
test_data['Occupation_Cnt'] = get_rowcount('Occupation')

#Number of rows per Stay_In_Current_City group
train_data['Stay_In_Current_City_Cnt'] = train_data.groupby('Stay_In_Current_City_Encoded')['Stay_In_Current_City_Encoded'].transform('count')
test_data['Stay_In_Current_City_Cnt'] = get_rowcount('Stay_In_Current_City_Encoded')

#Number of rows per City_Category group
train_data['City_Category_Cnt'] = train_data.groupby(['City_Category_B','City_Category_C'])['City_Category_B'].transform('count')
test_data['City_Category_Cnt'] = get_rowcount(['City_Category_B','City_Category_C'])

#Number of rows per Product_Category_1 group
train_data['Product_Category_1_Cnt'] = train_data.groupby('Product_Category_1')['Product_Category_1'].transform('count')
test_data['Product_Category_1_Cnt'] = get_rowcount('Product_Category_1')

#Number of rows per Product_Category_2 group
train_data['Product_Category_2_Cnt'] = train_data.groupby('Product_Category_2')['Product_Category_2'].transform('count')
test_data['Product_Category_2_Cnt'] = get_rowcount('Product_Category_2')

#Number of rows per Product_Category_3 group
train_data['Product_Category_3_Cnt'] = train_data.groupby('Product_Category_3')['Product_Category_3'].transform('count')
test_data['Product_Category_3_Cnt'] = get_rowcount('Product_Category_3')

#Number of rows per User_ID group
train_data['User_ID_Cnt'] = train_data.groupby('User_ID')['User_ID'].transform('count')
test_data['User_ID_Cnt'] = get_rowcount('User_ID')

#Number of rows per Product_ID group
train_data['Product_ID_Cnt'] = train_data.groupby('Product_ID')['Product_ID'].transform('count')
test_data['Product_ID_Cnt'] = get_rowcount('Product_ID')

#Min purchase price per Product_ID 
train_data['Product_min_price'] = train_data.groupby('Product_ID')['Purchase'].transform('min')

#Max purchase price per Product_ID 
train_data['Product_max_price'] = train_data.groupby('Product_ID')['Purchase'].transform('max')

#Mean purchase price per Product_ID 
train_data['Product_mean_price'] = train_data.groupby('Product_ID')['Purchase'].transform('mean')

#Min purchase price per User_ID 
train_data['User_min_price'] = train_data.groupby('User_ID')['Purchase'].transform('min')

#Max purchase price per User_ID 
train_data['User_max_price'] = train_data.groupby('User_ID')['Purchase'].transform('max')

#Mean purchase price per User_ID 
train_data['User_mean_price'] = train_data.groupby('User_ID')['Purchase'].transform('mean')

test_data['Product_min_price'],test_data['Product_max_price'],test_data['Product_mean_price']=get_purchase_values('Product_ID')
test_data['User_min_price'],test_data['User_max_price'],test_data['User_mean_price']=get_purchase_values('User_ID')

In [7]:
#input and output columns
train_y = np.array(train_data["Purchase"])
train_data.drop(["Purchase"], axis=1, inplace=True)
train_x = np.array(train_data).astype('float')
test_x = np.array(test_data).astype('float')

#splitting training data into train/validation dataset
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=1)

# First XGB model

In [8]:
# Build a XGB model - 1st
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.05
params["min_child_weight"] = 10
params["subsample"] = 0.8
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 10
params["early_stopping_rounds"] = 10
params["seed"] = 0
num_rounds = 1000
par_list_1 = list(params.items())

xgb_train_1=xgb.DMatrix(data=train_x,label=train_y)
model_1=xgb.train(par_list_1, xgb_train_1, num_rounds)
#predicting y label for validation dataset
xgb_val_1=xgb.DMatrix(val_x)
y_val_1=model_1.predict(xgb_val_1)
#predicting y label for test dataset
xgb_test_1=xgb.DMatrix(test_x)
y_pred_1=model_1.predict(xgb_test_1)

# Second XGB model

In [9]:
# Build a XGB model - 2nd 
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.05
params["min_child_weight"] = 10
params["subsample"] = 0.8
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 10
params["early_stopping_rounds"] = 10
params["seed"] = 100
num_rounds = 1500
par_list_2 = list(params.items())

xgb_train_2=xgb.DMatrix(data=train_x,label=train_y)
model_2=xgb.train(par_list_2, xgb_train_2, num_rounds)
#predicting y label for validation dataset
xgb_val_2=xgb.DMatrix(val_x)
y_val_2=model_2.predict(xgb_val_2)
#predicting y label for test dataset
xgb_test_2=xgb.DMatrix(test_x)
y_pred_2=model_2.predict(xgb_test_2)

# RMSE score of validation dataset by taking average of 2 models

In [10]:
#Taking average of 2 models for validation dataset prediction and calculatin RMSE score
y_val=(y_val_1+y_val_2)/2
y_diff=val_y-y_val
RMSE=(((np.sum(y_diff**2))/y_diff.size)**0.5)
print(f"The RMSE score of validation dataset is :{RMSE}")

The RMSE score of validation dataset is :2394.438411752429


# RMSE score of Test dataset by taking average of 2 models

In [11]:
#Taking average of 2 models for test dataset prediction
final_data = pd.DataFrame({"User_ID":test_user_id})
final_data["Product_ID"] = test_product_id
final_data["Purchase"] =(y_pred_1+y_pred_2)/2
final_data.to_csv("black_friday_pred.csv", index=False)

# The RMSE score for the test data by submitting in contest is 2467