In [206]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from sklearn.utils import resample
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import warnings
%matplotlib inline

# Data upload

In [207]:
# from google.colab import files
# files.upload()

In [208]:
train = pd.read_csv('train_8wry4cB.csv')
test = pd.read_csv('test_Yix80N0.csv')
sub = pd.read_csv('sample_submission_opxHi4g.csv')

In [209]:
train.head(2)

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male


In [210]:
test.head(2)

Unnamed: 0,session_id,startTime,endTime,ProductList
0,u12112,08/12/14 13:36,08/12/14 13:36,A00002/B00003/C00006/D19956/
1,u19725,19/12/14 13:52,19/12/14 13:52,A00002/B00005/C00067/D02026/


In [211]:
train['label']='train'
test['label']='test'
full_data = pd.concat([train,test],sort=False)

In [212]:
train.shape, test.shape, full_data.shape

((10500, 6), (4500, 5), (15000, 6))

In [213]:
full_data.reset_index(inplace=True)

full_data.drop('index',axis=1,inplace=True)

# Feature Engineering 

In [214]:
full_data['Product_Number'] = full_data['ProductList'].apply(lambda x: len(x.split(';')))

In [215]:
full_data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,label,Product_Number
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,train,4
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,train,7
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female,train,1
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female,train,3
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male,train,2


In [216]:
full_data.count()

session_id        15000
startTime         15000
endTime           15000
ProductList       15000
gender            10500
label             15000
Product_Number    15000
dtype: int64

In [217]:
full_data_2=pd.DataFrame()
m=[]
for i in range(len(full_data)):
    a=full_data.iloc[[i]]
    arr=full_data.iloc[i]['ProductList'].split(';')
    for j in range(len(arr)):
        full_data_2=full_data_2.append(a)
        m.append(arr[j])
full_data_2['ProductList']=m

In [218]:
full_data_2.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,label,Product_Number
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/,female,train,4
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D02554/,female,train,4
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28436/,female,train,4
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28437/,female,train,4
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/,male,train,7


In [219]:
full_data_2.count()

session_id        33455
startTime         33455
endTime           33455
ProductList       33455
gender            23251
label             33455
Product_Number    33455
dtype: int64

In [220]:
full_data_2.isnull().sum()

session_id            0
startTime             0
endTime               0
ProductList           0
gender            10204
label                 0
Product_Number        0
dtype: int64

In [221]:
full_data_2.fillna(0,inplace=True)

In [222]:
full_data_2.isnull().sum()

session_id        0
startTime         0
endTime           0
ProductList       0
gender            0
label             0
Product_Number    0
dtype: int64

In [223]:
full_data_2['startTime'] = pd.to_datetime(full_data_2['startTime'], format='%d/%m/%y %H:%M')
full_data_2['endTime'] = pd.to_datetime(full_data_2['endTime'], format='%d/%m/%y %H:%M')
full_data_2['Duration'] = full_data_2['endTime'] - full_data_2['startTime'] 
full_data_2['Duration'] = full_data_2['Duration'].astype('timedelta64[m]')

In [224]:
full_data_2.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,label,Product_Number,Duration
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/,female,train,4,1.0
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D02554/,female,train,4,1.0
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28436/,female,train,4,1.0
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28437/,female,train,4,1.0
1,u10253,2014-12-16 14:35:00,2014-12-16 14:41:00,A00001/B00009/C00031/D29404/,male,train,7,6.0


In [225]:
full_data_2['hour']=full_data_2['startTime'].apply(lambda x:x.hour)
full_data_2['month']=full_data_2['startTime'].apply(lambda x:x.month)
full_data_2['weekday']=full_data_2['startTime'].apply(lambda x:x.weekday())
full_data_2['day']=full_data_2['startTime'].apply(lambda x:x.day)

In [226]:
full_data_2.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,label,Product_Number,Duration,hour,month,weekday,day
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/,female,train,4,1.0,18,12,0,15
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D02554/,female,train,4,1.0,18,12,0,15
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28436/,female,train,4,1.0,18,12,0,15
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28437/,female,train,4,1.0,18,12,0,15
1,u10253,2014-12-16 14:35:00,2014-12-16 14:41:00,A00001/B00009/C00031/D29404/,male,train,7,6.0,14,12,1,16


In [227]:
bin_labels_5 = ['Winter', 'Spring', 'Summer', 'Fall']
full_data_2['season'] = pd.cut(full_data_2['month'],bins=[1,2, 5,8,11],labels=bin_labels_5)

In [228]:
full_data_2=full_data_2.fillna('Winter')

In [229]:
full_data_2.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,label,Product_Number,Duration,hour,month,weekday,day,season
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/,female,train,4,1.0,18,12,0,15,Winter
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D02554/,female,train,4,1.0,18,12,0,15,Winter
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28436/,female,train,4,1.0,18,12,0,15,Winter
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28437/,female,train,4,1.0,18,12,0,15,Winter
1,u10253,2014-12-16 14:35:00,2014-12-16 14:41:00,A00001/B00009/C00031/D29404/,male,train,7,6.0,14,12,1,16,Winter


In [230]:
bin_labels_3 = ['beg', 'mid', 'end']
full_data_2['month_time'] = pd.cut(full_data_2['month'],bins=[0,7,24,31],labels=bin_labels_3)

In [231]:
bin_labels_2 = ['weekdays', 'weekend']
full_data_2['week_time'] = pd.cut(full_data_2['weekday'],bins=[-1,4,6],labels=bin_labels_2)

In [232]:
full_data_2.head(2)

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,label,Product_Number,Duration,hour,month,weekday,day,season,month_time,week_time
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/,female,train,4,1.0,18,12,0,15,Winter,mid,weekdays
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D02554/,female,train,4,1.0,18,12,0,15,Winter,mid,weekdays


In [233]:
full_data_2.insert(5,'A_product',full_data_2['ProductList'])
full_data_2.insert(6,'B_product',full_data_2['ProductList'])
full_data_2.insert(7,'C_product',full_data_2['ProductList'])
full_data_2.insert(8,'D_product',full_data_2['ProductList'])

In [234]:
full_data_2['A_product'] = full_data_2['ProductList'].apply(lambda x:x.split('/')[0])
full_data_2['B_product'] = full_data_2['ProductList'].apply(lambda x:x.split('/')[1])
full_data_2['C_product'] = full_data_2['ProductList'].apply(lambda x:x.split('/')[2])
full_data_2['D_product'] = full_data_2['ProductList'].apply(lambda x:x.split('/')[3])

In [235]:
a=full_data_2['A_product'].nunique()
b=full_data_2['B_product'].nunique()
c=full_data_2['C_product'].nunique()
d=full_data_2['D_product'].nunique()
print(a,b,c,d)

11 86 383 21880


In [236]:
full_data_2.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,A_product,B_product,C_product,D_product,label,Product_Number,Duration,hour,month,weekday,day,season,month_time,week_time
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28435/,female,A00002,B00003,C00006,D28435,train,4,1.0,18,12,0,15,Winter,mid,weekdays
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D02554/,female,A00002,B00003,C00006,D02554,train,4,1.0,18,12,0,15,Winter,mid,weekdays
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28436/,female,A00002,B00003,C00006,D28436,train,4,1.0,18,12,0,15,Winter,mid,weekdays
0,u16159,2014-12-15 18:11:00,2014-12-15 18:12:00,A00002/B00003/C00006/D28437/,female,A00002,B00003,C00006,D28437,train,4,1.0,18,12,0,15,Winter,mid,weekdays
1,u10253,2014-12-16 14:35:00,2014-12-16 14:41:00,A00001/B00009/C00031/D29404/,male,A00001,B00009,C00031,D29404,train,7,6.0,14,12,1,16,Winter,mid,weekdays


In [237]:
full_data_2.drop(['ProductList','startTime','endTime'],axis=1,inplace=True)

In [238]:
full_data_2.head()

Unnamed: 0,session_id,gender,A_product,B_product,C_product,D_product,label,Product_Number,Duration,hour,month,weekday,day,season,month_time,week_time
0,u16159,female,A00002,B00003,C00006,D28435,train,4,1.0,18,12,0,15,Winter,mid,weekdays
0,u16159,female,A00002,B00003,C00006,D02554,train,4,1.0,18,12,0,15,Winter,mid,weekdays
0,u16159,female,A00002,B00003,C00006,D28436,train,4,1.0,18,12,0,15,Winter,mid,weekdays
0,u16159,female,A00002,B00003,C00006,D28437,train,4,1.0,18,12,0,15,Winter,mid,weekdays
1,u10253,male,A00001,B00009,C00031,D29404,train,7,6.0,14,12,1,16,Winter,mid,weekdays


In [239]:
full_data_2.columns

Index(['session_id', 'gender', 'A_product', 'B_product', 'C_product',
       'D_product', 'label', 'Product_Number', 'Duration', 'hour', 'month',
       'weekday', 'day', 'season', 'month_time', 'week_time'],
      dtype='object')

In [240]:
full_data_3=full_data_2.copy()
columns=['hour' ,'day','month','weekday','day']
for col in columns:
    full_data_3[col+'_sin']=np.sin((2*np.pi*full_data_3[col])/max(full_data_3[col]))
    full_data_3[col+'_cos']=np.cos((2*np.pi*full_data_3[col])/max(full_data_3[col]))


full_data_3=full_data_3.drop(columns,axis=1)



In [241]:
full_data_3.head()

Unnamed: 0,session_id,gender,A_product,B_product,C_product,D_product,label,Product_Number,Duration,season,month_time,week_time,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos,weekday_sin,weekday_cos
0,u16159,female,A00002,B00003,C00006,D28435,train,4,1.0,Winter,mid,weekdays,-0.979084,0.203456,5.665539e-16,-1.0,-2.449294e-16,1.0,0.0,1.0
0,u16159,female,A00002,B00003,C00006,D02554,train,4,1.0,Winter,mid,weekdays,-0.979084,0.203456,5.665539e-16,-1.0,-2.449294e-16,1.0,0.0,1.0
0,u16159,female,A00002,B00003,C00006,D28436,train,4,1.0,Winter,mid,weekdays,-0.979084,0.203456,5.665539e-16,-1.0,-2.449294e-16,1.0,0.0,1.0
0,u16159,female,A00002,B00003,C00006,D28437,train,4,1.0,Winter,mid,weekdays,-0.979084,0.203456,5.665539e-16,-1.0,-2.449294e-16,1.0,0.0,1.0
1,u10253,male,A00001,B00009,C00031,D29404,train,7,6.0,Winter,mid,weekdays,-0.631088,-0.775711,-0.2079117,-0.978148,-2.449294e-16,1.0,0.866025,0.5


In [242]:
full_data_3.columns

Index(['session_id', 'gender', 'A_product', 'B_product', 'C_product',
       'D_product', 'label', 'Product_Number', 'Duration', 'season',
       'month_time', 'week_time', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos',
       'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos'],
      dtype='object')

In [243]:
full_data_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33455 entries, 0 to 14999
Data columns (total 20 columns):
session_id        33455 non-null object
gender            33455 non-null object
A_product         33455 non-null object
B_product         33455 non-null object
C_product         33455 non-null object
D_product         33455 non-null object
label             33455 non-null object
Product_Number    33455 non-null int64
Duration          33455 non-null float64
season            33455 non-null category
month_time        33455 non-null category
week_time         33455 non-null category
hour_sin          33455 non-null float64
hour_cos          33455 non-null float64
day_sin           33455 non-null float64
day_cos           33455 non-null float64
month_sin         33455 non-null float64
month_cos         33455 non-null float64
weekday_sin       33455 non-null float64
weekday_cos       33455 non-null float64
dtypes: category(3), float64(9), int64(1), object(7)
memory usage: 4.7+ MB


# Data Encoding

In [244]:
col=['session_id','A_product','B_product','C_product','D_product','season','month_time','week_time']
lbl = LabelEncoder()
for c in col:
  full_data_3[c] = lbl.fit_transform(full_data_3[c])

In [245]:
full_data_3.head()

Unnamed: 0,session_id,gender,A_product,B_product,C_product,D_product,label,Product_Number,Duration,season,month_time,week_time,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos,weekday_sin,weekday_cos
0,6158,female,1,2,5,17574,train,4,1.0,1,0,0,-0.979084,0.203456,5.665539e-16,-1.0,-2.449294e-16,1.0,0.0,1.0
0,6158,female,1,2,5,1789,train,4,1.0,1,0,0,-0.979084,0.203456,5.665539e-16,-1.0,-2.449294e-16,1.0,0.0,1.0
0,6158,female,1,2,5,17575,train,4,1.0,1,0,0,-0.979084,0.203456,5.665539e-16,-1.0,-2.449294e-16,1.0,0.0,1.0
0,6158,female,1,2,5,17576,train,4,1.0,1,0,0,-0.979084,0.203456,5.665539e-16,-1.0,-2.449294e-16,1.0,0.0,1.0
1,252,male,0,8,30,18146,train,7,6.0,1,0,0,-0.631088,-0.775711,-0.2079117,-0.978148,-2.449294e-16,1.0,0.866025,0.5


In [246]:
full_data_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33455 entries, 0 to 14999
Data columns (total 20 columns):
session_id        33455 non-null int32
gender            33455 non-null object
A_product         33455 non-null int32
B_product         33455 non-null int32
C_product         33455 non-null int32
D_product         33455 non-null int32
label             33455 non-null object
Product_Number    33455 non-null int64
Duration          33455 non-null float64
season            33455 non-null int32
month_time        33455 non-null int32
week_time         33455 non-null int32
hour_sin          33455 non-null float64
hour_cos          33455 non-null float64
day_sin           33455 non-null float64
day_cos           33455 non-null float64
month_sin         33455 non-null float64
month_cos         33455 non-null float64
weekday_sin       33455 non-null float64
weekday_cos       33455 non-null float64
dtypes: float64(9), int32(8), int64(1), object(2)
memory usage: 4.3+ MB


# Spliting the data in train,test and val

In [247]:
train_final = full_data_3[full_data_3['label']=='train']
test_final = full_data_3[full_data_3['label']=='test']

In [248]:
train_final.drop('label',axis=1,inplace=True)
test_final.drop('label',axis=1,inplace=True)

In [249]:
test_final.drop('gender',axis=1,inplace=True)
X=train_final.drop('gender',axis=1)
Y=train_final['gender'].map({'male':1,'female':0})

In [250]:
np.random.seed(0)
val_size = 0.25
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=val_size)
print(X.shape)
print(X_train.shape)
print(X_val.shape)

(23251, 18)
(17438, 18)
(5813, 18)


# Modelling Training

## Random Forest

In [251]:
clf= RandomForestClassifier()

clf.fit(X_train,Y_train)

print("accuracy score" + " %s" % accuracy_score(Y_val, clf.predict(X_val)))

accuracy score 0.9482195079993119


## XG BOOST

In [252]:
xgb = XGBClassifier()
xgb.fit(X_train, Y_train)
print("accuracy score" + " %s" % accuracy_score(Y_val, xgb.predict(X_val)))

accuracy score 0.9559607775675211


In [253]:
results=pd.DataFrame()
results['columns']=X_train.columns
results['importances'] =xgb.feature_importances_
results.sort_values(by='importances',ascending=False,inplace=True)

results[:20]

Unnamed: 0,columns,importances
1,A_product,0.557935
0,session_id,0.056906
5,Product_Number,0.038168
9,week_time,0.036046
11,hour_cos,0.034568
13,day_cos,0.030979
16,weekday_sin,0.030252
2,B_product,0.02953
12,day_sin,0.029506
10,hour_sin,0.029101


## XGBoost model after grid search gave Validation accuracy of 96.5% , test accuracy of 94.46%. 

In [254]:

xgb = XGBClassifier()
optimization_dict = {'max_depth': [7,8],
                      'n_estimators': [250,300,350]}

grid_xgb = GridSearchCV(xgb, optimization_dict,cv=10, verbose=1)

grid_xgb.fit(X_train, Y_train)
print(model.best_params_)
print("accuracy score" + " %s" % accuracy_score(Y_val, grid_xgb.predict(X_val)))

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  4.9min finished


{'max_depth': 7, 'n_estimators': 300}
accuracy score 0.965422329261999


In [255]:
test_pred = grid_xgb.predict(test_final)
submission = pd.DataFrame({'session_id':test_final['session_id'],'gender':test_pred})
submission['gender']= submission['gender'].map({1:'female',0:'male'})

#Visualize the first 5 rows
submission.head()


Unnamed: 0,session_id,gender
10500,2111,male
10501,9724,male
10502,1794,male
10503,12638,female
10503,12638,female


In [256]:
# filename = 'Janta_hack_submission.csv'

# submission.to_csv(filename,index=False)

# print('Saved file: ' + filename)

Saved file: Janta_hack_submission.csv


In [None]:
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label = Y_encoded)

#setting parameters for lightgbm
param = {'num_leaves':30, 'objective':'binary','max_depth':8,'learning_rate':.11, 'max_bin' : 200}
param['metric'] = ['auc', 'accuracy']

lgbmodel = lgb.train(param, train_data, 4000, verbose_eval=True)
