In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [807]:
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


In [808]:
data.shape

(10500, 5)

### Converting starTime and endTime into datetime format and splitting them into columns

In [809]:
data['start_datetime'] = pd.to_datetime(data['startTime'])
data['end_datetime'] = pd.to_datetime(data['endTime'])

In [810]:
data['start_date'] = [d.date() for d in data['start_datetime']]
data['start_time'] = [d.time() for d in data['start_datetime']]
data['end_date'] = [d.date() for d in data['end_datetime']]
data['end_time'] = [d.time() for d in data['end_datetime']]

In [811]:
data.drop(['startTime', 'endTime', 'start_datetime', 'end_datetime'], axis=1, inplace=True)

In [812]:
data['start_Y'] = data['start_date'].apply(lambda x: x.year)
data['start_M'] = data['start_date'].apply(lambda x: x.month)
data['start_D'] = data['start_date'].apply(lambda x: x.day)
data['end_Y'] = data['end_date'].apply(lambda x: x.year)
data['end_M'] = data['end_date'].apply(lambda x: x.month)
data['end_D'] = data['end_date'].apply(lambda x: x.day)
data.drop(['start_date', 'end_date'], axis=1, inplace=True)

In [813]:
data['start_hour'] = data['start_time'].apply(lambda x: x.hour)
data['start_min'] = data['start_time'].apply(lambda x: x.minute)
data['start_sec'] = data['start_time'].apply(lambda x: x.second)
data['end_hour'] = data['end_time'].apply(lambda x: x.hour)
data['end_min'] = data['end_time'].apply(lambda x: x.minute)
data['end_sec'] = data['end_time'].apply(lambda x: x.second)
data.drop(['start_time', 'end_time'], axis=1, inplace=True)

In [814]:
data.head()

Unnamed: 0,session_id,ProductList,gender,start_Y,start_M,start_D,end_Y,end_M,end_D,start_hour,start_min,start_sec,end_hour,end_min,end_sec
0,u16159,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,2014,12,15,2014,12,15,18,11,0,18,12,0
1,u10253,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,2014,12,16,2014,12,16,14,35,0,14,41,0
2,u19037,A00002/B00001/C00020/D16944/,female,2014,1,12,2014,1,12,15,58,0,15,58,0
3,u14556,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female,2014,11,23,2014,11,23,2,57,0,3,0,0
4,u24295,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male,2014,12,17,2014,12,17,16,44,0,16,46,0


In [815]:
data.isnull().sum()

session_id     0
ProductList    0
gender         0
start_Y        0
start_M        0
start_D        0
end_Y          0
end_M          0
end_D          0
start_hour     0
start_min      0
start_sec      0
end_hour       0
end_min        0
end_sec        0
dtype: int64

In [816]:
data['ProductList'][0]

'A00002/B00003/C00006/D28435/;A00002/B00003/C00006/D02554/;A00002/B00003/C00006/D28436/;A00002/B00003/C00006/D28437/'

### The products list are separated by semi-colon. We can split each product into a new row

In [817]:
temp = data['ProductList'].str.split(';')
data = data.reindex(data.index.repeat(temp.apply(len)))
data['product_data'] = np.hstack(temp)

In [818]:
data.head()

Unnamed: 0,session_id,ProductList,gender,start_Y,start_M,start_D,end_Y,end_M,end_D,start_hour,start_min,start_sec,end_hour,end_min,end_sec,product_data
0,u16159,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002/B00003/C00006/D28435/
0,u16159,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002/B00003/C00006/D02554/
0,u16159,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002/B00003/C00006/D28436/
0,u16159,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002/B00003/C00006/D28437/
1,u10253,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,2014,12,16,2014,12,16,14,35,0,14,41,0,A00001/B00009/C00031/D29404/


### The product data is separated by forward slash as follows:  category id, sub category id, sub sub category id and product id. We can split this data into 4 columns 

In [819]:
data['category'] = data['product_data'].str.split('/').str[0]
data['sub_category'] = data['product_data'].str.split('/').str[1]
data['sub_sub_category'] = data['product_data'].str.split('/').str[2]
data['product'] = data['product_data'].str.split('/').str[3]

In [820]:
data.drop(['ProductList', 'product_data'], axis=1, inplace=True)

In [821]:
data.head()

Unnamed: 0,session_id,gender,start_Y,start_M,start_D,end_Y,end_M,end_D,start_hour,start_min,start_sec,end_hour,end_min,end_sec,category,sub_category,sub_sub_category,product
0,u16159,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28435
0,u16159,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D02554
0,u16159,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28436
0,u16159,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28437
1,u10253,male,2014,12,16,2014,12,16,14,35,0,14,41,0,A00001,B00009,C00031,D29404


In [822]:
data['category'].value_counts()

A00002    14931
A00003     3822
A00001     3300
A00005      413
A00004      261
A00006      182
A00011      130
A00010       75
A00007       52
A00009       47
A00008       38
Name: category, dtype: int64

### There are only 11 categories across the entire dataset

In [823]:
data['sub_category'].nunique()

85

In [824]:
data['sub_sub_category'].nunique()

360

In [825]:
data['product'].nunique()

16503

In [826]:
data.head()

Unnamed: 0,session_id,gender,start_Y,start_M,start_D,end_Y,end_M,end_D,start_hour,start_min,start_sec,end_hour,end_min,end_sec,category,sub_category,sub_sub_category,product
0,u16159,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28435
0,u16159,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D02554
0,u16159,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28436
0,u16159,female,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28437
1,u10253,male,2014,12,16,2014,12,16,14,35,0,14,41,0,A00001,B00009,C00031,D29404


In [827]:
data.gender.value_counts()

female    18410
male       4841
Name: gender, dtype: int64

In [828]:
train_data = data.drop('gender', axis=1)
train_data.head()

Unnamed: 0,session_id,start_Y,start_M,start_D,end_Y,end_M,end_D,start_hour,start_min,start_sec,end_hour,end_min,end_sec,category,sub_category,sub_sub_category,product
0,u16159,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28435
0,u16159,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D02554
0,u16159,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28436
0,u16159,2014,12,15,2014,12,15,18,11,0,18,12,0,A00002,B00003,C00006,D28437
1,u10253,2014,12,16,2014,12,16,14,35,0,14,41,0,A00001,B00009,C00031,D29404


In [829]:
y = data['gender']

In [830]:
test_data = pd.read_csv('data/test.csv')

test_data['start_datetime'] = pd.to_datetime(test_data['startTime'])
test_data['end_datetime'] = pd.to_datetime(test_data['endTime'])

test_data['start_date'] = [d.date() for d in test_data['start_datetime']]
test_data['start_time'] = [d.time() for d in test_data['start_datetime']]
test_data['end_date'] = [d.date() for d in test_data['end_datetime']]
test_data['end_time'] = [d.time() for d in test_data['end_datetime']]

test_data.drop(['startTime', 'endTime', 'start_datetime', 'end_datetime'], axis=1, inplace=True)

test_data['start_Y'] = test_data['start_date'].apply(lambda x: x.year)
test_data['start_M'] = test_data['start_date'].apply(lambda x: x.month)
test_data['start_D'] = test_data['start_date'].apply(lambda x: x.day)
test_data['end_Y'] = test_data['end_date'].apply(lambda x: x.year)
test_data['end_M'] = test_data['end_date'].apply(lambda x: x.month)
test_data['end_D'] = test_data['end_date'].apply(lambda x: x.day)
test_data.drop(['start_date', 'end_date'], axis=1, inplace=True)

test_data['start_hour'] = test_data['start_time'].apply(lambda x: x.hour)
test_data['start_min'] = test_data['start_time'].apply(lambda x: x.minute)
test_data['start_sec'] = test_data['start_time'].apply(lambda x: x.second)
test_data['end_hour'] = test_data['end_time'].apply(lambda x: x.hour)
test_data['end_min'] = test_data['end_time'].apply(lambda x: x.minute)
test_data['end_sec'] = test_data['end_time'].apply(lambda x: x.second)
test_data.drop(['start_time', 'end_time'], axis=1, inplace=True)


temp = test_data['ProductList'].str.split(';')
test_data = test_data.reindex(test_data.index.repeat(temp.apply(len)))
test_data['product_data'] = np.hstack(temp)

test_data['category'] = test_data['product_data'].str.split('/').str[0]
test_data['sub_category'] = test_data['product_data'].str.split('/').str[1]
test_data['sub_sub_category'] = test_data['product_data'].str.split('/').str[2]
test_data['product'] = test_data['product_data'].str.split('/').str[3]

test_data.drop(['ProductList', 'product_data'], axis=1, inplace=True)

test_data.head()

Unnamed: 0,session_id,start_Y,start_M,start_D,end_Y,end_M,end_D,start_hour,start_min,start_sec,end_hour,end_min,end_sec,category,sub_category,sub_sub_category,product
0,u12112,2014,8,12,2014,8,12,13,36,0,13,36,0,A00002,B00003,C00006,D19956
1,u19725,2014,12,19,2014,12,19,13,52,0,13,52,0,A00002,B00005,C00067,D02026
2,u11795,2014,1,12,2014,1,12,10,44,0,10,44,0,A00002,B00002,C00004,D12538
3,u22639,2014,8,12,2014,8,12,20,19,0,20,22,0,A00002,B00003,C00079,D22781
3,u22639,2014,8,12,2014,8,12,20,19,0,20,22,0,A00002,B00003,C00079,D22782


### To perform label encoding, we need to append train and test data and fit label encoder on it

In [831]:
combined_data = train_data.append(test_data)

In [832]:
le_cat = LabelEncoder()
le_subcat = LabelEncoder()
le_subsubcat = LabelEncoder()
le_product = LabelEncoder()
le_gender = LabelEncoder()
le_session = LabelEncoder()

In [833]:
combined_data['category'] = le_cat.fit_transform(combined_data['category'])
combined_data['sub_category'] = le_subcat.fit_transform(combined_data['sub_category'])
combined_data['sub_sub_category'] = le_subsubcat.fit_transform(combined_data['sub_sub_category'])
combined_data['product'] = le_product.fit_transform(combined_data['product'])
combined_data['session_id'] = le_session.fit_transform(combined_data['session_id'])
y = le_gender.fit_transform(y)

In [834]:
combined_data.head()

Unnamed: 0,session_id,start_Y,start_M,start_D,end_Y,end_M,end_D,start_hour,start_min,start_sec,end_hour,end_min,end_sec,category,sub_category,sub_sub_category,product
0,6158,2014,12,15,2014,12,15,18,11,0,18,12,0,1,2,5,17574
0,6158,2014,12,15,2014,12,15,18,11,0,18,12,0,1,2,5,1789
0,6158,2014,12,15,2014,12,15,18,11,0,18,12,0,1,2,5,17575
0,6158,2014,12,15,2014,12,15,18,11,0,18,12,0,1,2,5,17576
1,252,2014,12,16,2014,12,16,14,35,0,14,41,0,0,8,30,18146


In [835]:
train_data['category'] = le_cat.transform(train_data['category'])
train_data['sub_category'] = le_subcat.transform(train_data['sub_category'])
train_data['sub_sub_category'] = le_subsubcat.transform(train_data['sub_sub_category'])
train_data['product'] = le_product.transform(train_data['product'])

In [836]:
train_data['session_id'] = le_session.transform(train_data['session_id'])

In [837]:
test_data['category'] = le_cat.transform(test_data['category'])

In [838]:
test_data['sub_category'] = le_subcat.transform(test_data['sub_category'])

In [839]:
test_data['sub_sub_category'] = le_subsubcat.transform(test_data['sub_sub_category'])

In [840]:
test_data['product'] = le_product.transform(test_data['product'])

In [841]:
test_data['session_id'] = le_session.transform(test_data['session_id'])

In [842]:
test_data.head()

Unnamed: 0,session_id,start_Y,start_M,start_D,end_Y,end_M,end_D,start_hour,start_min,start_sec,end_hour,end_min,end_sec,category,sub_category,sub_sub_category,product
0,2111,2014,8,12,2014,8,12,13,36,0,13,36,0,1,2,5,12675
1,9724,2014,12,19,2014,12,19,13,52,0,13,52,0,1,4,66,1407
2,1794,2014,1,12,2014,1,12,10,44,0,10,44,0,1,1,3,8129
3,12638,2014,8,12,2014,8,12,20,19,0,20,22,0,1,2,77,14345
3,12638,2014,8,12,2014,8,12,20,19,0,20,22,0,1,2,77,14346


In [843]:
train_data.drop(['start_hour', 'start_min', 'start_sec'], axis=1, inplace=True)
train_data.drop(['end_hour', 'end_min', 'end_sec'], axis=1, inplace=True)

test_data.drop(['start_hour', 'start_min', 'start_sec'], axis=1, inplace=True)
test_data.drop(['end_hour', 'end_min', 'end_sec'], axis=1, inplace=True)

train_data.drop(['start_Y', 'start_M', 'start_D'], axis=1, inplace=True)
train_data.drop(['end_Y', 'end_M', 'end_D'], axis=1, inplace=True)

test_data.drop(['start_Y', 'start_M', 'start_D'], axis=1, inplace=True)
test_data.drop(['end_Y', 'end_M', 'end_D'], axis=1, inplace=True)

### Models without both date and time gave the best test accuracy

In [844]:
# lr = LogisticRegression()
# lr.fit(train_data, y)

In [845]:
# from sklearn.tree import DecisionTreeClassifier
# decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
# decision_tree.fit(train_data, y)

In [309]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# parameters = {
#     "n_estimators":[5,10,50,100,250],
#     "max_depth":[2,4,8,16,32,None]
# }
# clf = RandomForestClassifier()
# cv = GridSearchCV(clf,parameters,cv=5)
# cv.fit(train_data, y)

In [157]:
# cv.best_params_

In [47]:
# xgb = XGBClassifier()
# optimization_dict = {'max_depth': [2,3,4,5,6,7],
#                      'n_estimators': [50,60,70,80,90,100,150,200]}

# model = GridSearchCV(xgb, optimization_dict, 
#                      scoring='accuracy', verbose=1)

# model.fit(train_data, y)
# print(model.best_score_)
# print(model.best_params_)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 26.8min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_con...
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=False,
                                     verbosity=None),
             iid='warn', n_jobs=None,
             para

In [846]:
model = XGBClassifier(max_depth=7, n_estimators=200)
model.fit(train_data, y)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [847]:
results=pd.DataFrame()
results['columns']=train_data.columns
results['importances'] = model.feature_importances_
results.sort_values(by='importances',ascending=False,inplace=True)

results[:20]

Unnamed: 0,columns,importances
1,category,0.800943
0,session_id,0.082578
2,sub_category,0.044001
3,sub_sub_category,0.038188
4,product,0.03429


In [159]:
# print(model.best_score_)
# print(model.best_params_)

### XGBoost model after grid search gave test accuracy of 93.5%

In [848]:
y_pred = model.predict(test_data)

In [849]:
test_data['gender'] = le_gender.inverse_transform(y_pred)

In [850]:
test_data['session_id'] = le_session.inverse_transform(test_data['session_id'])

In [851]:
test_data.head()

Unnamed: 0,session_id,category,sub_category,sub_sub_category,product,gender
0,u12112,1,2,5,12675,female
1,u19725,1,4,66,1407,female
2,u11795,1,1,3,8129,female
3,u22639,1,2,77,14345,male
3,u22639,1,2,77,14346,male


In [852]:
test_data.shape

(10204, 6)

In [853]:
test_new = pd.read_csv('data/test.csv')
test_new.head()

Unnamed: 0,session_id,startTime,endTime,ProductList
0,u12112,08/12/14 13:36,08/12/14 13:36,A00002/B00003/C00006/D19956/
1,u19725,19/12/14 13:52,19/12/14 13:52,A00002/B00005/C00067/D02026/
2,u11795,01/12/14 10:44,01/12/14 10:44,A00002/B00002/C00004/D12538/
3,u22639,08/12/14 20:19,08/12/14 20:22,A00002/B00003/C00079/D22781/;A00002/B00003/C00...
4,u18034,15/12/14 19:33,15/12/14 19:33,A00002/B00001/C00010/D23419/


In [854]:
test_data = test_data.drop_duplicates(subset=['session_id'])

In [855]:
to_submit = test_data[['session_id', 'gender']]

In [856]:
to_submit.head()

Unnamed: 0,session_id,gender
0,u12112,female
1,u19725,female
2,u11795,female
3,u22639,male
4,u18034,male


In [857]:
to_submit.gender.value_counts()

female    3627
male       873
Name: gender, dtype: int64

In [858]:
to_submit.to_csv('data/xgb_top_feat.csv',index = False)