In [283]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [284]:
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


In [285]:
data.shape

(10500, 5)

In [286]:
data['session_id'].nunique()

10500

### There are no repeated session ids and so we can drop this column

In [375]:
data['start_datetime'] = pd.to_datetime(data['startTime'])
data['end_datetime'] = pd.to_datetime(data['endTime'])

In [376]:
data['start_date'] = [d.date() for d in data['start_datetime']]
data['start_time'] = [d.time() for d in data['start_datetime']]
data['end_date'] = [d.date() for d in data['end_datetime']]
data['end_time'] = [d.time() for d in data['end_datetime']]

In [377]:
data.drop([''])

Unnamed: 0,session_id,startTime,endTime,gender,category,sub_category,sub_sub_category,product,start_datetime,end_datatime,start_date,start_time,end_datetime,end_date,end_time
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28435,2014-12-15 18:11:00,2014-12-15 18:12:00,2014-12-15,18:11:00,2014-12-15 18:12:00,2014-12-15,18:12:00
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D02554,2014-12-15 18:11:00,2014-12-15 18:12:00,2014-12-15,18:11:00,2014-12-15 18:12:00,2014-12-15,18:12:00
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28436,2014-12-15 18:11:00,2014-12-15 18:12:00,2014-12-15,18:11:00,2014-12-15 18:12:00,2014-12-15,18:12:00
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28437,2014-12-15 18:11:00,2014-12-15 18:12:00,2014-12-15,18:11:00,2014-12-15 18:12:00,2014-12-15,18:12:00
1,u10253,16/12/14 14:35,16/12/14 14:41,male,A00001,B00009,C00031,D29404,2014-12-16 14:35:00,2014-12-16 14:41:00,2014-12-16,14:35:00,2014-12-16 14:41:00,2014-12-16,14:41:00


In [287]:
data.isnull().sum()

session_id     0
startTime      0
endTime        0
ProductList    0
gender         0
dtype: int64

In [288]:
data['ProductList'][0]

'A00002/B00003/C00006/D28435/;A00002/B00003/C00006/D02554/;A00002/B00003/C00006/D28436/;A00002/B00003/C00006/D28437/'

### The products list are separated by semi-colon. We can split each product into a new row

In [289]:
temp = data['ProductList'].str.split(';')
data = data.reindex(data.index.repeat(temp.apply(len)))
data['product_data'] = np.hstack(temp)

In [290]:
data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,product_data
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,A00002/B00003/C00006/D28435/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,A00002/B00003/C00006/D02554/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,A00002/B00003/C00006/D28436/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,A00002/B00003/C00006/D28437/
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,A00001/B00009/C00031/D29404/


### The product data is separated by forward slash as follows:  category id, sub category id, sub sub category id and product id. We can split this data into 4 columns 

In [291]:
data['category'] = data['product_data'].str.split('/').str[0]
data['sub_category'] = data['product_data'].str.split('/').str[1]
data['sub_sub_category'] = data['product_data'].str.split('/').str[2]
data['product'] = data['product_data'].str.split('/').str[3]

In [292]:
data.drop(['ProductList', 'product_data'], axis=1, inplace=True)

In [293]:
data.head()

Unnamed: 0,session_id,startTime,endTime,gender,category,sub_category,sub_sub_category,product
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28435
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D02554
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28436
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28437
1,u10253,16/12/14 14:35,16/12/14 14:41,male,A00001,B00009,C00031,D29404


In [294]:
data['category'].value_counts()

A00002    14931
A00003     3822
A00001     3300
A00005      413
A00004      261
A00006      182
A00011      130
A00010       75
A00007       52
A00009       47
A00008       38
Name: category, dtype: int64

### There are only 11 categories across the entire dataset

In [295]:
data['sub_category'].nunique()

85

In [296]:
data['sub_sub_category'].nunique()

360

In [297]:
data['product'].nunique()

16503

In [298]:
data.head()

Unnamed: 0,session_id,startTime,endTime,gender,category,sub_category,sub_sub_category,product
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28435
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D02554
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28436
0,u16159,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28437
1,u10253,16/12/14 14:35,16/12/14 14:41,male,A00001,B00009,C00031,D29404


### Fitting a model without considering start time and end time

In [312]:
train_data = data.drop(['startTime', 'endTime', 'gender'], axis=1)
train_data.head()

Unnamed: 0,session_id,category,sub_category,sub_sub_category,product
0,u16159,A00002,B00003,C00006,D28435
0,u16159,A00002,B00003,C00006,D02554
0,u16159,A00002,B00003,C00006,D28436
0,u16159,A00002,B00003,C00006,D28437
1,u10253,A00001,B00009,C00031,D29404


In [313]:
y = data['gender']

In [314]:
test_data = pd.read_csv('data/test.csv')
test_data.drop(['startTime', 'endTime'], axis=1, inplace=True)

temp = test_data['ProductList'].str.split(';')
test_data = test_data.reindex(test_data.index.repeat(temp.apply(len)))
test_data['product_data'] = np.hstack(temp)

test_data['category'] = test_data['product_data'].str.split('/').str[0]
test_data['sub_category'] = test_data['product_data'].str.split('/').str[1]
test_data['sub_sub_category'] = test_data['product_data'].str.split('/').str[2]
test_data['product'] = test_data['product_data'].str.split('/').str[3]

test_data.drop(['ProductList', 'product_data'], axis=1, inplace=True)

test_data.head()

Unnamed: 0,session_id,category,sub_category,sub_sub_category,product
0,u12112,A00002,B00003,C00006,D19956
1,u19725,A00002,B00005,C00067,D02026
2,u11795,A00002,B00002,C00004,D12538
3,u22639,A00002,B00003,C00079,D22781
3,u22639,A00002,B00003,C00079,D22782


### To perform label encoding, we need to append train and test data and fit label encoder on it

In [315]:
combined_data = train_data.append(test_data)

In [316]:
le_cat = LabelEncoder()
le_subcat = LabelEncoder()
le_subsubcat = LabelEncoder()
le_product = LabelEncoder()
le_gender = LabelEncoder()
le_session = LabelEncoder()

In [317]:
combined_data['category'] = le_cat.fit_transform(combined_data['category'])
combined_data['sub_category'] = le_subcat.fit_transform(combined_data['sub_category'])
combined_data['sub_sub_category'] = le_subsubcat.fit_transform(combined_data['sub_sub_category'])
combined_data['product'] = le_product.fit_transform(combined_data['product'])
combined_data['session_id'] = le_session.fit_transform(combined_data['session_id'])
y = le_gender.fit_transform(y)

In [318]:
combined_data.head()

Unnamed: 0,session_id,category,sub_category,sub_sub_category,product
0,6158,1,2,5,17574
0,6158,1,2,5,1789
0,6158,1,2,5,17575
0,6158,1,2,5,17576
1,252,0,8,30,18146


In [319]:
train_data['category'] = le_cat.transform(train_data['category'])
train_data['sub_category'] = le_subcat.transform(train_data['sub_category'])
train_data['sub_sub_category'] = le_subsubcat.transform(train_data['sub_sub_category'])
train_data['product'] = le_product.transform(train_data['product'])

In [320]:
train_data['session_id'] = le_session.transform(train_data['session_id'])

In [321]:
test_data['category'] = le_cat.transform(test_data['category'])

In [322]:
test_data['sub_category'] = le_subcat.transform(test_data['sub_category'])

In [323]:
test_data['sub_sub_category'] = le_subsubcat.transform(test_data['sub_sub_category'])

In [324]:
test_data['product'] = le_product.transform(test_data['product'])

In [326]:
test_data['session_id'] = le_session.transform(test_data['session_id'])

In [327]:
test_data.head()

Unnamed: 0,session_id,category,sub_category,sub_sub_category,product
0,2111,1,2,5,12675
1,9724,1,4,66,1407
2,1794,1,1,3,8129
3,12638,1,2,77,14345
3,12638,1,2,77,14346


In [328]:
lr = LogisticRegression()
lr.fit(train_data, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [329]:
y_pred = lr.predict(test_data)

In [332]:
test_data['gender'] = le_gender.inverse_transform(y_pred)

In [338]:
test_data['session_id'] = le_session.inverse_transform(test_data['session_id'])

In [339]:
test_data.head()

Unnamed: 0,session_id,category,sub_category,sub_sub_category,product,gender
0,u12112,1,2,5,12675,female
1,u19725,1,4,66,1407,female
2,u11795,1,1,3,8129,female
3,u22639,1,2,77,14345,female
3,u22639,1,2,77,14346,female


In [340]:
test_data.shape

(10204, 6)

In [341]:
test_new = pd.read_csv('data/test.csv')
test_new.head()

Unnamed: 0,session_id,startTime,endTime,ProductList
0,u12112,08/12/14 13:36,08/12/14 13:36,A00002/B00003/C00006/D19956/
1,u19725,19/12/14 13:52,19/12/14 13:52,A00002/B00005/C00067/D02026/
2,u11795,01/12/14 10:44,01/12/14 10:44,A00002/B00002/C00004/D12538/
3,u22639,08/12/14 20:19,08/12/14 20:22,A00002/B00003/C00079/D22781/;A00002/B00003/C00...
4,u18034,15/12/14 19:33,15/12/14 19:33,A00002/B00001/C00010/D23419/


In [350]:
test_data = test_data.drop_duplicates(subset=['session_id'])

In [354]:
to_submit = test_data[['session_id', 'gender']]

In [355]:
to_submit.head()

Unnamed: 0,session_id,gender
0,u12112,female
1,u19725,female
2,u11795,female
3,u22639,female
4,u18034,female


In [360]:
to_submit.to_csv('data/first_submission.csv',index = False)

### The above solution resulted in an accuracy of 81% on test data. We have ignored the start time and end time till now. These can be converted into separate features to check if they increase our classification accuracy.