In [65]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [49]:
# Load the raw training data from CSV and preview the first five rows.
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


In [50]:
# Dimensions of the raw frame as (rows, columns).
data.shape

(10500, 5)

In [51]:
# Number of distinct session ids; compare with the row count to see whether any id repeats.
data['session_id'].nunique()

10500

### There are no repeated session ids and so we can drop this column

In [52]:
# Count of missing values in each column.
data.isnull().sum()

session_id     0
startTime      0
endTime        0
ProductList    0
gender         0
dtype: int64

In [53]:
# Look at one raw ProductList value (the row labelled 0) to learn its format.
data.loc[0, 'ProductList']

'A00002/B00003/C00006/D28435/;A00002/B00003/C00006/D02554/;A00002/B00003/C00006/D28436/;A00002/B00003/C00006/D28437/'

### The products in each list are separated by semicolons. We can split the list so that each product gets its own row

In [54]:
# Break every ';'-separated ProductList string into a list of product paths.
products_per_row = data['ProductList'].str.split(';')
products_in_row = products_per_row.map(len)

# Repeat each row once per product it contains (the index label is repeated
# too), then attach the flattened products in the same order as a new column.
data = data.reindex(data.index.repeat(products_in_row))
data['product_data'] = np.concatenate(products_per_row.tolist())

In [55]:
# Preview the frame after the split: product_data holds one product path per row.
data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,product_data
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,A00002/B00003/C00006/D28435/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,A00002/B00003/C00006/D02554/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,A00002/B00003/C00006/D28436/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,A00002/B00003/C00006/D28437/
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,A00001/B00009/C00031/D29404/


### Each product entry is a forward-slash-separated path made up of: category id, sub-category id, sub-sub-category id and product id. We can split this data into 4 columns

In [56]:
# Split each 'category/sub_category/sub_sub_category/product/' path once and
# reuse the pieces, instead of re-splitting the same column for every field.
# The trailing '/' produces an extra empty piece at position 4, which is
# simply not used.
path_parts = data['product_data'].str.split('/')

# Position in the path -> name of the column that receives that piece.
path_columns = ['category', 'sub_category', 'sub_sub_category', 'product']
for position, column_name in enumerate(path_columns):
    data[column_name] = path_parts.str[position]

In [57]:
# Remove the columns that are no longer needed: the session id, the raw
# ProductList string and the per-product path that has been split into its
# own columns. The repeated row index (visible in the previews) still shows
# which original row each product came from.
# The result is reassigned rather than dropped with inplace=True, so the step
# is an ordinary expression and no frame is mutated behind an existing
# reference.
data = data.drop(['session_id', 'ProductList', 'product_data'], axis=1)

In [58]:
# Preview the frame after dropping the id, raw list and intermediate path columns.
data.head()

Unnamed: 0,startTime,endTime,gender,category,sub_category,sub_sub_category,product
0,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28435
0,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D02554
0,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28436
0,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28437
1,16/12/14 14:35,16/12/14 14:41,male,A00001,B00009,C00031,D29404


In [59]:
# Number of product rows per top-level category id, most frequent first.
data['category'].value_counts()

A00002    14931
A00003     3822
A00001     3300
A00005      413
A00004      261
A00006      182
A00011      130
A00010       75
A00007       52
A00009       47
A00008       38
Name: category, dtype: int64

### There are only 11 categories across the entire dataset

In [60]:
# Number of distinct sub-category ids.
data['sub_category'].nunique()

85

In [61]:
# Number of distinct sub-sub-category ids.
data['sub_sub_category'].nunique()

360

In [62]:
# Number of distinct product ids.
data['product'].nunique()

16503

In [63]:
# Preview the frame again before building the modelling table; the cells since
# the previous preview only computed counts, so the frame is unchanged.
data.head()

Unnamed: 0,startTime,endTime,gender,category,sub_category,sub_sub_category,product
0,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28435
0,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D02554
0,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28436
0,15/12/14 18:11,15/12/14 18:12,female,A00002,B00003,C00006,D28437
1,16/12/14 14:35,16/12/14 14:41,male,A00001,B00009,C00031,D29404


### Fitting a model without considering start time and end time

In [64]:
# Build the modelling table by leaving out the two timestamp columns;
# `data` itself is not modified.
time_columns = ['startTime', 'endTime']
model_data = data.drop(labels=time_columns, axis='columns')
model_data.head()

Unnamed: 0,gender,category,sub_category,sub_sub_category,product
0,female,A00002,B00003,C00006,D28435
0,female,A00002,B00003,C00006,D02554
0,female,A00002,B00003,C00006,D28436
0,female,A00002,B00003,C00006,D28437
1,male,A00001,B00009,C00031,D29404


### All columns need to be label-encoded before a model can be fitted.