In [None]:
# Some of the code refered from: https://www.kaggle.com/svpons/feature-engineering

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# session

In [3]:
df_session = pd.read_csv('sessions.csv')

In [4]:
df_session.head(10)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,jpmpj40cvk,lookup,,,Windows Desktop,957.0
5,jpmpj40cvk,search_results,click,view_search_results,Windows Desktop,965.0
6,jpmpj40cvk,search_results,click,view_search_results,Windows Desktop,1021.0
7,jpmpj40cvk,search_results,click,view_search_results,Windows Desktop,938.0
8,jpmpj40cvk,search_results,click,view_search_results,Windows Desktop,2702.0
9,jpmpj40cvk,ajax_refresh_subtotal,click,change_trip_characteristics,Windows Desktop,674.0


In [5]:
df_session.isnull().sum()

user_id            2797
action             7873
action_type      143097
action_detail    143097
device_type           0
secs_elapsed      11067
dtype: int64

In [6]:
df_session.action = df_session.action.fillna("NAN")
df_session.action_type = df_session.action_type.fillna("NAN")
df_session.action_detail = df_session.action_detail.fillna("NAN")
df_session.device_type = df_session.device_type.fillna("NAN")

In [7]:
df_session.action.value_counts()
df_session.action_type.value_counts()
df_session.action_detail.value_counts()
df_session.device_type.value_counts()

Mac Desktop                         478574
Windows Desktop                     326346
Android App Unknown Phone/Tablet     98002
iPad Tablet                          72071
iPhone                               43406
-unknown-                            16245
Tablet                                5322
Linux Desktop                         3552
Chromebook                            3075
Android Phone                         1606
iPodtouch                              289
Blackberry                              53
Windows Phone                           29
Opera Phone                              5
Name: device_type, dtype: int64

In [8]:
# action values with low frequency are changed to "Other"

# threshold
act_freq = 100

act = dict(zip(*np.unique(df_session.action, return_counts=True)))
df_session.action = df_session.action.apply(lambda x : "Other" if act[x] < act_freq else x)

In [9]:
f_act = df_session.action.value_counts().argsort()
f_act_detail = df_session.action_detail.value_counts().argsort()
f_act_type = df_session.action_type.value_counts().argsort()
f_dev_type = df_session.device_type.value_counts().argsort()

In [10]:
print(f_act.size)
print(f_act_detail.size)
print(f_act_type.size)
print(f_dev_type.size)

117
108
10
14


In [11]:
#group session by id.
dgr_sess = df_session.groupby(['user_id'])

In [12]:
len(dgr_sess)

15588

In [13]:
# loop on dgr_sess to create all the features
samples = []
count = 0
ln = len(dgr_sess)

for g in dgr_sess:
    gr = g[1] # date frame that contains all the data from eahc groupby value(user_id)
    
    l = []
    
    # the id
    l.append(g[0])
    
    # number of total actions
    l.append(len(gr))
    
    sev = gr.secs_elapsed.fillna(0).values
    
    #action features
    c_act = [0] * len(f_act)
    for i,v in enumerate(gr.action.values):
        c_act[f_act[v]] += 1
    _, c_act_uqc = np.unique(gr.action.values, return_counts=True)
    c_act += [len(c_act_uqc), np.mean(c_act_uqc), np.std(c_act_uqc)]
    l = l + c_act
    
    #action_detail features
    c_act_detail = [0] * len(f_act_detail)
    for i, v in enumerate(gr.action_detail.values):
        c_act_detail[f_act_detail[v]] += 1
    _, c_act_det_uqc = np.unique(gr.action_detail, return_counts=True)
    c_act_detail += [len(c_act_det_uqc), np.mean(c_act_det_uqc), np.std(c_act_det_uqc)]
    l = l + c_act_detail
    
    #action_type features
    #l_act_type = [0] * len(f_act_type)
    c_act_type = [0] * len(f_act_type)
    for i,v in enumerate(gr.action_type.values):
        #l_act_type[f_act_type[v]] += sev[i]
        c_act_type[f_act_type[v]] += 1
    #l_act_type = np.log(1 + np.array(l_act_type)).tolist()
    _, c_act_type_uqc = np.unique(gr.action_type.values, return_counts = True)
    c_act_type += [len(c_act_type_uqc), np.mean(c_act_type_uqc), np.std(c_act_type_uqc)]
    #1 = 1 + c_act_type + l_act_type
    l = l + c_act_type
    
    #device_type features
    c_dev_type = [0] * len(f_dev_type)
    for i,v in enumerate(gr.device_type.values):
        c_dev_type[f_dev_type[v]] += 1
    c_dev_type.append(len(np.unique(gr.device_type.values)))
    _, c_dev_type_uqc = np.unique(gr.device_type.values, return_counts = True)
    c_dev_type += [len(c_dev_type_uqc), np.mean(c_dev_type_uqc), np.std(c_dev_type_uqc)]
    l = l + c_dev_type
    
    #secs_elapsed features
    l_secs = [0] * 5
    l_log = [0] * 15
    if len(sev) > 0:
        #simpale statistics
        l_secs[0] = np.log(1 + np.sum(sev))
        l_secs[1] = np.log(1 + np.mean(sev))
        l_secs[2] = np.log(1 + np.std(sev))
        l_secs[3] = np.log(1 + np.median(sev))
        l_secs[4] = l_secs[0] / float(l[1])
        
        #values are grouped in 15 intervals
        log_sev = np.log(1 + sev).astype(int)
        l_log = np.bincount(log_sev, minlength = 15).tolist()
    l = l + l_secs + l_log
    
    samples.append(l)
    count += 1

In [14]:
len(samples)

15588

In [15]:
samples = np.array(samples)
samples

array([['007gj7kqdk', '9', '0', ..., '1', '0', '0'],
       ['009a40t3dk', '26', '0', ..., '1', '2', '0'],
       ['00allnceb8', '1', '0', ..., '0', '0', '0'],
       ..., 
       ['zzv8sgicbk', '25', '0', ..., '1', '0', '0'],
       ['zzvatt4dio', '422', '0', ..., '8', '3', '2'],
       ['zzywmcn0jv', '51', '0', ..., '2', '2', '0']], 
      dtype='<U16')

In [16]:
sample_ar = samples[:,1:].astype(np.float16)
sample_id = samples[:,0] # the first element(user_id)

In [17]:
sample_ar

array([[   9.,    0.,    0., ...,    1.,    0.,    0.],
       [  26.,    0.,    0., ...,    1.,    2.,    0.],
       [   1.,    0.,    0., ...,    0.,    0.,    0.],
       ..., 
       [  25.,    0.,    0., ...,    1.,    0.,    0.],
       [ 422.,    0.,    0., ...,    8.,    3.,    2.],
       [  51.,    0.,    0., ...,    2.,    2.,    0.]], dtype=float16)

In [18]:
sample_id

array(['007gj7kqdk', '009a40t3dk', '00allnceb8', ..., 'zzv8sgicbk',
       'zzvatt4dio', 'zzywmcn0jv'], 
      dtype='<U16')

In [19]:
# create dateframe
col_names = []
for i in range(len(samples[0]) - 1):
    col_names.append('c_' + str(i))

df_agg_sess = pd.DataFrame(sample_ar, columns = col_names)
df_agg_sess.id = sample_id
df_agg_sess.index = df_agg_sess.id

In [20]:
df_agg_sess

Unnamed: 0,c_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,...,c_273,c_274,c_275,c_276,c_277,c_278,c_279,c_280,c_281,c_282
007gj7kqdk,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,1.0,2.0,0.0,1.0,0.0,0.0
009a40t3dk,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,4.0,4.0,5.0,2.0,1.0,1.0,2.0,0.0
00allnceb8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00e8bokexa,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00fhpdik5t,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,5.0,2.0,10.0,4.0,3.0,0.0,0.0,0.0,0.0
00fn6wu77e,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,9.0,4.0,5.0,5.0,3.0,1.0,0.0,2.0,0.0
00fyswof3k,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,7.0,4.0,4.0,11.0,16.0,8.0,3.0,1.0,0.0
00guzlz8b8,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,6.0,8.0,2.0,4.0,4.0,6.0,4.0,1.0,0.0
00iapy7gb3,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,3.0,2.0,2.0,1.0,1.0,0.0,1.0,0.0
00od2fx9cx,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,3.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0


# -------- finish cleanning of session---------

# training and testing data

In [56]:
train = pd.read_csv("train_users.csv")
test = pd.read_csv("test_users.csv")

In [57]:
train.shape

(213451, 16)

In [58]:
test.shape

(62096, 15)

In [59]:
train.dtypes

id                          object
date_account_created        object
timestamp_first_active       int64
date_first_booking          object
gender                      object
age                        float64
signup_method               object
signup_flow                  int64
language                    object
affiliate_channel           object
affiliate_provider          object
first_affiliate_tracked     object
signup_app                  object
first_device_type           object
first_browser               object
country_destination         object
dtype: object

In [64]:
print('training dataset null value')
print(train.isnull().sum(axis=0))
print('--------------------------')
print('test dataset null value')
print(test.isnull().sum(axis=0))

training dataset null value
id                             0
date_account_created           0
timestamp_first_active         0
gender                         0
age                        87990
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked     6065
signup_app                     0
first_device_type              0
first_browser                  0
country_destination            0
dtype: int64
--------------------------
test dataset null value
id                             0
date_account_created           0
timestamp_first_active         0
gender                         0
age                        28876
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked       20
signup_app                     0
first_device_type

In [61]:
# drop date_frist_booking attribute
train.drop(['date_first_booking'], axis = 1, inplace=True)
test.drop(['date_first_booking'], axis = 1, inplace=True)

In [67]:
train.country_destination.value_counts()

NDF      124543
US        62376
other     10094
FR         5023
IT         2835
GB         2324
ES         2249
CA         1428
DE         1061
NL          762
AU          539
PT          217
Name: country_destination, dtype: int64

In [72]:
# the labels we need to predict
labels = train.country_destination.values
train.drop(['country_destination'], axis = 1, inplace=True)

In [74]:
labels
id_test = test['id']

preparing training and testing data

In [90]:
# join the two set (one hot encoding)
df = pd.concat([train, test], axis = 0, ignore_index = True)
df.shape

(275547, 14)

timestamp_first_active

In [97]:
import datetime
tfa = df.timestamp_first_active.astype(str).apply(lambda x: datetime.datetime(int (x[0:4]),
                                                                             int (x[4:6]),
                                                                             int (x[6:8]),
                                                                             int (x[8:10]),
                                                                             int (x[10:12]),
                                                                             int (x[12:])))

In [99]:
tfa.sample(5)

252227   2014-08-25 20:09:54
210705   2014-06-25 22:15:37
24808    2012-05-20 08:36:24
76313    2013-05-11 01:29:55
28949    2012-06-24 05:12:07
Name: timestamp_first_active, dtype: datetime64[ns]

In [100]:
# create feature(year, month, week)
df['tfa_year'] = np.array([x.year for x in tfa])
df['tfa_month'] = np.array([x.month for x in tfa])
df['tfa_week'] = np.array([x.week for x in tfa])

In [103]:
df.columns

Index(['id', 'date_account_created', 'timestamp_first_active', 'gender', 'age',
       'signup_method', 'signup_flow', 'language', 'affiliate_channel',
       'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
       'first_device_type', 'first_browser', 'tfa_year', 'tfa_month',
       'tfa_week'],
      dtype='object')

In [None]:
#create weekday by one hot coding
