In [None]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit
from IPython.display import display

In [2]:
# Load the raw session log. The user key arrives as 'user_id'; it is moved to a
# column called 'id' (placed last), which is the name the later cells group on.
df_sessions = pd.read_csv("Data/sessions.csv")
df_sessions['id'] = df_sessions.pop('user_id')

In [3]:
# Preparing Session data
print('Working on Session data...')

# Missing categorical values are replaced by the literal string 'NAN' so that
# "missing" is counted like any other category later on.
for _col in ('action', 'action_type', 'action_detail', 'device_type'):
    df_sessions[_col] = df_sessions[_col].fillna('NAN')

# Rare actions (fewer than act_freq occurrences overall) are collapsed to 'OTHER'.
act_freq = 100  # Threshold for frequency
_act_labels, _act_counts = np.unique(df_sessions.action, return_counts=True)
act = dict(zip(_act_labels, _act_counts))  # action label -> number of rows
_is_frequent = df_sessions.action.map(act) >= act_freq
df_sessions.action = df_sessions.action.where(_is_frequent, 'OTHER')

Working on Session data...


In [6]:
# Label -> position lookups used for the one-hot style count vectors.
# value_counts() yields one entry per distinct label; argsort() keeps that label
# index and replaces the counts by integer positions (a permutation of 0..n-1),
# so each label ends up with its own slot.
f_act, f_act_detail, f_act_type, f_dev_type = (
    df_sessions[_name].value_counts().argsort()
    for _name in ('action', 'action_detail', 'action_type', 'device_type')
)

In [12]:
# Show the action_detail lookup: a Series indexed by action_detail label whose
# values are the integer positions produced by value_counts().argsort().
display(f_act_detail)

view_search_results               155
p3                                152
NAN                               154
-unknown-                         153
wishlist_content_update           151
user_profile                      149
change_trip_characteristics       150
similar_listings                  148
user_social_connections           147
update_listing                    146
listing_reviews                   145
dashboard                         144
user_wishlists                    143
header_userpic                    142
message_thread                    141
edit_profile                      140
message_post                      139
contact_host                      138
unavailable_dates                 137
confirm_email_link                136
create_user                       135
change_contact_host_dates         134
user_profile_content_update       133
user_reviews                      132
p5                                131
login                             130
your_trips  

In [13]:
# Grouping sessions by id. Features are computed from all rows with the same id.
# The key is passed as a scalar (not a one-element list): when iterating, a list
# key makes pandas >= 2.0 yield 1-tuples such as ('abc',) instead of 'abc', which
# would end up in the feature rows as the sample id.
gr_sess = df_sessions.groupby('id')

In [None]:
# Loop over gr_sess to create all the features: one row per id, laid out as
# [id, n_rows, action..., action_detail..., action_type..., device_type..., secs...].

N_LOG_BINS = 15  # number of histogram bins for int(log(1 + secs_elapsed))


def _count_features(values, col_of):
    """Count label occurrences and summarise them.

    Parameters
    ----------
    values : array-like of labels (the rows of one id).
    col_of : mapping from label to its position in the count vector.

    Returns
    -------
    list
        ``len(col_of)`` occurrence counts followed by the number of distinct
        labels present in ``values`` and the mean and std of their counts.
    """
    counts = [0] * len(col_of)
    labels, label_counts = np.unique(values, return_counts=True)
    for label, n_occ in zip(labels, label_counts):
        counts[col_of[label]] += int(n_occ)
    return counts + [len(label_counts), np.mean(label_counts), np.std(label_counts)]


# Plain dicts: the lookups below run once per label/row, and a dict lookup is far
# cheaper than label-indexing a pandas Series each time.
act_col = dict(f_act.items())
act_detail_col = dict(f_act_detail.items())
act_type_col = dict(f_act_type.items())
dev_type_col = dict(f_dev_type.items())

samples = []
cont = 0
ln = len(gr_sess)
for key, gr in gr_sess:
    if cont % 10000 == 0:
        print("%s from %s" % (cont, ln))

    # A groupby built from a one-element list yields 1-tuple keys on recent
    # pandas; unwrap so the id is always the bare value.
    sess_id = key[0] if isinstance(key, tuple) else key

    # Id, then the first feature: number of rows (activity count).
    l = [sess_id, len(gr)]

    # Missing secs_elapsed values count as 0 seconds.
    sev = gr.secs_elapsed.fillna(0).values

    # action / action_detail features
    # (how many times each value occurs, numb of unique values, mean and std)
    l += _count_features(gr.action.values, act_col)
    l += _count_features(gr.action_detail.values, act_detail_col)

    # action_type features
    # (counts, numb of unique values, mean and std
    #  + log of the sum of secs_elapsed for each value)
    type_feats = _count_features(gr.action_type.values, act_type_col)
    secs_by_type = [0] * len(act_type_col)
    for label, secs in zip(gr.action_type.values, sev):
        secs_by_type[act_type_col[label]] += secs
    l += type_feats + np.log(1 + np.array(secs_by_type)).tolist()

    # device_type features
    # (counts, numb of unique values, mean and std)
    dev_feats = _count_features(gr.device_type.values, dev_type_col)
    # The number of distinct device types is emitted twice in a row; this keeps
    # the column positions (c_i) identical to the original feature layout.
    n_dev_slots = len(dev_type_col)
    dev_feats.insert(n_dev_slots, dev_feats[n_dev_slots])
    l += dev_feats

    # secs_elapsed features
    l_secs = [0] * 5
    l_log = [0] * N_LOG_BINS
    if len(sev) > 0:
        # Simple statistics about the secs_elapsed values (log-scaled).
        l_secs[0] = np.log(1 + np.sum(sev))
        l_secs[1] = np.log(1 + np.mean(sev))
        l_secs[2] = np.log(1 + np.std(sev))
        l_secs[3] = np.log(1 + np.median(sev))
        l_secs[4] = l_secs[0] / float(l[1])  # log-total divided by the row count

        # Histogram of int(log(1 + secs)). Indices are clamped to the edge bins
        # so every row gets exactly N_LOG_BINS entries: np.bincount would
        # otherwise return a longer vector for very large values (ragged rows)
        # or raise on negative indices.
        log_sev = np.clip(np.log(1 + sev).astype(int), 0, N_LOG_BINS - 1)
        l_log = np.bincount(log_sev, minlength=N_LOG_BINS).tolist()
    l = l + l_secs + l_log

    # The list l has the feature values of one sample.
    samples.append(l)
    cont += 1

# Creating a dataframe with the computed features.
# Each row of samples is [id, feature_0, feature_1, ...]; the feature columns
# are named c_0, c_1, ...
col_names = ['c_' + str(i) for i in range(len(samples[0]) - 1)]

# preparing objects
# dtype=object keeps every cell as the Python/NumPy object it already is. Without
# it, rows that mix a string id with numbers make NumPy pick one unicode dtype
# for the whole array, turning every number into text that then has to be parsed
# back, at a large memory cost.
samples = np.array(samples, dtype=object)
# NOTE: float16 holds integers exactly only up to 2048 and overflows to inf
# above 65504; larger counts are rounded.
samp_ar = samples[:, 1:].astype(np.float16)
samp_id = samples[:, 0]   # The first element is the id of the sample.

# creating the dataframe
df_agg_sess = pd.DataFrame(samp_ar, columns=col_names)
df_agg_sess['id'] = samp_id
df_agg_sess.index = df_agg_sess.id

In [None]:
# Merging train-test with session data
# df_all = pd.merge(df_tt, df_agg_sess, how='left')
# df_all = df_all.drop(['id'], axis=1)
# df_all = df_all.fillna(-2)  #Missing features for samples without session data.
# #All types of null 
# df_all['all_null'] = np.array([sum(r<0) for r in df_all.values])