In [26]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.sparse import coo_matrix, hstack, vstack, csr_matrix

#### Reading sessions data 


In [2]:
sessions_data = pd.read_csv("sessions.csv")
sessions_data.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


##### Removing NA and 0 time sessions

In [4]:
sessions_data = sessions_data[-(sessions_data['secs_elapsed']==0 & sessions_data['secs_elapsed'].isnull())]

#### Grouping total time by each user

In [5]:
secs_df = sessions_data[['user_id','secs_elapsed']]
secs_df=secs_df.groupby('user_id', axis=0).sum()
secs_df.reset_index(inplace=True)
secs_df.head()

Unnamed: 0,user_id,secs_elapsed
0,00023iyk9l,867896.0
1,0010k6l0om,586543.0
2,001wyh0pz8,282965.0
3,0028jgx1x1,297010.0
4,002qnbzfs5,6487080.0


In [6]:
# Removing secs_elapsed column
sessions_data.drop('secs_elapsed', axis=1,inplace = True)
sessions_data.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type
0,d1mm9tcy42,lookup,,,Windows Desktop
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop
2,d1mm9tcy42,lookup,,,Windows Desktop
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop
4,d1mm9tcy42,lookup,,,Windows Desktop


## Working on the three actions columns 

#### Filling NAs with "missing" 
- in order not to lose more data we fill NAs with word missing then will handle this by tfidf vectroizer later 

In [7]:
sessions_data['action'].fillna('missing', inplace=True)
sessions_data['action_detail'].fillna('missing', inplace=True)
sessions_data['action_type'].fillna('missing', inplace=True)

sessions_data.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type
0,d1mm9tcy42,lookup,missing,missing,Windows Desktop
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop
2,d1mm9tcy42,lookup,missing,missing,Windows Desktop
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop
4,d1mm9tcy42,lookup,missing,missing,Windows Desktop


#### Removing ( - )
- tfidf considers it as a sperator 

In [9]:
sessions_data.replace("-unknown-","unknown",inplace = True)

#### Remove spaces from device type 

In [10]:
sessions_data['device_type'] = [s.replace(' ', "_") for s in sessions_data['device_type']]

#### Adding space for string concatenation

In [11]:
sessions_data['device_type']   = ["%s "%w for w in sessions_data['device_type']]

sessions_data['action_type']   = ["%s "%w for w in sessions_data['action_type']]

sessions_data['action']        = ["%s "%w for w in sessions_data['action']]
sessions_data['action_detail'] = ["%s "%w for w in sessions_data['action_detail']]

#### Grouping all actions for each user

In [12]:
grouped = sessions_data.groupby('user_id', sort = False)[['action', 'action_type', 'action_detail']].sum()
grouped.head()

Unnamed: 0_level_0,action,action_type,action_detail
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d1mm9tcy42,lookup search_results lookup search_results lo...,missing click missing click missing click miss...,missing view_search_results missing view_searc...
yo8nz8bqcq,dashboard create confirm_email show show_perso...,view submit click view data view data data mis...,dashboard create_user confirm_email_link p3 us...
4grx6yxeby,verify create missing pending requested header...,unknown submit message_post booking_request vi...,unknown create_user message_post pending p5 he...
ncf87guaf0,lookup show search_results search_results show...,missing view click click view view click data ...,missing p3 view_search_results view_search_res...
4rvqpxoh3h,campaigns active create notifications listings...,unknown unknown unknown unknown unknown unknow...,unknown unknown unknown unknown unknown unknow...


#### Using tfidf vectorization for extracting features 
- in order to penelize words and diffreniate among important words 
- for example 'missing' will not be important

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect_action = CountVectorizer()
count_vect_action_detail = CountVectorizer()
count_vect_action_type = CountVectorizer()

tfidf_transformer_action = TfidfTransformer()
tfidf_transformer_action_dtail = TfidfTransformer()
tfidf_transformer_action_type = TfidfTransformer()

action_counts = count_vect_action.fit_transform(grouped['action'])
action_detail_counts = count_vect_action_detail.fit_transform(grouped['action_detail'])
action_type_counts = count_vect_action_type.fit_transform(grouped['action_type'])

action_tfidf = tfidf_transformer_action.fit_transform(action_counts)
action_detail_tfidf = tfidf_transformer_action_dtail.fit_transform(action_detail_counts)
action_type_tfidf = tfidf_transformer_action_type.fit_transform(action_type_counts)


In [24]:
print(action_tfidf.shape)
print(action_detail_tfidf.shape)
print(action_type_tfidf.shape)

(135483, 365)
(135483, 156)
(135483, 11)


#### Adding all in one matrix 

In [43]:
session_all = hstack([action_tfidf, action_detail_tfidf, action_type_tfidf])

#### Adding total time for each user as feature

In [51]:
sesions_features_df = pd.DataFrame(session_all.todense(),index = grouped.index)

In [53]:
secs_df.set_index('user_id',inplace=True)
sesions_features_df =pd.concat([sesions_features_df,secs_df],axis=1,sort=False)

In [54]:
sesions_features_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,523,524,525,526,527,528,529,530,531,secs_elapsed
d1mm9tcy42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.33163,0.526459,0.030854,0.668446,0.0,0.0,0.015038,0.136705,0.38233,3427529.0
yo8nz8bqcq,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.235235,0.640172,0.0,0.256297,0.0,0.0,0.213342,0.0,0.650877,207842.0
4grx6yxeby,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.202138,0.207316,0.0,0.0,0.0,0.404183,0.803738,0.102759,1135444.0
ncf87guaf0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.802575,0.432277,0.0,0.368902,0.0,0.0,0.011373,0.116311,0.138791,3755100.0
4rvqpxoh3h,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2555.0


In [56]:
sesions_features_df.to_csv('session_features.csv')