## Walkthrough of Data Science - Traveler

### * Goal: Predict the country that users will make their first booking in, based on some basic user profile data.


#### [1] Pre-processing: Assessing and analyzing data, cleaning, transforming and adding new features
#### [2] Learning model: Constructing and testing learning model
#### [3] Post-processing: Creating final predictions


# LAB 1 CODE

In [7]:
##Exploring Traveler data
import pandas as pd
import matplotlib.pyplot as plt
%pylab inline 

print("Reading data...")
train_file = "/home/ubuntu/datamininglab/DMALab-master/Lab1/train_users_2.csv"
df_train = pd.read_csv(train_file, header = 0,index_col=None)

test_file = "/home/ubuntu/datamininglab/DMALab-master/Lab1/test_users.csv"
df_test = pd.read_csv(test_file, header = 0,index_col=None)

# Combining into one dataset for cleaning
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
print("Reading data...completed")

# Fixing date formats in Pandas - to_datetime
## Change dates to specific format
print("Fixing timestamps...")
df_all['date_account_created'] = pd.to_datetime(df_all['date_account_created'], format='%Y-%m-%d')
df_all['timestamp_first_active'] = pd.to_datetime(df_all['timestamp_first_active'], format='%Y%m%d%H%M%S')
print("Fixing timestamps...completed")

## Removing date_first_booking column
df_all.drop('date_first_booking', axis = 1, inplace = True)
print("Droped date_first_booking column...")

import numpy as np

## Remove outliers function - [1]
def remove_outliers(df, column, min_val, max_val):
    col_values = df[column].values
    df[column] = np.where(np.logical_or(col_values<=min_val, col_values>=max_val), np.NaN, col_values)
    return df

## Fixing age column - [2]
print("Fixing age column...")
df_all = remove_outliers(df = df_all, column = 'age', min_val = 15, max_val = 90)
df_all['age'].fillna(-1, inplace = True)
print("Fixing age column...completed")

# Other column missing value - Fill first_affiliate_tracked column
print("Filling first_affiliate_tracked column...")
df_all['first_affiliate_tracked'].fillna(-1, inplace=True)
print("Filling first_affiliate_tracked column...completed")

df_all.head()

Populating the interactive namespace from numpy and matplotlib
Reading data...
Reading data...completed
Fixing timestamps...
Fixing timestamps...completed
Droped date_first_booking column...
Fixing age column...




Fixing age column...completed
Filling first_affiliate_tracked column...
Filling first_affiliate_tracked column...completed


Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,first_affiliate_tracked,first_browser,first_device_type,gender,id,language,signup_app,signup_flow,signup_method,timestamp_first_active
0,direct,direct,-1.0,NDF,2010-06-28,untracked,Chrome,Mac Desktop,-unknown-,gxn3p5htnn,en,Web,0,facebook,2009-03-19 04:32:55
1,seo,google,38.0,NDF,2011-05-25,untracked,Chrome,Mac Desktop,MALE,820tgsjxq7,en,Web,0,facebook,2009-05-23 17:48:09
2,direct,direct,56.0,US,2010-09-28,untracked,IE,Windows Desktop,FEMALE,4ft3gnwmtx,en,Web,3,basic,2009-06-09 23:12:47
3,direct,direct,42.0,other,2011-12-05,untracked,Firefox,Mac Desktop,FEMALE,bjjt8pjhuk,en,Web,0,facebook,2009-10-31 06:01:29
4,direct,direct,41.0,US,2010-09-14,untracked,Chrome,Mac Desktop,-unknown-,87mebub9p4,en,Web,0,basic,2009-12-08 06:11:05


# LAB 2 CODE

In [8]:
# Own implementation of One Hot Encoding - Data Transformation
def convert_to_binary(df, column_to_convert):
    categories = list(df[column_to_convert].drop_duplicates())

    for category in categories:
        cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
        col_name = column_to_convert[:5] + '_' + cat_name[:10]
        df[col_name] = 0
        df.loc[(df[column_to_convert] == category), col_name] = 1

    return df

# One Hot Encoding
print("One Hot Encoding categorical data...")
columns_to_convert = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']

for column in columns_to_convert:
    df_all = convert_to_binary(df=df_all, column_to_convert=column)
    df_all.drop(column, axis=1, inplace=True)
print("One Hot Encoding categorical data...completed")

# Add new date related fields - Creating New Features
print("Adding new fields...")
df_all['day_account_created'] = df_all['date_account_created'].dt.weekday
df_all['month_account_created'] = df_all['date_account_created'].dt.month
df_all['quarter_account_created'] = df_all['date_account_created'].dt.quarter
df_all['year_account_created'] = df_all['date_account_created'].dt.year
df_all['hour_first_active'] = df_all['timestamp_first_active'].dt.hour
df_all['day_first_active'] = df_all['timestamp_first_active'].dt.weekday
df_all['month_first_active'] = df_all['timestamp_first_active'].dt.month
df_all['quarter_first_active'] = df_all['timestamp_first_active'].dt.quarter
df_all['year_first_active'] = df_all['timestamp_first_active'].dt.year
df_all['created_less_active'] = (df_all['date_account_created'] - df_all['timestamp_first_active']).dt.days
print("Adding new fields...completed")


# Drop unnecessary columns
print("Droping fields...")
columns_to_drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'country_destination']
for column in columns_to_drop:
    if column in df_all.columns:
        df_all.drop(column, axis=1, inplace=True)
print("Droping fields...completed")

## Understanding the sessions.csv data
## Loading sessions.csv data
print("Reading sessions data...")
sessions_file = "/home/ubuntu/Downloads/sessions.csv"
df_sessions = pd.read_csv(sessions_file, header = 0,index_col=False)
print("Reading sessions data...completed")

## Cleaning and Transforming the Data
# Determine primary device
print("Determing primary device...")
sessions_device = df_sessions.loc[:, ['user_id', 'device_type', 'secs_elapsed']]
aggregated_lvl1 = sessions_device.groupby(['user_id', 'device_type'], as_index=False, sort=False).aggregate(np.sum)
#aggregated_lvl1.head(10)
idx = aggregated_lvl1.groupby(['user_id'], sort=False)['secs_elapsed'].transform(max) == aggregated_lvl1['secs_elapsed']
#idx.head(10)
df_sessions_primary = pd.DataFrame(aggregated_lvl1.loc[idx , ['user_id', 'device_type', 'secs_elapsed']])
#df_sessions_primary.head(10)
df_sessions_primary.rename(columns = {'device_type':'primary_device', 'secs_elapsed':'primary_secs'}, inplace=True)
#df_sessions_primary.head(10)
# Call user defined One Hot Encoding function
df_sessions_primary = convert_to_binary(df=df_sessions_primary, column_to_convert='primary_device')
#df_sessions_primary.head()
df_sessions_primary.drop('primary_device', axis=1, inplace=True)
#df_sessions_primary.head()
print("Determing primary device...completed")

# Determine Secondary device
print("Determing secondary device...")
remaining = aggregated_lvl1.drop(aggregated_lvl1.index[idx])
remaining.head()
idx = remaining.groupby(['user_id'], sort=False)['secs_elapsed'].transform(max) == remaining['secs_elapsed']
df_sessions_secondary = pd.DataFrame(remaining.loc[idx , ['user_id', 'device_type', 'secs_elapsed']])
df_sessions_secondary.rename(columns = {'device_type':'secondary_device', 'secs_elapsed':'secondary_secs'}, inplace=True)
df_sessions_secondary = convert_to_binary(df=df_sessions_secondary, column_to_convert='secondary_device')
df_sessions_secondary.drop('secondary_device', axis=1, inplace=True)
print("Determing secondary device...completed")

# Determine Counts of Actions - Looping Through the Actions Columns
# Count occurrences of value in a column
def convert_to_counts(df, id_col, column_to_convert):
    id_list = df[id_col].drop_duplicates()

    df_counts = df.loc[:,[id_col, column_to_convert]]
    df_counts['count'] = 1
    df_counts = df_counts.groupby(by=[id_col, column_to_convert], as_index=False, sort=False).sum()

    new_df = df_counts.pivot(index=id_col, columns=column_to_convert, values='count')
    new_df = new_df.fillna(0)

# Rename Columns
    categories = list(df[column_to_convert].drop_duplicates())
    for category in categories:
        cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
        col_name = column_to_convert + '_' + cat_name
        new_df.rename(columns = {category:col_name}, inplace=True)

    return new_df

# Aggregate and combine actions taken columns
print("Aggregating actions taken...")
session_actions = df_sessions.loc[:,['user_id', 'action', 'action_type', 'action_detail']]
columns_to_convert = ['action', 'action_type', 'action_detail']
session_actions = session_actions.fillna('not provided')
first = True

for column in columns_to_convert:
    print("Converting " + column + " column...")
    current_data = convert_to_counts(df=session_actions, id_col='user_id', column_to_convert=column)

# If first loop, current data becomes existing data, otherwise merge existing and current
if first:
    first = False
    actions_data = current_data
else:
    actions_data = pd.concat([actions_data, current_data], axis=1, join='inner')

# Finally, Combine Data Sets
# [4.1] Merge device datasets - First, combine the two device dataframes (df_primary and df_secondary) to create a device dataframe.
print("Combining results...")
df_sessions_primary.set_index('user_id', inplace=True)
df_sessions_secondary.set_index('user_id', inplace=True)
device_data = pd.concat([df_sessions_primary, df_sessions_secondary], axis=1, join="outer")

# [4.2] Merge device and actions datasets - Then, combine the device dataframe with the actions dataframe to create a sessions dataframe with all the features extracted from sessions.csv
combined_results = pd.concat([device_data, actions_data], axis=1, join='outer')
df_sessions_complete = combined_results.fillna(0)

# [4.3] Merge user and session datasets - Finally, combine the sessions dataframe with the user data dataframe computed earlier
df_all.set_index('id', inplace=True)
df_all = pd.concat([df_all, df_sessions_complete], axis=1, join='inner')
print("Combining results...completed")

df_all.head() # You need get 5 rows × 349 columns

One Hot Encoding categorical data...
One Hot Encoding categorical data...completed
Adding new fields...
Adding new fields...completed
Droping fields...
Droping fields...completed
Reading sessions data...
Reading sessions data...completed
Determing primary device...
Determing primary device...completed
Determing secondary device...
Determing secondary device...completed
Aggregating actions taken...
Converting action column...
Converting action_type column...
Converting action_detail column...
Combining results...
Combining results...completed


Unnamed: 0,age,gende_unknown,gende_male,gende_female,gende_other,signu_facebook,signu_basic,signu_google,signu_weibo,signu_0,...,action_detail_view_resolutions,action_detail_view_search_results,action_detail_view_security_checks,action_detail_view_user_real_names,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips
d1mm9tcy42,62.0,0,1,0,0,0,1,0,0,1,...,0.0,23.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0
yo8nz8bqcq,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4grx6yxeby,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ncf87guaf0,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,32.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
4rvqpxoh3h,-1.0,1,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df_all.shape[1]

349

In [10]:
df_train1=df_train
df__test1=df_test
df_all1=df_all
df_train1.set_index("id",inplace=True)
df_train1=pd.concat([df_train1["country_destination"],df_all1],axis=1,join="inner")


In [11]:
df_train1.shape[1]

350

In [12]:
df_train1.shape

(73815, 350)

In [13]:
df_train1.head()

Unnamed: 0,country_destination,age,gende_unknown,gende_male,gende_female,gende_other,signu_facebook,signu_basic,signu_google,signu_weibo,...,action_detail_view_resolutions,action_detail_view_search_results,action_detail_view_security_checks,action_detail_view_user_real_names,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips
d1mm9tcy42,other,62.0,0,1,0,0,0,1,0,0,...,0.0,23.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0
yo8nz8bqcq,NDF,-1.0,1,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4grx6yxeby,NDF,-1.0,1,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ncf87guaf0,NDF,-1.0,1,0,0,0,0,1,0,0,...,0.0,32.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
4rvqpxoh3h,GB,-1.0,1,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.preprocessing import LabelEncoder

id_train=df_train1.index.values
labels=df_train1["country_destination"]

le=LabelEncoder()
y=le.fit_transform(labels)
x=df_train1.drop("country_destination",axis=1,inplace=False)

In [15]:
x.shape

(73815, 349)

In [20]:
x.head(10)

Unnamed: 0,age,gende_unknown,gende_male,gende_female,gende_other,signu_facebook,signu_basic,signu_google,signu_weibo,signu_0,...,action_detail_view_resolutions,action_detail_view_search_results,action_detail_view_security_checks,action_detail_view_user_real_names,action_detail_wishlist,action_detail_wishlist_content_update,action_detail_wishlist_note,action_detail_your_listings,action_detail_your_reservations,action_detail_your_trips
d1mm9tcy42,62.0,0,1,0,0,0,1,0,0,1,...,0.0,23.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0
yo8nz8bqcq,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4grx6yxeby,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ncf87guaf0,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,32.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
4rvqpxoh3h,-1.0,1,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c8mfesvkv0,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xwxei6hdk4,32.0,0,0,1,0,1,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5f45ro5uzk,-1.0,1,0,0,0,0,1,0,0,1,...,0.0,7.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0
ro2stddszp,19.0,1,0,0,0,0,1,0,0,1,...,0.0,5.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0
qtw88d9pbl,25.0,0,1,0,0,0,1,0,0,1,...,0.0,39.0,0.0,0.0,1.0,43.0,0.0,0.0,0.0,1.0


In [21]:
labels

d1mm9tcy42    other
yo8nz8bqcq      NDF
4grx6yxeby      NDF
ncf87guaf0      NDF
4rvqpxoh3h       GB
c8mfesvkv0      NDF
xwxei6hdk4       US
5f45ro5uzk      NDF
ro2stddszp    other
qtw88d9pbl      NDF
awiurksqr3       US
ucgks2fyez    other
j30uqo74v6      NDF
cuxu2kbaep      NDF
jrqykh9y8x       FR
s9xrwtyzsq       US
1ltangt0hg      NDF
j0mufvhhoh      NDF
mde5wnphhf      NDF
11581i5wng       FR
8s14bomsx5      NDF
ayguekg4tf      NDF
vyv9bj70kd      NDF
zc77z91crs      NDF
7s5yhmyxmj      NDF
toga865pvz      NDF
oa8oz6sj6s       US
i7svun9mus      NDF
w19d8pogt0      NDF
7105aijtcm      NDF
              ...  
c98s3h7kgj      NDF
ytmpiwb8hj      NDF
3dx1jk6yk2      NDF
hcfj07iowv      NDF
l1f71f9vsj      NDF
15bj4ahmhf      NDF
qwpybxfjdl      NDF
k4t61wuvyq      NDF
mhh7b52z44      NDF
79wk7k2k5t      NDF
ftwmocvwlq      NDF
rg7ayg1tob      NDF
2f24umzkuv      NDF
or77n2ojuj      NDF
0a5bnb9bs4      NDF
6fzrn49sfn      NDF
r0jq0devgy      NDF
v5lq9bj8gv      NDF
msucfwmlzc       US


In [22]:
y

array([11,  7,  7, ...,  7,  7,  7])

# APPROACH 01

In [24]:
from sklearn import cross_validation
##splitting of training dataset into 70% training and 30% test data
features_train,features_test,labels_train,labels_test=cross_validation.train_test_split(x,y,test_size=0.3,random_state=42)

In [28]:
##decision Tree
from sklearn import tree
clf=tree.DecisionTreeClassifier()
clf.fit(features_train,labels_train)
prediction=clf.predict(features_test)
prediction

from sklearn.metrics import accuracy_score
print (accuracy_score(prediction,labels_test))

0.574396026191


In [36]:
##GAUSION NAIVE BAYES
from sklearn.naive_bayes import GaussianNB
clf=GaussianNB()
clf.fit(features_train,labels_train)
prediction=clf.predict(features_test)
prediction

from sklearn.metrics import accuracy_score
print (accuracy_score(prediction,labels_test))

0.602077218334


In [35]:
##svm
#from sklearn import svm
#clf=svm.SVC(kernel="")

# XGBOOST

In [34]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import decomposition, grid_search

#Grid Search - Used to find the best combination of parameters
XGB_model = xgb.XGBClassifier(objective = 'multi:softprob', subsample=0.5, colsample_bytree=0.5, seed = 0)
#softprob : gives probabily whether it belongs to a class
param_grid = {'max_depth':[5], 'learning_rate':[0.1], 'n_estimators':[5]}
model = grid_search.GridSearchCV(estimator = XGB_model, param_grid = param_grid, scoring = 'accuracy', verbose = 10, n_jobs = 1, iid = True, refit = True, cv=3)

model.fit(features_train, labels_train)
print ("Best score: %0.3f" % model.best_score_)
print ("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


    
    
    
    

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] learning_rate=0.1, max_depth=5, n_estimators=5 ..................
[CV]  learning_rate=0.1, max_depth=5, n_estimators=5, score=0.690272 -   6.8s
[CV] learning_rate=0.1, max_depth=5, n_estimators=5 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.8s remaining:    0.0s


[CV]  learning_rate=0.1, max_depth=5, n_estimators=5, score=0.679981 -   6.5s
[CV] learning_rate=0.1, max_depth=5, n_estimators=5 ..................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.4s remaining:    0.0s


[CV]  learning_rate=0.1, max_depth=5, n_estimators=5, score=0.686433 -   7.3s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   20.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   20.7s finished


Best score: 0.686
Best parameters set:
	learning_rate: 0.1
	max_depth: 5
	n_estimators: 5


In [42]:
from sklearn.metrics import log_loss

#Gradient boosting
#xgb = XGBClassifier(max_depth = 5, learning_rate = 0.1, n_estimators = 5, objective = 'multi:softprob', seed = 0)
#xgb.fit(features_train, labels_train, verbose = False)
#y_gb = xgb.predict_proba(features_test)
y_gb = model.predict_proba(features_test)
print ('{:20s} {:2s} {:1.7f}'.format('XGB_Reg:', 'logloss =>', log_loss(labels_test, y_gb)))

XGB_Reg:             logloss => 1.6335446
