In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from datetime import datetime


## Reading data

In [2]:
training_data = pd.read_csv("train_users_2.csv")
training_data.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


## Training Data cleaning 

  #### Checking which columns contains missing values 

In [3]:
training_data.isnull().any()

id                         False
date_account_created       False
timestamp_first_active     False
date_first_booking          True
gender                     False
age                         True
signup_method              False
signup_flow                False
language                   False
affiliate_channel          False
affiliate_provider         False
first_affiliate_tracked     True
signup_app                 False
first_device_type          False
first_browser              False
country_destination        False
dtype: bool

  #### - Removing outliers from age column
  #### - Filling NAs with mean

In [4]:
training_data['age'].fillna(0,inplace=True)

training_data = training_data.loc[((training_data['age']<95) & (training_data['age']>14) | (training_data['age']==0))]

training_data['age'].replace(0, training_data.loc[-(training_data['age']==0),'age'].mean(),inplace = True)
training_data['age'] = training_data['age'].astype(int)

training_data.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,36,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


  #### - Filling NAs in first_affiliate_tracked with the most frequent 

In [5]:
training_data['first_affiliate_tracked'].fillna(training_data['first_affiliate_tracked'].value_counts().index[0], inplace=True)


 #### - Removing features not going to be used

In [6]:
training_data.drop(['date_account_created','timestamp_first_active','date_first_booking','signup_app'], axis=1,inplace = True)

training_data.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,country_destination
0,gxn3p5htnn,-unknown-,36,facebook,0,en,direct,direct,untracked,Mac Desktop,Chrome,NDF
1,820tgsjxq7,MALE,38,facebook,0,en,seo,google,untracked,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,FEMALE,56,basic,3,en,direct,direct,untracked,Windows Desktop,IE,US
3,bjjt8pjhuk,FEMALE,42,facebook,0,en,direct,direct,untracked,Mac Desktop,Firefox,other
4,87mebub9p4,-unknown-,41,basic,0,en,direct,direct,untracked,Mac Desktop,Chrome,US


In [7]:
temp = training_data.copy()

## Modeling Starts here
   - First we run base model by the features in the training_data 
##### Apply one hot encoding 

In [8]:
y_train = training_data['country_destination']
training_data.drop(['country_destination','id'],axis=1,inplace=True)
X_train = pd.get_dummies(training_data)
X_train.head()

Unnamed: 0,age,signup_flow,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_method_basic,signup_method_facebook,signup_method_google,language_ca,...,first_browser_SeaMonkey,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
0,36,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,38,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,56,3,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,42,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,41,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

models = [('LR', LogisticRegression()),
         ('KNN', KNeighborsClassifier()),
         ('CART', DecisionTreeClassifier()),
         ('NB', GaussianNB()),
         ('RF', RandomForestClassifier())]
seed = 1073
results = []
names = []
scoring = 'accuracy'
X = X_train
Y = y_train
for name, model in models:
    kfold = KFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

  from numpy.core.umath_tests import inner1d


LR: 0.698676 (0.020771)
KNN: 0.654564 (0.019924)
CART: 0.584955 (0.038180)
NB: 0.004768 (0.001203)
RF: 0.626873 (0.017825)


##### We can see that (Random forest, Logistic Regression and KNN) have much better accuracy than other classifiers

 #### - Now we Try adding extracted features from the sessions file

##### Loading sessions features

In [9]:
sessions = pd.read_csv('session_features.csv')
sessions.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,523,524,525,526,527,528,529,530,531,secs_elapsed
0,d1mm9tcy42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.33163,0.526459,0.030854,0.668446,0.0,0.0,0.015038,0.136705,0.38233,3427529.0
1,yo8nz8bqcq,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.235235,0.640172,0.0,0.256297,0.0,0.0,0.213342,0.0,0.650877,207842.0
2,4grx6yxeby,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.202138,0.207316,0.0,0.0,0.0,0.404183,0.803738,0.102759,1135444.0
3,ncf87guaf0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.802575,0.432277,0.0,0.368902,0.0,0.0,0.011373,0.116311,0.138791,3755100.0
4,4rvqpxoh3h,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2555.0


In [10]:
training_data_temp2 = temp.copy()

In [12]:
temp.head()

Unnamed: 0_level_0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,country_destination
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
gxn3p5htnn,-unknown-,36,facebook,0,en,direct,direct,untracked,Mac Desktop,Chrome,NDF
820tgsjxq7,MALE,38,facebook,0,en,seo,google,untracked,Mac Desktop,Chrome,NDF
4ft3gnwmtx,FEMALE,56,basic,3,en,direct,direct,untracked,Windows Desktop,IE,US
bjjt8pjhuk,FEMALE,42,facebook,0,en,direct,direct,untracked,Mac Desktop,Firefox,other
87mebub9p4,-unknown-,41,basic,0,en,direct,direct,untracked,Mac Desktop,Chrome,US


##### Merging all features 

In [15]:
# temp.set_index('id',inplace = True)
# y_train = temp['country_destination']
# temp.drop(['country_destination'],axis=1,inplace=True)
# X_train = pd.get_dummies(temp)

# sessions = sessions.rename(columns = {'Unnamed: 0': 'id'})
# X_train.reset_index(inplace = True)
# final_X_train = pd.merge(X_train, sessions, how = 'left', on = 'id')


final_X_train.set_index('id',inplace = True)
final_X_train['country_distnation'] = y_train
final_X_train.head()

Unnamed: 0_level_0,age,signup_flow,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_method_basic,signup_method_facebook,signup_method_google,language_ca,...,524,525,526,527,528,529,530,531,secs_elapsed,country_distnation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gxn3p5htnn,36,0,1,0,0,0,0,1,0,0,...,,,,,,,,,,NDF
820tgsjxq7,38,0,0,0,1,0,0,1,0,0,...,,,,,,,,,,NDF
4ft3gnwmtx,56,3,0,1,0,0,1,0,0,0,...,,,,,,,,,,US
bjjt8pjhuk,42,0,0,1,0,0,0,1,0,0,...,,,,,,,,,,other
87mebub9p4,41,0,1,0,0,0,1,0,0,0,...,,,,,,,,,,US


 #### - Filling NAs sessions with -999
 #### - Remove id column 

In [18]:
final_X_train.reset_index(inplace = True)
final_X_train.drop('id', axis=1,inplace = True)
final_X_train.fillna(-999, inplace = True)
y_train = final_X_train['country_distnation']
final_X_train.drop('country_distnation', axis=1,inplace = True)
final_X_train.head()

Unnamed: 0,age,signup_flow,gender_-unknown-,gender_FEMALE,gender_MALE,gender_OTHER,signup_method_basic,signup_method_facebook,signup_method_google,language_ca,...,523,524,525,526,527,528,529,530,531,secs_elapsed
0,36,0,1,0,0,0,0,1,0,0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
1,38,0,0,0,1,0,0,1,0,0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,56,3,0,1,0,0,1,0,0,0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,42,0,0,1,0,0,0,1,0,0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,41,0,1,0,0,0,1,0,0,0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


#### - Unfortunatly My machine doesn't have the computional power to run the K-fold and try different classifiers on the new training with 661 features 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score,precision_score


models = [('LR', LogisticRegression()),
         ('KNN', KNeighborsClassifier()),
         ('CART', DecisionTreeClassifier()),
         ('NB', GaussianNB()),
         ('RF', RandomForestClassifier())]
seed = 1073
results = []
names = []
scoring = 'accuracy'
X = final_X_train
Y = y_train
for name, model in models:
    kfold = KFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

## Hyper Parameter Tunining

In [30]:
X,X_test,y,y_test=train_test_split(final_X_train,y_train,random_state=0)

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score,precision_score


clf = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=80,max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
clf.fit(X,y)
print(clf.score(X_test,y_test))
print(recall_score(y_test,clf.predict(X_test),average='micro'))

0.6295256699603619
0.6295256699603619


In [44]:
clf = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=76, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
clf.fit(X,y)
print(clf.score(X_test,y_test))
print(precision_score(y_test,clf.predict(X_test),average=None))
print(recall_score(y_test,clf.predict(X_test),average=None))


0.6290894608075559


  'precision', 'predicted', average, warn_for)


[0.         0.         0.         0.         0.         0.
 0.         0.64628387 0.         0.         0.53936032 0.        ]
[0.         0.         0.         0.         0.         0.
 0.         0.93199075 0.         0.         0.29546777 0.        ]


#### - We can see from (percision and recall) scores that the imbalanced data cuased trouble
#### - Hyper parameter tuning should be automated by running a grid search but the same problem of machine computational power 
#### - Here's the grid search code 

In [None]:
import pprint
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 250, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 7)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, Y)

print( rf_random.best_estimator_ )
print( rf_random.best_score_ )
print( rf_random.best_params_ )

### Downsampling for the training data
- 2000 sample from each class label 
- if the class label count < 2000, then all records with it is included

- After trying downsampling the model underfits due to small number of training data 

In [19]:
# X_train_sample = pd.DataFrame()

# for x in set(final_X_train['country_distnation']):
#     if len(final_X_train.loc[final_X_train['country_distnation'] == x,'country_distnation'])>2000:
#         X_train_sample= X_train_sample.append(final_X_train[final_X_train['country_distnation'] == x].sample(n=2000))
#     else :
#         X_train_sample=X_train_sample.append(final_X_train[final_X_train['country_distnation'] == x])
# X_train_sample.head()

In [20]:
# X_train_sample['country_distnation'].value_counts()
# y_train_sample = X_train_sample['country_distnation']
# X_train_sample.drop('country_distnation', axis=1,inplace = True)