In [1]:
#LOADING LIBRARIES
%matplotlib inline
#%pylab inline

import pandas as pd
#import pylab as pl
import numpy as np
import matplotlib as plt
import sklearn
import time
import gc

from sklearn import cross_validation, metrics, neighbors
from sklearn.preprocessing import scale, LabelBinarizer
from sklearn.grid_search import GridSearchCV
from scipy.stats import gaussian_kde
from sklearn.svm import SVC, NuSVC, LinearSVC, SVR, NuSVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, make_scorer, roc_auc_score
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA, RandomizedPCA, TruncatedSVD
from sklearn.linear_model import LinearRegression, Ridge, RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.datasets import load_svmlight_file
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
from scipy import sparse
from multiprocessing import Pool
from sklearn.feature_selection import RFE


# Notebook-wide runtime setup: fixed NumPy seed, garbage collector switched on,
# and pandas display limits.
np.random.seed(455)
gc.enable()

# Cap how much of a DataFrame is rendered: at most 20 rows and 200 columns.
for option_name, option_value in (('display.max_rows', 20),
                                  ('display.max_columns', 200)):
    pd.set_option(option_name, option_value)




In [2]:
# Load the raw train/test user tables and stack them into one frame (df_all).

import os
import glob
#print (glob.glob("../kaggle_data/*.csv"))

df_train = pd.read_csv('../kaggle_data/train_users_2.csv')
df_test = pd.read_csv('../kaggle_data/test_users.csv')

# Keep what later cells need before the per-split frames are released:
# the training targets, the test ids and the row count of each split.
labels = df_train['country_destination'].values
id_test = df_test['id']
piv_train, piv_test = len(df_train), len(df_test)
print ("Total users in train: ",piv_train)
print ("Total users in test: ",piv_test)

# Placeholder target for the test rows.
# NOTE(review): presumably the test CSV ships without this column -- confirm.
df_test['country_destination'] = -1

# Train rows first, then test rows, re-indexed from 0; every NaN becomes -1.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True).fillna(-1)

del df_train, df_test

# Preview of the combined frame (last expression, so the notebook renders it).
df_all.head()

Total users in train:  213451
Total users in test:  62096


Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,-1,-unknown-,-1,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,-1,MALE,38,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [3]:
#####Feature engineering#######

# date_account_created: split each 'YYYY-MM-DD' value into integer year/month/day.
dac = np.array([[int(piece) for piece in str(created).split('-')]
                for created in df_all.date_account_created])
for position, part in enumerate(('year', 'month', 'day')):
    df_all['dac_' + part] = dac[:, position]

# timestamp_first_active: cut the digit string into six fixed-width fields
# (4 + 2 + 2 + 2 + 2 + 2 characters); only the first three become columns.
field_bounds = ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14))
tfa = np.array([[int(stamp[lo:hi]) for lo, hi in field_bounds]
                for stamp in df_all.timestamp_first_active.astype(str)])
for position, part in enumerate(('year', 'month', 'day')):
    df_all['tfa_' + part] = tfa[:, position]

# Age: anything below 14 or above 100 is replaced by -1.
age_values = df_all.age.values
implausible = (age_values < 14) | (age_values > 100)
df_all['age'] = np.where(implausible, -1, age_values)

# One-hot-encoding features: each listed column is replaced by its indicator
# columns, appended on the right in the order the features are listed.
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
dummy_frames = [pd.get_dummies(df_all[feat], prefix=feat) for feat in ohe_feats]
df_all = pd.concat([df_all.drop(ohe_feats, axis=1)] + dummy_frames, axis=1)

# Remove the columns that are not fed to the model.
not_model_inputs = ['id', 'date_first_booking', 'country_destination',
                    'date_account_created', 'timestamp_first_active']
df_all = df_all.drop(not_model_inputs, axis=1)

#print ("\nVar names:\n",df_all.columns.values)
print ("new columns created")



new columns created


In [4]:
# Sanity-check the engineered feature frame: its dimensions, then a preview.

print (df_all.shape)

# The preview must be the cell's final expression: Jupyter renders only the
# value of the last expression, so a head() call placed earlier shows nothing.
df_all.head()


(275547, 161)


In [5]:
# Cut the stacked feature matrix back into its train and test parts.
vals = df_all.values
print (vals)

# The first piv_train rows came from the training table, the remainder from
# the test table (the order in which the two frames were concatenated).
X, X_test = vals[:piv_train], vals[piv_train:]

# Turn the country labels into integer class ids; `le` is kept so predictions
# can be mapped back to country codes later.
le = LabelEncoder()
le.fit(labels)
y = le.transform(labels)
print (y)


[[ -1.00000000e+00   2.01000000e+03   6.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  3.80000000e+01   2.01100000e+03   5.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  5.60000000e+01   2.01000000e+03   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 ..., 
 [ -1.00000000e+00   2.01400000e+03   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -1.00000000e+00   2.01400000e+03   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  4.90000000e+01   2.01400000e+03   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
[ 7  7 10 ...,  7  7  7]


In [6]:
# Random split of the labelled rows: 15% go to X_train / y_train.
# NOTE: this rebinds X_test (until now the rows of the test table) to the
# remaining part of the labelled data.
split_parts = train_test_split(X, y, train_size=0.15, random_state=222)
X_train, X_test, y_train, y_test = split_parts
print (X_train.shape)

(21345, 161)


In [21]:
### Feature selection based on RFE (recursive feature elimination)

model = LogisticRegression()

# Create the RFE model and keep the 25 strongest features.
start_time = time.time()

# n_features_to_select is passed by keyword: recent scikit-learn releases no
# longer accept it positionally, while older ones accept the keyword too.
rfe = RFE(model, n_features_to_select=25)
rfe = rfe.fit(X_train, y_train)
print ("RFE model created. Time passed:",time.time() - start_time, "\n\n")

# Summarize the selection:
#   support_ -- boolean mask, True for the features that were kept
#   ranking_ -- rank per feature; kept features have rank 1
print(rfe.support_)
print(rfe.ranking_)


RFE model created. Time passed: 156.63123607635498 


[False False False False False False False  True  True  True  True  True
  True False False False  True  True  True False False False  True False
 False False False False False False False False False False False False
 False False False  True False False False False False False False  True
 False False False False False False False False False False  True False
  True False  True False False False False False False False False False
 False False  True False False False False False False False False False
 False  True False False  True False  True  True False  True  True False
  True False False False False False False False False  True False False
 False False False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False Fal

In [22]:
# Boolean mask over the feature columns: True where the RFE rank is below 13.
sel_features = np.less(rfe.ranking_, 13)

# How many features pass the cut.
print (np.count_nonzero(sel_features))

36


In [20]:

#Classifier
# 40 minutes (author's estimate for a full run)


### Fitting model:

start_time = time.time()

xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0, nthread=-1)
xgb.fit(X_train[:,sel_features], y_train)

print ("model fitted // time: ",time.time()-start_time)
start_time = time.time()

### Predicting data:

# Score the rows of the test table, i.e. everything in `vals` after the first
# piv_train rows. X_test must not be used here: the train_test_split cell
# rebinds it to the hold-out part of the *training* data, whose rows do not
# correspond to the ids in id_test.
X_submit = vals[piv_train:]
assert X_submit.shape[0] == len(id_test), "feature rows and test ids are out of sync"

y_pred = xgb.predict_proba(X_submit[:,sel_features])
print ("model - predicting // time: ", time.time()-start_time)


#Taking the 5 classes with highest probabilities
# argsort is ascending, so reverse each row and keep the first five class ids.
top5 = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]
ids = np.repeat(id_test.values, 5).tolist()          #each id repeated 5 times
cts = le.inverse_transform(top5.ravel()).tolist()    #matching country codes, best first

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
# Create the target folder if needed so a finished run is not lost at the
# final write.
os.makedirs('../output', exist_ok=True)
sub.to_csv('../output/sub.csv',index=False)
print ("modeled data saved")


model fitted // time:  70.26761794090271
model - predicting // time:  60.80919599533081
modeled data saved
