In [1]:
#LOADING LIBRARIES
%matplotlib inline
#%pylab inline

import pandas as pd
#import pylab as pl
import numpy as np
import matplotlib as plt
import sklearn
import time
import gc

from sklearn import cross_validation, metrics, neighbors
from sklearn.preprocessing import scale, LabelBinarizer
from sklearn.grid_search import GridSearchCV
from scipy.stats import gaussian_kde
from sklearn.svm import SVC, NuSVC, LinearSVC, SVR, NuSVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, make_scorer, roc_auc_score
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA, RandomizedPCA, TruncatedSVD
from sklearn.linear_model import LinearRegression, Ridge, RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.datasets import load_svmlight_file
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
from scipy import sparse
from multiprocessing import Pool


# Display limits for rendered DataFrames: at most 20 rows and 200 columns.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 200

# Fix NumPy's global random state so runs are repeatable.
np.random.seed(455)

# Make sure the cyclic garbage collector is switched on for this session.
gc.enable()




In [2]:
#LOADING AND JOINING THE DATA

import os
import glob
# Uncomment to list the CSV files that are available:
#print (glob.glob("../kaggle_data/*.csv"))

# Read the labelled (train) and unlabelled (test) user tables.
train_users = pd.read_csv('../kaggle_data/train_users_2.csv')
test_users = pd.read_csv('../kaggle_data/test_users.csv')

# Values that later cells need once the two frames have been merged:
# the target labels, the test ids, and the row count of each part.
labels = train_users['country_destination'].values
id_test = test_users['id']
piv_train, piv_test = len(train_users), len(test_users)
print ("Total users in train: ",piv_train)
print ("Total users in test: ",piv_test)

# The test table has no target column; add a -1 placeholder before stacking.
test_users['country_destination'] = -1

# Stack train on top of test (train rows come first) and replace every NaN with -1.
df_all = pd.concat((train_users, test_users), axis=0, ignore_index=True).fillna(-1)

# The separate frames are no longer needed once df_all exists.
del train_users, test_users



Total users in train:  213451
Total users in test:  62096


In [3]:
#####Feature engineering#######
# Adds calendar features from the two date columns, masks implausible ages,
# one-hot encodes the categorical columns and drops the columns that are not
# fed to the model.
# NOTE: this cell rebinds df_all and removes the columns it reads from, so it
# can only be run once per execution of the loading cell.

#date_account_created: 'YYYY-MM-DD' strings -> integer [year, month, day] columns
dac = df_all.date_account_created.astype(str).str.split('-', expand=True).astype(int).values
df_all['dac_year'] = dac[:,0]
df_all['dac_month'] = dac[:,1]
df_all['dac_day'] = dac[:,2]


#timestamp_first_active: 'YYYYMMDDhhmmss' digits -> integer columns, sliced by position.
# All six fields are parsed into `tfa`; only year / month / day become features.
tfa_str = df_all.timestamp_first_active.astype(str)
tfa = np.column_stack([tfa_str.str[start:stop].astype(int).values
                       for start, stop in ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14))])
df_all['tfa_year'] = tfa[:,0]
df_all['tfa_month'] = tfa[:,1]
df_all['tfa_day'] = tfa[:,2]

#Age: values below 14 or above 100 are replaced by the same -1 marker used for missing data
av = df_all.age.values
df_all['age'] = np.where((av < 14) | (av > 100), -1, av)

#One-hot-encoding features
# A single get_dummies call drops each listed column and appends its indicator
# columns (named '<feature>_<value>') in list order, instead of copying the whole
# frame once per feature.
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
df_all = pd.get_dummies(df_all, columns=ohe_feats)


#Removing columns that won't be included into model
df_all = df_all.drop(['id', 'date_first_booking','country_destination','date_account_created',
                     'timestamp_first_active',], axis=1)

#print ("\nVar names:\n",df_all.columns.values)
print ("new columns created")


new columns created


In [4]:
# Splitting data into test and train
# df_all holds the training rows first, so the first piv_train rows of the
# feature matrix are the labelled users and the remaining rows are the test users.
vals = df_all.values
print (vals)
X, X_test = vals[:piv_train], vals[piv_train:]

# Turn the country labels into integer class ids; `le` is kept so that
# predicted class ids can be mapped back to country codes afterwards.
le = LabelEncoder()
y = le.fit_transform(labels)
print (y)


[[ -1.00000000e+00   2.01000000e+03   6.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  3.80000000e+01   2.01100000e+03   5.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  5.60000000e+01   2.01000000e+03   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 ..., 
 [ -1.00000000e+00   2.01400000e+03   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -1.00000000e+00   2.01400000e+03   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  4.90000000e+01   2.01400000e+03   9.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
[ 7  7 10 ...,  7  7  7]


In [11]:
# Hold-out split of the labelled data: 10% goes to X_train / y_train, the rest to
# X_valid / y_valid. The hold-out part is deliberately NOT called X_test: that name
# already holds the Kaggle test matrix (vals[piv_train:]), and rebinding it here
# would make any later prediction on X_test run on training rows instead.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.1, random_state=222)
print (X_train.shape)

(21345, 161)


In [5]:

#Classifier
# Fits the XGBoost model on the full labelled set, predicts class probabilities
# for the test users and writes the 5 most likely countries per user to a CSV.
# The original author noted a runtime of about 40 minutes for this cell.

# Take the test rows directly from `vals` so the submission does not depend on
# whatever `X_test` is currently bound to (that name may be reused elsewhere for
# a hold-out split of the training data).
X_submit = vals[piv_train:]
# Check the alignment before the long fit, not after it.
assert X_submit.shape[0] == len(id_test), "test feature rows must match the test ids one-to-one"

print ("fitting the mode")
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0, nthread=-1)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_submit)
print ("model fitted")


#Taking the 5 classes with highest probabilities
top_k = 5
# Row-wise argsort is ascending; reverse each row and keep the first top_k class ids.
top_classes = np.argsort(y_pred, axis=1)[:, ::-1][:, :top_k]
ids = np.repeat(id_test.values, top_k)               #each id repeated once per guess
cts = le.inverse_transform(top_classes.ravel())      #countries, best guess first for each id

#Generate submission
submission_path = '../output/sub.csv'
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
# Create the output folder if needed so the write cannot fail after the long fit.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sub.to_csv(submission_path,index=False)
print ("modeled data saved")


fitting the mode
model fitted
modeled data saved
