In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
%matplotlib inline

# Encoder mapping country codes <-> integer class ids; reused later to decode predictions.
le = LabelEncoder()

# Training set: every column except the target is used as a feature.
df_train = pd.read_csv('data/train.csv', index_col='id')
id_train = list(df_train.index)
X_train = df_train.drop('country_destination', axis=1).values
y_train = le.fit_transform(df_train['country_destination'].values)

# Test set: assumed to have the same feature columns as the training set, minus the
# target -- TODO confirm against the preprocessing step that wrote these CSVs.
df_test = pd.read_csv('data/test.csv', index_col='id')
id_test = list(df_test.index)
X_test = df_test.values

# Count classes from the fitted encoder, i.e. from the FULL label set. Counting them
# from y_train after the split would under-count whenever a rare class happens to
# fall entirely into the validation part.
num_class = len(le.classes_)
features = df_test.columns

# Hold out 20% for validation. The seed is fixed so the split (and therefore every
# validation score) is reproducible after a kernel restart.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Training

In [17]:
def ndcg_score(y_true, y_pred, k=5):
    """Mean normalized discounted cumulative gain (NDCG) at rank ``k``.

    Every sample has exactly one relevant class, so the ideal DCG is 1 and the
    per-sample NDCG reduces to ``1 / log2(rank + 1)``, where ``rank`` is the
    1-based position of the true class when classes are ordered by descending
    score. Samples whose true class is ranked below ``k`` contribute 0.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Integer-encoded true class of each sample (a column index of ``y_pred``).
    y_pred : array-like, shape (n_samples, n_classes)
        Score / probability predicted for every class of every sample.
    k : int, optional (default=5)
        Rank cut-off.

    Returns
    -------
    float
        NDCG@k averaged over all samples.

    Raises
    ------
    ValueError
        If there are no samples.
    IndexError
        If a true label does not correspond to any column of ``y_pred``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = y_true.shape[0]
    if n_samples == 0:
        raise ValueError('ndcg_score requires at least one sample')
    n_classes = y_pred.shape[1]

    # Ascending argsort: the best-scored class of each row ends up in the LAST column.
    order = np.argsort(y_pred, axis=1)
    hits = order == y_true[:, np.newaxis]
    if not hits.any(axis=1).all():
        raise IndexError('y_true contains a label that is not a column of y_pred')

    # argmax over a boolean row returns the column holding the true class;
    # converting from the ascending position gives rank 1 for the top prediction.
    rank = n_classes - hits.argmax(axis=1)
    gains = np.where(rank <= k, 1.0 / np.log2(rank + 1.0), 0.0)
    return float(gains.sum() / n_samples)

def sklearn_ndcg(estimator, X, y):
    """Scorer callable with the ``(estimator, X, y)`` signature.

    Asks ``estimator`` for class probabilities on ``X`` and returns their
    NDCG against the true labels ``y`` as computed by ``ndcg_score``.
    """
    return ndcg_score(y, estimator.predict_proba(X))

In [29]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

# Three candidate estimators, each with its own search grid.
# NOTE(review): only `gbm` / `param_grid_gbm` are wired into the GridSearchCV at the
# bottom of this cell; `rf`, `et` and their grids are defined but not used here.

# Random forest: shallow trees (max_depth=5), 4 parallel jobs, unseeded (random_state=None).
rf = RandomForestClassifier(n_estimators=50, criterion='gini', max_depth=5, min_samples_split=2, 
                             min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', 
                             max_leaf_nodes=None, bootstrap=True, oob_score=False, 
                             n_jobs=4, random_state=None, verbose=0, warm_start=False, class_weight=None)

# Search space for the random forest: number of trees x tree depth (9 combinations).
param_grid_rf = {'n_estimators': [100, 200, 500],
                  #'criterion': 'gini',
                 'max_depth': [5, 10, 15]
                 #'class_weight': None
                }

# Extra-trees: unbounded depth, no bootstrap, 4 parallel jobs, unseeded.
et = ExtraTreesClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=False, 
                          oob_score=False, n_jobs=4, random_state=None, verbose=0, warm_start=False, class_weight=None)

# Search space for extra-trees: number of trees x tree depth (12 combinations).
param_grid_et = {'n_estimators': [25, 50, 75, 100],
                 #'criterion': ['gini', 'entropy'],
                 'max_depth': [5, 10, 15]
                 #,'class_weight': [None, 'balanced']
                 }


# Gradient boosting: 100 depth-3 trees at learning rate 0.1, unseeded.
gbm = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, 
                                 min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                 max_depth=3, init=None, random_state=None, max_features=None, verbose=0, 
                                 max_leaf_nodes=None, warm_start=False, presort='auto')

# Search space for gradient boosting: learning rate x number of trees x depth (12 combinations).
param_grid_gbm = {'learning_rate': [0.1, 0.3],
                  'n_estimators': [50, 100, 200],
                  'max_depth': [3, 5]}


# Grid search over the GBM grid, scored with the custom NDCG scorer `sklearn_ndcg`.
# cv=None leaves the fold strategy to the library default; error_score='raise' makes
# a failing fit abort the search instead of being recorded as a score.
clf = GridSearchCV(gbm, param_grid_gbm, scoring=sklearn_ndcg, fit_params=None, n_jobs=12, iid=True, 
                   refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise')

# Runs the whole search on the training split only (X_val is not passed in).
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Display the parameter combination selected by the grid search.
clf.best_params_

In [None]:
from sklearn.metrics import roc_curve, auc

# Class probabilities of the grid-searched estimator on the held-out validation split.
y_pred = clf.predict_proba(X_val)

# NDCG on the validation split.
# NOTE(review): this call needs the (y_true, y_pred) flavour of ndcg_score; a second
# ndcg_score(y_score, dtrain) defined further down shadows it once that cell has run.
ndcg_score(y_val, y_pred)

In [None]:
# Display the validation probabilities computed by the previous cell.
y_pred

# XGBoost

In [30]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
%matplotlib inline

# Encoder mapping country codes <-> integer class ids; reused later to decode predictions.
le = LabelEncoder()

# Training set: every column except the target is used as a feature.
df_train = pd.read_csv('data/train.csv', index_col='id')
id_train = list(df_train.index)
X_train = df_train.drop('country_destination', axis=1).values
y_train = le.fit_transform(df_train['country_destination'].values)

# Test set: assumed to have the same feature columns as the training set, minus the
# target -- TODO confirm against the preprocessing step that wrote these CSVs.
df_test = pd.read_csv('data/test.csv', index_col='id')
id_test = list(df_test.index)
X_test = df_test.values

# Count classes from the fitted encoder, i.e. from the FULL label set. This value is
# handed to XGBoost as `num_class`; counting from y_train after the split would be
# too small whenever a rare class falls entirely into the validation part.
num_class = len(le.classes_)
features = df_test.columns

# Hold out 20% for validation. The seed is fixed so the split (and therefore every
# validation score) is reproducible after a kernel restart.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# XGBoost containers: labelled train / validation matrices and the unlabelled test matrix.
dval = xgb.DMatrix(X_val, label=y_val)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [31]:
def ndcg_score(y_score, dtrain, k=5):
    """XGBoost custom evaluation function: mean NDCG at rank ``k``.

    Every sample has exactly one relevant class, so the per-sample NDCG is
    ``1 / log2(rank + 1)``, where ``rank`` is the 1-based position of the true
    class when classes are ordered by descending score, and 0 when that rank
    is greater than ``k``.

    Parameters
    ----------
    y_score : array-like, shape (n_samples, n_classes)
        Predicted score / probability of every class for every sample.
    dtrain : object exposing ``get_label()``
        Data container whose ``get_label()`` returns the true class id of each
        sample (e.g. an ``xgboost.DMatrix``).
    k : int, optional (default=5)
        Rank cut-off; also used to build the metric name.

    Returns
    -------
    (str, float)
        Metric name (``'ndcg5'`` for the default ``k``) and the NDCG@k averaged
        over all samples.

    Raises
    ------
    ValueError
        If there are no samples.
    IndexError
        If a true label does not correspond to any column of ``y_score``.
    """
    y_score = np.asarray(y_score)
    y_true = np.asarray(dtrain.get_label())
    n_samples = y_true.shape[0]
    if n_samples == 0:
        raise ValueError('ndcg_score requires at least one sample')
    n_classes = y_score.shape[1]

    # Ascending argsort: the best-scored class of each row ends up in the LAST column.
    order = np.argsort(y_score, axis=1)
    # Labels may come back as floats; comparing them with the integer column
    # indices still matches exact class ids.
    hits = order == y_true[:, np.newaxis]
    if not hits.any(axis=1).all():
        raise IndexError('labels contain a class id that is not a column of y_score')

    # argmax over a boolean row returns the column holding the true class;
    # converting from the ascending position gives rank 1 for the top prediction.
    rank = n_classes - hits.argmax(axis=1)
    gains = np.where(rank <= k, 1.0 / np.log2(rank + 1.0), 0.0)
    return 'ndcg%d' % k, float(gains.sum() / n_samples)

In [32]:
import timeit
# (matrix, display name) pairs passed as `evals` when fitting the final booster.
evallist = list(zip((dtrain, dval), ('train', 'eval')))

# XGBoost parameters. Entries whose value is a list (eta, subsample,
# colsample_bytree) are the axes explored by the grid search; the rest are fixed.
params = {
    'max_depth': 3,
    'eta': [0.3, 0.5],
    'silent': 0,
    'gamma': 0,
    'subsample': [0.5, 0.8],
    'colsample_bytree': [0.5, 0.8],
    'nthread': 12,
    'objective': 'multi:softprob',
    'num_class': num_class,
    # 'eval_metric': 'ndcg@5'
}

# %load_ext line_profiler

def xgb_grid_search(param_dict, dtrain):
    """Perform grid search with cross-validation on XGBoost.

    Every entry of ``param_dict`` whose value is a ``list`` is treated as a
    search axis; all other entries are passed to XGBoost unchanged. Each
    combination of the search axes is scored with 5-fold ``xgb.cv`` using the
    module-level ``ndcg_score`` metric, then a final booster is trained on
    ``dtrain`` with the best combination, monitored on the module-level
    ``evallist``.

    Parameters
    ----------
    param_dict: dict
        parameters with (parameter_name, possible values list) pairs;
        it is NOT modified by this function
    dtrain: xgboost.DMatrix
        training data

    Returns
    -------
    xgboost.booster
        optimal model train on the whole training set
    dict
        optimal parameters dictionary (a copy of ``param_dict`` in which every
        searched list is replaced by its best value)
    pandas.Series
        scores for all parameters, stored in multi-indexed Series

    Raises
    ------
    ValueError
        If ``param_dict`` has no list-valued entry, or one of the lists is empty.
    """
    import itertools  # local import: only this function needs it

    # Search axes are taken from the argument itself, not from any global dict.
    search_param_names = [name for name in param_dict if isinstance(param_dict[name], list)]
    search_param_values = [param_dict[name] for name in search_param_names]
    # itertools.product enumerates combinations in the same order as
    # MultiIndex.from_product (last axis varies fastest), so position i of
    # `combos` corresponds to position i of `scores`.
    combos = list(itertools.product(*search_param_values)) if search_param_names else []
    if not combos:
        raise ValueError('param_dict must contain at least one non-empty list of values to search')

    index_params = pd.MultiIndex.from_product(search_param_values, names=search_param_names)
    scores = pd.Series(np.zeros(len(combos)), index=index_params)

    # Work on a copy so the caller's search space stays intact and re-usable.
    trial_params = dict(param_dict)
    for counter, combo in enumerate(combos, 1):
        start_time = timeit.default_timer()
        print('n iteration:' + str(counter))
        for name, value in zip(search_param_names, combo):
            trial_params[name] = value
            print('    ' + str(name) + '=' + str(value))
        train_scores = xgb.cv(trial_params, dtrain, num_boost_round=200, nfold=5, metrics=(), obj=None,
                              feval=ndcg_score, maximize=True, early_stopping_rounds=5, fpreproc=None,
                              as_pandas=True, show_progress=None, show_stdv=True, seed=0)
        # Score of the last reported round. NOTE(review): this equals the best round
        # only if xgb.cv truncates its result at the early-stopping point -- confirm
        # for the installed xgboost version.
        final_score = train_scores['test-ndcg5-mean'].iloc[-1]
        scores.iloc[counter - 1] = final_score
        elapsed = timeit.default_timer() - start_time
        print('    test score: {:.4f}'.format(final_score))
        print('    iteration time: {:.1f} (s)'.format(elapsed))

    # Take the winning values from the ORIGINAL lists (first maximum wins) instead of
    # rounding index entries, which would distort small values and turn ints into floats.
    best_combo = combos[int(np.argmax(scores.values))]
    best_params = dict(param_dict)
    best_params.update(zip(search_param_names, best_combo))

    bst = xgb.train(best_params, dtrain, num_boost_round=200, evals=evallist, obj=None, feval=ndcg_score,
                    maximize=True, early_stopping_rounds=10, evals_result=None, verbose_eval=True,
                    learning_rates=None, xgb_model=None)
    return bst, best_params, scores

# Run the grid search: returns the final booster, the selected parameters and the
# cross-validation score of every parameter combination.
bst, best_params, scores = xgb_grid_search(params, dtrain)

# Test-set predictions of the final booster (per-class probabilities, given the
# 'multi:softprob' objective set in `params`).
y_pred_xgb = bst.predict(dtest)  


Will train until cv error hasn't decreased in 5 rounds.
Stopping. Best iteration: 62


n iteration:1
    subsample=0.5
    eta=0.3
    colsample_bytree=0.5
    test score: 0.8252

Will train until cv error hasn't decreased in 5 rounds.
Stopping. Best iteration: 20



    iteration time: 683.4 (s)
n iteration:2
    subsample=0.5
    eta=0.3
    colsample_bytree=0.8
    test score: 0.8251

Will train until cv error hasn't decreased in 5 rounds.
Stopping. Best iteration: 13



    iteration time: 263.7 (s)
n iteration:3
    subsample=0.5
    eta=0.5
    colsample_bytree=0.5
    test score: 0.8247

Will train until cv error hasn't decreased in 5 rounds.
Stopping. Best iteration: 30



    iteration time: 190.3 (s)
n iteration:4
    subsample=0.5
    eta=0.5
    colsample_bytree=0.8
    test score: 0.8252

Will train until cv error hasn't decreased in 5 rounds.
Stopping. Best iteration: 40



    iteration time: 365.0 (s)
n iteration:5
    subsample=0.8
    eta=0.3
    colsample_bytree=0.5
    test score: 0.8252

Will train until cv error hasn't decreased in 5 rounds.
Stopping. Best iteration: 37



    iteration time: 460.0 (s)
n iteration:6
    subsample=0.8
    eta=0.3
    colsample_bytree=0.8
    test score: 0.8254

Will train until cv error hasn't decreased in 5 rounds.
Stopping. Best iteration: 29



    iteration time: 435.6 (s)
n iteration:7
    subsample=0.8
    eta=0.5
    colsample_bytree=0.5
    test score: 0.8252

Will train until cv error hasn't decreased in 5 rounds.
Stopping. Best iteration: 19



    iteration time: 350.6 (s)
n iteration:8
    subsample=0.8
    eta=0.5
    colsample_bytree=0.8
    test score: 0.8251

Will train until eval error hasn't decreased in 10 rounds.
[0]	train-ndcg5:0.823665	eval-ndcg5:0.822692
[1]	train-ndcg5:0.824293	eval-ndcg5:0.823644
[2]	train-ndcg5:0.824553	eval-ndcg5:0.823727
[3]	train-ndcg5:0.824826	eval-ndcg5:0.823965
[4]	train-ndcg5:0.824914	eval-ndcg5:0.824047
[5]	train-ndcg5:0.824939	eval-ndcg5:0.824004
[6]	train-ndcg5:0.824957	eval-ndcg5:0.824097
[7]	train-ndcg5:0.825131	eval-ndcg5:0.823951
[8]	train-ndcg5:0.825148	eval-ndcg5:0.824340
[9]	train-ndcg5:0.825241	eval-ndcg5:0.824410
[10]	train-ndcg5:0.825226	eval-ndcg5:0.824238



    iteration time: 254.5 (s)


Stopping. Best iteration:
[0]	train-ndcg5:0.823665	eval-ndcg5:0.822692



In [42]:
# Re-fit a booster with the parameters selected by the grid search, using the same
# xgb.train settings as the final fit inside xgb_grid_search (up to 200 rounds,
# NDCG@5 on `evallist`, early stopping after 10 rounds). This overwrites `bst`.
bst = xgb.train(best_params, dtrain, num_boost_round=200, evals=evallist, obj=None, feval=ndcg_score, maximize=True, 
                    early_stopping_rounds=10, evals_result=None, verbose_eval=True, learning_rates=None, 
                    xgb_model=None)

Will train until eval error hasn't decreased in 10 rounds.
[0]	train-ndcg5:0.823665	eval-ndcg5:0.822692
[1]	train-ndcg5:0.824293	eval-ndcg5:0.823644
[2]	train-ndcg5:0.824553	eval-ndcg5:0.823727
[3]	train-ndcg5:0.824826	eval-ndcg5:0.823965
[4]	train-ndcg5:0.824914	eval-ndcg5:0.824047
[5]	train-ndcg5:0.824939	eval-ndcg5:0.824004
[6]	train-ndcg5:0.824957	eval-ndcg5:0.824097
[7]	train-ndcg5:0.825131	eval-ndcg5:0.823951
[8]	train-ndcg5:0.825148	eval-ndcg5:0.824340
[9]	train-ndcg5:0.825241	eval-ndcg5:0.824410
[10]	train-ndcg5:0.825226	eval-ndcg5:0.824238
[11]	train-ndcg5:0.825249	eval-ndcg5:0.824333
[12]	train-ndcg5:0.825287	eval-ndcg5:0.824136
[13]	train-ndcg5:0.825389	eval-ndcg5:0.824335
[14]	train-ndcg5:0.825442	eval-ndcg5:0.824519
[15]	train-ndcg5:0.825506	eval-ndcg5:0.824521
[16]	train-ndcg5:0.825574	eval-ndcg5:0.824620
[17]	train-ndcg5:0.825677	eval-ndcg5:0.824594
[18]	train-ndcg5:0.825726	eval-ndcg5:0.824807
[19]	train-ndcg5:0.825830	eval-ndcg5:0.824782
[20]	train-ndcg5:0.825861	eval-

# Make submission

In [None]:
# Display the stored extra-trees test predictions.
# NOTE(review): `y_pred_et` is only assigned in a cell further down, so this cell
# fails on a fresh top-to-bottom run.
y_pred_et

In [27]:
# Validation NDCG of whichever estimator `clf` currently holds.
# NOTE(review): requires the (y_true, y_pred) flavour of ndcg_score; after the XGBoost
# section has run, the name refers to the (y_score, dtrain) evaluation function instead.
y_pred = clf.predict_proba(X_val)
ndcg_score(y_val, y_pred)

0.82288247047078722

In [24]:
# Test-set class probabilities stored as the extra-trees predictions.
# NOTE(review): `clf` is the shared grid-search object; this is only the extra-trees
# model if the search was last fitted with `et` -- confirm before relying on it.
y_pred_et = clf.predict_proba(X_test)

In [28]:
# Test-set class probabilities stored as the random-forest predictions.
# NOTE(review): `clf` is the shared grid-search object; this is only the random-forest
# model if the search was last fitted with `rf` -- confirm before relying on it.
y_pred_rf = clf.predict_proba(X_test)

In [37]:
# Display the stored extra-trees test predictions (one row per test user, one column per class).
y_pred_et

array([[  1.56361500e-03,   3.12907105e-03,   1.50842614e-03, ...,
          3.38819787e-04,   2.57373068e-01,   2.81499568e-02],
       [  5.19585445e-04,   1.07438354e-03,   5.13282677e-04, ...,
          2.02314865e-04,   9.30577704e-02,   1.73327046e-02],
       [  1.23251515e-03,   4.52208397e-03,   2.75123199e-03, ...,
          9.39470927e-04,   1.78719398e-01,   4.25390152e-02],
       ..., 
       [  2.08824540e-03,   2.62876159e-03,   2.87012405e-03, ...,
          1.83086139e-03,   1.83572866e-01,   3.16451484e-02],
       [  1.50838059e-03,   4.21871437e-03,   9.68479765e-04, ...,
          6.19536049e-04,   1.56616713e-01,   2.85101530e-02],
       [  1.79635168e-03,   7.69301012e-03,   1.50313065e-02, ...,
          1.05107334e-03,   4.55708604e-01,   5.02186441e-02]])

In [44]:
# Test-set predictions of the current XGBoost booster.
y_pred_xgb = bst.predict(dtest) 

In [49]:
# Equal-weight blend of the extra-trees and XGBoost test predictions.
# NOTE(review): the submission cell reassigns `y_pred` from `bst` alone, so this
# blend is not what gets written to the submission file.
y_pred = (y_pred_et +  y_pred_xgb) / 2

In [60]:
#Taking the 5 classes with highest probabilities

# Predictions used for the submission: the XGBoost booster alone.
y_pred = bst.predict(dtest)

TOP_K = 5  # number of countries submitted per user

ids = []  #list of ids
cts = []  #list of countries
for i, idx in enumerate(id_test):
    # Class ids sorted by descending score, cut to the best TOP_K, decoded to country codes.
    ranked = np.argsort(y_pred[i])[::-1][:TOP_K]
    tmp = le.inverse_transform(ranked).tolist()

    # If 'NDF' is among the top candidates, promote it to first place and keep the
    # relative order of the others. (A forward-copying shift loop would overwrite
    # its own source and duplicate the first country.)
    if 'NDF' in tmp:
        tmp.remove('NDF')
        tmp.insert(0, 'NDF')

    if i in (4,5):
        print(tmp)
    # Repeat the id once per emitted country so both lists stay aligned even when
    # there are fewer than TOP_K classes.
    ids += [idx] * len(tmp)
    cts += tmp

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('out/submission.csv',index=False)

['NDF', 'US', 'other', 'FR', 'IT']
['NDF', 'US', 'other', 'FR', 'IT']


In [52]:
# Display the submission id column (each test id repeated once per submitted country).
# NOTE(review): this dumps the whole list into the notebook output; a slice such as
# ids[:10] would show the same pattern.
ids

['5uwns89zht',
 '5uwns89zht',
 '5uwns89zht',
 '5uwns89zht',
 '5uwns89zht',
 'jtl0dijy2j',
 'jtl0dijy2j',
 'jtl0dijy2j',
 'jtl0dijy2j',
 'jtl0dijy2j',
 'xx0ulgorjt',
 'xx0ulgorjt',
 'xx0ulgorjt',
 'xx0ulgorjt',
 'xx0ulgorjt',
 '6c6puo6ix0',
 '6c6puo6ix0',
 '6c6puo6ix0',
 '6c6puo6ix0',
 '6c6puo6ix0',
 'czqhjk3yfe',
 'czqhjk3yfe',
 'czqhjk3yfe',
 'czqhjk3yfe',
 'czqhjk3yfe',
 'szx28ujmhf',
 'szx28ujmhf',
 'szx28ujmhf',
 'szx28ujmhf',
 'szx28ujmhf',
 'guenkfjcbq',
 'guenkfjcbq',
 'guenkfjcbq',
 'guenkfjcbq',
 'guenkfjcbq',
 'tkpq0mlugk',
 'tkpq0mlugk',
 'tkpq0mlugk',
 'tkpq0mlugk',
 'tkpq0mlugk',
 '3xtgd5p9dn',
 '3xtgd5p9dn',
 '3xtgd5p9dn',
 '3xtgd5p9dn',
 '3xtgd5p9dn',
 'md9aj22l5a',
 'md9aj22l5a',
 'md9aj22l5a',
 'md9aj22l5a',
 'md9aj22l5a',
 'gg3eswjxdf',
 'gg3eswjxdf',
 'gg3eswjxdf',
 'gg3eswjxdf',
 'gg3eswjxdf',
 'fyomoivygn',
 'fyomoivygn',
 'fyomoivygn',
 'fyomoivygn',
 'fyomoivygn',
 'iq4kkd5oan',
 'iq4kkd5oan',
 'iq4kkd5oan',
 'iq4kkd5oan',
 'iq4kkd5oan',
 '6k1xls6x5j',
 '6k1xls6x