In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
%matplotlib inline

# Fixed seed so the train/validation split is identical on every
# Restart Kernel -> Run All.
SEED = 42

# Encoder that maps destination labels to integer class ids.
le = LabelEncoder()

# Training set: every column except the target is used as a feature.
df_train = pd.read_csv('data/train.csv', index_col='id')
id_train = list(df_train.index)
X_train = df_train.drop('country_destination', axis=1).values
y_train = le.fit_transform(df_train['country_destination'].values)

# Test set: no target column, so the whole frame is the feature matrix.
df_test = pd.read_csv('data/test.csv', index_col='id')
id_test = list(df_test.index)
X_test = df_test.values

# Count classes from the encoder fitted on the full label set, so a rare
# class that ends up only in the validation split is still counted.
num_class = len(le.classes_)
features = df_test.columns

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SEED)

# Training

In [None]:
def ndcg_score(y_true, y_pred, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank ``k``.

    Each sample has exactly one relevant class (its true label), so the
    per-sample score is ``1 / log2(rank + 1)`` where ``rank`` is the 1-based
    position of the true class when classes are sorted by descending score,
    or 0 when that rank is greater than ``k``.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        True class for each sample, given as the column index of that class
        in ``y_pred``.
    y_pred : array-like, shape (n_samples, n_classes)
        Score (e.g. predicted probability) of every class for each sample.
    k : int, default 5
        Only true classes ranked within the top ``k`` contribute gain.

    Returns
    -------
    float
        Mean per-sample gain over all samples.

    Raises
    ------
    IndexError
        If a value of ``y_true`` is not a valid column index of ``y_pred``.
    ZeroDivisionError
        If there are no samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = y_true.shape[0]
    n_classes = y_pred.shape[1]

    # Class indices per row, sorted by ascending score (best class is last).
    order = np.argsort(y_pred)

    gain = 0.
    for i in range(n_samples):
        # Position in the ascending order -> 1-based rank from the top.
        rank = n_classes - np.argwhere(order[i] == y_true[i])[0][0]
        if rank <= k:
            gain += 1 / np.log2(rank + 1)
    return gain / n_samples

def sklearn_ndcg(estimator, X, y):
    """Scorer callable with the ``(estimator, X, y)`` signature.

    Asks ``estimator`` for class probabilities on ``X`` and returns their
    ``ndcg_score`` against the labels ``y``.
    """
    return ndcg_score(y, estimator.predict_proba(X))

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

# Candidate estimators and their hyper-parameter grids.
# Three tree ensembles are configured below; only the extra-trees pair
# (et, param_grid_et) is handed to the grid search at the end of this cell.

# Random forest
rf = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=4,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None
)

param_grid_rf = dict(
    n_estimators=[50, 100],
    criterion=['gini', 'entropy'],
    max_depth=[5, 10],
    class_weight=[None, 'balanced']
)

# Extra trees
et = ExtraTreesClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    bootstrap=False,
    oob_score=False,
    n_jobs=4,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None
)

param_grid_et = dict(
    n_estimators=[50, 100],
    criterion=['gini', 'entropy'],
    max_depth=[5, 10],
    class_weight=[None, 'balanced']
)

# Gradient boosting
gbm = GradientBoostingClassifier(
    loss='deviance',
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
    init=None,
    random_state=None,
    max_features=None,
    verbose=0,
    max_leaf_nodes=None,
    warm_start=False,
    presort='auto'
)

# learning_rate ([0.05, 0.1, 0.2]) is currently left out of the search.
param_grid_gbm = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5]
)

# Exhaustive search over the extra-trees grid, scored with the custom NDCG
# scorer; error_score='raise' makes a failing fit abort the search.
clf = GridSearchCV(
    et,
    param_grid_et,
    scoring=sklearn_ndcg,
    fit_params=None,
    n_jobs=4,
    iid=True,
    refit=True,
    cv=None,
    verbose=0,
    pre_dispatch='2*n_jobs',
    error_score='raise'
)

clf.fit(X_train, y_train)

# Make submission

In [None]:
# Class probabilities for every test row, one column per class.
y_pred = clf.predict_proba(X_test)

# Submit up to 5 destinations per user, capped by the number of classes
# actually predicted so the id and country columns always line up.
top_k = min(5, y_pred.shape[1])

# Column indices of the top_k most probable classes per row, best first.
# One vectorised argsort replaces the per-row Python loop.
top_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :top_k]

# Each id is repeated once per predicted destination, in the same row order.
ids = np.repeat(id_test, top_k).tolist()
# NOTE(review): assumes the columns of y_pred follow the encoder's class
# order (true when every class appears in the training data) -- confirm.
cts = le.inverse_transform(top_idx.ravel()).tolist()

# Generate submission
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('out/submission.csv', index=False)