In [None]:
from time import time

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

In [None]:
# Read the raw train and test user tables from the working directory.
df_train = pd.read_csv('train_users_2.csv')
df_test = pd.read_csv('test_users.csv')

# Separate the target column from the training features.
labels = df_train['country_destination'].values
df_train = df_train.drop(columns=['country_destination'])

# Test ids are reused when the submission rows are built.
id_test = df_test['id']
# Row count of the training set: the boundary between train and test rows
# once both frames are stacked into one.
piv_train = len(df_train)

In [None]:
# Stack train and test into a single frame (fresh 0..n-1 index), drop the
# `id` and `date_first_booking` columns, and replace every remaining missing
# value with the sentinel -1.
df_all = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .drop(columns=['id', 'date_first_booking'])
    .fillna(-1)
)

In [None]:
# Convert date_account_created to pandas datetimes so the `.dt` accessor
# can be used on it.
df_all['date_account_created'] = df_all['date_account_created'].pipe(pd.to_datetime)

In [None]:
# Derive year / month / day columns from date_account_created, then discard
# the datetime column itself.
df_all = df_all.assign(
    dac_year=lambda frame: frame['date_account_created'].dt.year,
    dac_month=lambda frame: frame['date_account_created'].dt.month,
    dac_day=lambda frame: frame['date_account_created'].dt.day,
).drop(columns=['date_account_created'])

In [None]:
# Extracting day, month & year from timestamp_first_active

def trim_fraction(text):
    """Return `text` cut off at its last '.0' occurrence, or unchanged if none.

    Turns the string form of a whole-number float such as
    '20090319043255.0' into '20090319043255'.

    Note: the cut happens at the last '.0' found anywhere in the string, not
    only at a trailing one, so '1.05' becomes '1'.
    """
    cut = text.rfind('.0')
    return text if cut == -1 else text[:cut]

# timestamp_first_active is handled as a run of digits laid out as
# YYYYMMDDhhmmss.  Casting to int64 first removes any float ".0" suffix, and
# the `.str` accessor slices every row at once instead of looping over the
# frame one row at a time in Python.
tfa_digits = df_all['timestamp_first_active'].astype('int64').astype(str)

# One integer column per component: year, month, day, hour, minute, second.
# Only the first three are turned into features below.
tfa = np.column_stack([
    tfa_digits.str[start:stop].astype(int)
    for start, stop in ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14))
])

df_all['tfa_year'] = tfa[:, 0]
df_all['tfa_month'] = tfa[:, 1]
df_all['tfa_day'] = tfa[:, 2]
df_all = df_all.drop(['timestamp_first_active'], axis=1)

In [None]:
#Age
# Ages below 14 or above 132 are replaced by the sentinel -1; everything
# else is kept as is.
age_col = df_all['age']
df_all['age'] = age_col.mask((age_col < 14) | (age_col > 132), -1)

In [None]:
# One-hot-encoding features

ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language',
             'affiliate_channel', 'affiliate_provider',
             'first_affiliate_tracked', 'signup_app',
             'first_device_type', 'first_browser']

# Build one indicator frame per feature (columns prefixed with the feature
# name), then replace the raw columns with the indicators in a single
# concatenation.  The resulting column order is: untouched columns first,
# followed by the indicator groups in `ohe_feats` order.
ohe_frames = [pd.get_dummies(df_all[feat], prefix=feat) for feat in ohe_feats]
df_all = pd.concat([df_all.drop(columns=ohe_feats)] + ohe_frames, axis=1)

In [None]:
# Splitting train and test
# The first `piv_train` rows of df_all are the training rows; the remainder
# are the test rows.
X, test = df_all.iloc[:piv_train], df_all.iloc[piv_train:]

# Encode the target labels as integer class ids.  `le` is kept so that
# predicted class ids can be mapped back to the original labels.
le = LabelEncoder()
y = le.fit_transform(labels)

In [None]:
def test_classifier(X_train, y_train, X_test, y_test, classifier):
    """Fit `classifier`, score it on a hold-out set and log the results.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``classifier.fit``.
    X_test, y_test
        Hold-out features and labels used for scoring.
    classifier
        Estimator whose ``fit`` returns the fitted model and whose fitted
        model exposes ``predict``.

    Returns
    -------
    tuple
        ``(precision, recall, accuracy, f1)``.  Precision, recall and f1 are
        per-class results (``average=None``) ordered by the sorted labels
        found in ``y_train``; accuracy is the overall accuracy.
    """
    log("")
    log("===============================================")
    classifier_name = type(classifier).__name__
    log("Testing " + classifier_name)

    # Computed before the timer starts so the reported learning time covers
    # the fit call only.
    list_of_labels = sorted(set(y_train))

    now = time()
    model = classifier.fit(X_train, y_train)
    log("Learning time {0}s".format(time() - now))

    now = time()
    predictions = model.predict(X_test)
    log("Predicting time {0}s".format(time() - now))

    # Same per-class settings for all three metrics.
    per_class = dict(average=None, pos_label=None, labels=list_of_labels)
    precision = precision_score(y_test, predictions, **per_class)
    recall = recall_score(y_test, predictions, **per_class)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, **per_class)

    log("=================== Results ===================")
    log("F1       " + str(f1))
    log("Precision" + str(precision))
    log("Recall   " + str(recall))
    log("Accuracy " + str(accuracy))
    log("===============================================")

    return precision, recall, accuracy, f1

def log(x):
    """Emit one message for the notebook's progress reporting.

    Currently prints `x` to standard output; this is the single place to
    change if messages should be written to a log file instead.
    """
    print(x)

In [None]:
def cv(classifier, X_train, y_train, folds=8, n_jobs=-1):
    """Cross-validate `classifier` and log the per-fold and mean scores.

    Parameters
    ----------
    classifier
        Estimator handed to ``cross_val_score``.
    X_train, y_train
        Features and labels to cross-validate on.
    folds
        Value passed as ``cv`` to ``cross_val_score`` (default 8).
    n_jobs
        Value passed as ``n_jobs`` to ``cross_val_score`` (default -1).

    Returns
    -------
    list
        One-element list holding the array of per-fold scores.  The scores
        come from ``cross_val_score`` without an explicit ``scoring``
        argument and are logged under the name "accuracy".
    """
    log("===============================================")
    classifier_name = type(classifier).__name__
    now = time()
    log("Crossvalidating " + classifier_name + "...")
    scores = cross_val_score(classifier, X_train, y_train, cv=folds, n_jobs=n_jobs)
    log("Crossvalidation completed in {0}s".format(time() - now))
    log("Accuracy: " + str(scores))
    log("Average accuracy: " + str(np.array(scores).mean()))
    log("===============================================")
    # Wrapped in a list to keep the return shape callers already rely on.
    return [scores]

In [None]:
# Random Forest Classifier

# 70/30 stratified hold-out split.  Seeding the split and the forest makes
# the reported scores repeatable from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.7, stratify=y,
                                                    random_state=0)
precision, recall, accuracy, f1 = test_classifier(
    X_train, y_train, X_test, y_test, RandomForestClassifier(random_state=0))

In [None]:
# rf_acc = cv(RandomForestClassifier(),X, y)

In [None]:
# Fit the forest on all training rows and predict class probabilities for
# the test rows.  The seed makes the fitted forest repeatable between runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X, y)
y_pred = rf.predict_proba(test)

In [None]:
# Gradient Boosting Classifier
# (GradientBoostingClassifier is imported in the notebook's first cell.)

# 70/30 stratified hold-out split; split and model are seeded so the
# reported scores are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.7, stratify=y,
                                                    random_state=0)
precision, recall, accuracy, f1 = test_classifier(
    X_train, y_train, X_test, y_test,
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1,
                               random_state=0))

In [None]:
# gbm_acc = cv(GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1),X, y)

In [None]:
# Fit gradient boosting on all training rows and predict class probabilities
# for the test rows.
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1,
                                 random_state=0)
gbm.fit(X, y)
# Predict with the model fitted above; the previous code called an undefined
# `nb` object here.
y_pred = gbm.predict_proba(test)

In [None]:
# AdaBoost Classifier
# (AdaBoostClassifier and DecisionTreeClassifier are imported in the
# notebook's first cell.)

# 70/30 stratified hold-out split; split and models are seeded so the
# reported scores are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.7, stratify=y,
                                                    random_state=0)
# NOTE(review): newer scikit-learn releases name the `base_estimator`
# argument `estimator` - confirm against the installed version.
precision, recall, accuracy, f1 = test_classifier(
    X_train, y_train, X_test, y_test,
    AdaBoostClassifier(n_estimators=100,
                       base_estimator=DecisionTreeClassifier(random_state=0),
                       learning_rate=1, random_state=0))

In [None]:
# ada_acc = cv(AdaBoostClassifier(n_estimators=100, base_estimator= DecisionTreeClassifier(),learning_rate=1),X, y)

In [None]:
# Fit AdaBoost on all training rows and predict class probabilities for the
# test rows.
# NOTE(review): newer scikit-learn releases name the `base_estimator`
# argument `estimator` - confirm against the installed version.
ada = AdaBoostClassifier(n_estimators=100,
                         base_estimator=DecisionTreeClassifier(random_state=0),
                         learning_rate=1, random_state=0)
ada.fit(X, y)
# Predict with the model fitted above; the previous code called an undefined
# `nb` object here.
y_pred = ada.predict_proba(test)

In [None]:
# XGBoost Classifier

# 70/30 stratified hold-out split; the seed makes the split repeatable so
# scores can be compared between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.7, stratify=y,
                                                    random_state=0)
precision, recall, accuracy, f1 = test_classifier(
    X_train, y_train, X_test, y_test, XGBClassifier())

In [None]:
# xgb_acc = cv(XGBClassifier(),X, y)

In [None]:
# Classifier
# XGBoost model fitted on all training rows; `y_pred` holds its predicted
# class probabilities for the test rows.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.3,
    n_estimators=25,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X, y)
y_pred = xgb.predict_proba(test)

In [None]:
# Tuning the XGBoost Classifier parameters 

def report(results, n_top=3):
    """Log the best-ranked candidates of a parameter search.

    `results` is indexed with the keys 'rank_test_score', 'mean_test_score',
    'std_test_score' and 'params' (best_fit passes a search's
    ``cv_results_``).  For every rank from 1 to `n_top`, each candidate
    holding that rank is logged with its mean/std validation score and its
    parameters, followed by an empty line.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == rank):
            log("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][candidate]
            std_score = results['std_test_score'][candidate]
            log("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            log("Parameters: {0}".format(results['params'][candidate]))
            log("")

def best_fit(X_train, y_train, n_iter=5, seed=0):
    """Randomized hyper-parameter search for the XGBoost classifier.

    Samples `n_iter` parameter combinations from a fixed grid, scores each
    with 8-fold cross-validation on accuracy, and logs the start time, the
    ten best-ranked settings (via `report`), the end time and the elapsed
    seconds.

    Parameters
    ----------
    X_train, y_train
        Features and labels the search is fitted on.
    n_iter
        Number of parameter settings to sample (default 5).
    seed
        Seed given to both the classifier and the search's sampling
        (default 0).

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object.
    """
    # The module-level name `time` is the function time.time, so ctime is
    # imported separately here.
    from time import ctime

    parameters = {
        "n_estimators": [25, 103, 201, 403],
        "max_depth": [3, 10, 15, 30],
        "objective": ["multi:softmax", 'multi:softprob'],
        "learning_rate": [0.05, 0.1, 0.15, 0.3]
    }

    # XGBClassifier is the class imported by this notebook; the previous code
    # referenced an undefined `XGBoostClassifier` and an undefined `seed`.
    rand_search = RandomizedSearchCV(XGBClassifier(seed=seed),
                                     param_distributions=parameters,
                                     n_iter=n_iter, scoring="accuracy",
                                     n_jobs=-1, cv=8, random_state=seed)

    now = time()
    log(ctime())
    rand_search.fit(X_train, y_train)
    report(rand_search.cv_results_, 10)
    log(ctime())
    log("Search took: " + str(time() - now))
    return rand_search

In [None]:
# best_fit(data_model.iloc[:, 1:], data_model.iloc[:, 0], n_iter=10)

In [None]:
# Taking the 5 classes with highest probabilities
# For every test user: rank the classes from most to least probable, map the
# class ids back to labels with `le`, and keep the first five.  The user's id
# is repeated five times so `ids` and `cts` stay aligned row for row.
ids, cts = [], []
for user_id, class_probs in zip(id_test, y_pred):
    ranked_classes = np.argsort(class_probs)[::-1]
    ids.extend([user_id] * 5)
    cts.extend(le.inverse_transform(ranked_classes)[:5].tolist())

In [None]:
# Generate submission
# Two-column table (id, country) built from the aligned `ids` / `cts` lists
# and written to CSV without the index column.
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('submission_xgb.csv', index=False)