In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.preprocessing import FunctionTransformer, Imputer, OneHotEncoder

In [None]:
import sklearn

In [None]:
from sklearn.model_selection import train_test_split

## Load data

In [None]:
def load_data(load_age_gender=False, load_sessions=False, data_dir='../input'):
    """Load the training users table and, optionally, the auxiliary tables.

    Parameters
    ----------
    load_age_gender : bool, default False
        If True, also read ``age_gender_bkts.csv``.
    load_sessions : bool, default False
        If True, also read ``sessions.csv``.
    data_dir : str, default '../input'
        Directory that contains the CSV files.

    Returns
    -------
    tuple
        ``(data, age_gender, sessions)``. ``data`` is ``train_users_2.csv``
        without the ``id`` and ``date_first_booking`` columns; the two other
        items are DataFrames when requested and ``None`` otherwise.
    """
    import os  # local import: the notebook has no top-level `import os`

    def _read(file_name):
        return pd.read_csv(os.path.join(data_dir, file_name))

    age_gender = _read('age_gender_bkts.csv') if load_age_gender else None
    sessions = _read('sessions.csv') if load_sessions else None
    # Non-inplace drop keeps the function free of hidden mutation.
    data = _read('train_users_2.csv').drop(['id', 'date_first_booking'], axis=1)
    return data, age_gender, sessions

In [None]:
data, _, _ = load_data()

In [None]:
data.shape

## Formatting

In [None]:
# Groups the raw column names by the way the notebook formats them.
COLUMNS_FORMAT=dict(
    # Cast to `str` by the formatting step, then label-encoded.
    # Includes the prediction target `country_destination`.
    category_columns = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                        'first_device_type', 'first_browser','country_destination'],
    # NOTE(review): this key is not read anywhere in the visible notebook.
    numeric_columns = ['age'],
    # NOTE(review): this key is not read anywhere in the visible notebook;
    # the two date columns are parsed by name in the formatting code.
    date_columns = ['date_account_created','timestamp_first_active']
)

#### Date columns

In [None]:
data['date_account_created'] = pd.to_datetime(data.date_account_created,
                                                format='%Y-%m-%d')
data['timestamp_first_active'] = pd.to_datetime(data.timestamp_first_active, 
                                                  format='%Y%m%d%H%M%S')

#### Category columns

In [None]:
for c in COLUMNS_FORMAT['category_columns']:
    data[c] = data[c].astype('str')

## Missing values

**Count missing values**

In [None]:
data.shape[0] - data.count()

## Exercise 1: Filling missing values

* Add a variable indicating whether the age value is missing
* Fill the missing age values with the median or the mean

In [None]:
### Write your code here

### Check

In [None]:
data.shape[0] - data.count()

## Convert categorical to numeric

In [None]:
from sklearn.preprocessing import LabelEncoder

### Try LabelEncoder

**Instantiate and fit a LabelEncoder object**

In [None]:
encoder = LabelEncoder()
encoder.fit(data.gender)

**Access the LabelEncoder classes**

In [None]:
encoder.classes_

**Apply the encoder to the column**

In [None]:
encoder.transform(data.gender)

### Here is a function to create a LabelEncoder for each categorical column

### Exercise 2: Complete this function

In [None]:
def create_label_encoder(df, columns_list):
    """Exercise stub: build one fitted LabelEncoder per column in `columns_list`.

    As written, the loop body is only `pass`, so the function returns an
    empty dictionary until the exercise is completed.

    Intended result (per the comments below): a dictionary mapping each
    column name to the LabelEncoder fitted on that column of `df`.
    """
    label_encoders={} # this dictionary will store the label encoder object
    for c in columns_list:
        # initiate an LabelEncoder object and fit it to column c
        
        
        # store the fitted object in the dictionary
        # label_encoders[c] = 
        pass
    return label_encoders

In [None]:
def apply_label_encoder(df, label_encoders):
    """Return a copy of `df` with each listed column replaced by its encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in `label_encoders`.
    label_encoders : dict
        Maps a column name to a fitted object exposing `transform(values)`.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is left untouched, like the other
        transformation helpers of this notebook. Working on a copy also
        makes the calling cell safe to re-run.
    """
    res = df.copy()
    for column, encoder in label_encoders.items():
        res[column] = encoder.transform(res[column])
    return res

#### Test the functions

In [None]:
label_encoders = create_label_encoder(data, COLUMNS_FORMAT['category_columns'])

In [None]:
data = apply_label_encoder(data, label_encoders)

In [None]:
data.head()

## Transform date variables to numeric

In this example, we will transform each date into a numeric column by taking its difference with another date column or with a fixed reference date.

### Introduction to timedelta in pandas

The difference between two date columns is returned in timedelta format.

In [None]:
(data.timestamp_first_active - data.date_account_created).head()

Convert timedelta to days 

In [None]:
(data.timestamp_first_active - data.date_account_created).dt.days.head()

### Exercise 3: Transform dates into numeric features

* Create a column "activation_delay" as the difference in days between the first active time and the account creation date
* Create a column "date_creation_float" as the difference in days between the account creation date and an arbitrary reference date
* Drop the original date columns

In [None]:
### Write your code here

#### Check

In [None]:
data.head()

## Train/test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# `StratifiedShuffleSplit` was never imported, and the old `cross_validation`
# signature (labels + `n_iter` in the constructor) no longer exists.
# With the `model_selection` API the labels are passed later, e.g.
# `split.split(data, data.country_destination)`.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1234)

In [None]:
train_ind, test_ind = train_test_split(data.index, stratify=data.country_destination, 
                                       test_size=0.2, random_state=1234)

In [None]:
len(train_ind), len(test_ind)

In [None]:
# `train_ind` / `test_ind` are index *labels* (they were drawn from `data.index`),
# so select rows with `.loc`; `.iloc` only matches while the index is 0..n-1.
X_train = data.loc[train_ind].drop('country_destination', axis=1)
y_train = data.loc[train_ind, 'country_destination']
X_test = data.loc[test_ind].drop('country_destination', axis=1)
y_test = data.loc[test_ind, 'country_destination']

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
X_train.head()

## Build ML models

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def test_model(model, X_test, y_test):
    """Return the accuracy of `model` on the held-out set.

    Parameters
    ----------
    model : fitted classifier exposing `predict_proba` and `classes_`
    X_test : feature matrix passed to `model.predict_proba`
    y_test : true labels, in the same encoding the model was fitted on

    Returns
    -------
    float
        Accuracy of the most probable class against `y_test`.
    """
    p_test = model.predict_proba(X_test)
    # Columns of `predict_proba` follow `model.classes_`; map the argmax
    # position back to the actual label instead of assuming labels are 0..K-1.
    predicted = np.asarray(model.classes_)[p_test.argmax(axis=1)]
    return accuracy_score(y_test, predicted)

In [None]:
model = LogisticRegression(penalty='l2', C=1.0, n_jobs=4)

In [None]:
model.fit(X_train, y_train)

In [None]:
test_model(model, X_test, y_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=10, n_jobs=4, min_samples_leaf=10)

In [None]:
rf.fit(X_train, y_train)

In [None]:
test_model(rf, X_test, y_test)

## Tuning hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Grid search example : Logistic Regression

In [None]:
params_grid = {
    'C': [10, 1, 1e-1, 1e-2, 1e-3]
}
search_ = GridSearchCV(model, params_grid, n_jobs=4, verbose=1, cv=5)

In [None]:
search_.fit(X_train, y_train)

In [None]:
search_.best_score_

In [None]:
search_.best_params_

In [None]:
search_.best_estimator_.score(X_test, y_test)

### Random search example : Random Forest

In [None]:
rf_grid = {
    'n_estimators':[300], 
    'min_samples_leaf':[1, 5, 10, 20],
    'max_depth':[3,5,9, None],
}

In [None]:
search_rf = RandomizedSearchCV(rf, rf_grid, n_iter=5, n_jobs=4, cv=5)

In [None]:
search_rf.fit(X_train, y_train)

In [None]:
# `grid_scores_` was removed from scikit-learn; `cv_results_` is its replacement.
pd.DataFrame(search_rf.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
].sort_values('rank_test_score')

In [None]:
search_rf.best_estimator_.score(X_test, y_test)

In [None]:
test_model(search_rf.best_estimator_, X_test, y_test)

# Interpretation

Let's rebuild all the transformation steps into one pipeline.

In [None]:
def format_columns(df, column_format):
    """Return a copy of `df` with parsed dates and string-typed categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw users frame containing `date_account_created`,
        `timestamp_first_active` and every column listed under
        `column_format['category_columns']`.
    column_format : dict
        Must provide the key `'category_columns'` (list of column names).

    Returns
    -------
    pandas.DataFrame
        New frame; `df` is not modified.
    """
    res = df.copy()
    res['date_account_created'] = pd.to_datetime(res['date_account_created'],
                                                 format='%Y-%m-%d')
    res['timestamp_first_active'] = pd.to_datetime(res['timestamp_first_active'],
                                                   format='%Y%m%d%H%M%S')
    for c in column_format['category_columns']:
        # Replace the whole column: `res.loc[:, c] = ...` tries to write the
        # strings into the existing dtype (e.g. the integer `signup_flow`).
        res[c] = res[c].astype('str')
    return res

In [None]:
def fill_missing_values(df, age_fill_value=None):
    """Return a copy of `df` with an `age_missing` flag and `age` imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an `age` column.
    age_fill_value : float, optional
        Value used to replace missing ages. When omitted, the median of
        `df.age` is used (the historical behaviour). Pass a statistic
        computed on the training set to impute new data consistently.

    Returns
    -------
    pandas.DataFrame
        New frame with `age_missing` (1 where age was missing, else 0)
        appended and `age` filled; `df` is not modified.
    """
    res = df.copy()
    if age_fill_value is None:
        age_fill_value = res['age'].median()
    # Flag first, so the indicator reflects the ages before imputation.
    res['age_missing'] = res['age'].isnull().astype('int')
    res['age'] = res['age'].fillna(age_fill_value)
    return res

In [None]:
def transform_dates(df, reference_date='2010-01-01'):
    """Return a copy of `df` with the date columns turned into day counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose `timestamp_first_active` and `date_account_created`
        columns are already datetimes.
    reference_date : str or datetime-like, default '2010-01-01'
        Origin used for `date_creation_float`.

    Returns
    -------
    pandas.DataFrame
        New frame with `activation_delay` and `date_creation_float`
        appended and the two original date columns removed; `df` is not
        modified.
    """
    res = df.copy()
    created = res['date_account_created']
    # Whole days between the first activity and the account creation.
    res['activation_delay'] = (res['timestamp_first_active'] - created).dt.days
    # Whole days between the account creation and the reference date.
    res['date_creation_float'] = (created - pd.to_datetime(reference_date)).dt.days
    return res.drop(['timestamp_first_active', 'date_account_created'], axis=1)

In [None]:
def pipeline(raw_data, label_encoders, model):
    """Run every preprocessing step on `raw_data` and return class probabilities.

    The steps are, in order: `format_columns` (with the notebook-level
    `COLUMNS_FORMAT`), `fill_missing_values`, `apply_label_encoder` and
    `transform_dates`. The target column `country_destination` is then
    dropped and the remaining columns are given to `model.predict_proba`.

    Returns a DataFrame of probabilities whose columns are named after the
    classes of the `country_destination` label encoder.
    """
    prepared = transform_dates(
        apply_label_encoder(
            fill_missing_values(format_columns(raw_data, COLUMNS_FORMAT)),
            label_encoders,
        )
    )
    features = prepared.drop('country_destination', axis=1)

    class_labels = label_encoders['country_destination'].classes_
    probabilities = model.predict_proba(features)
    return pd.DataFrame(probabilities, columns=class_labels)

In [None]:
raw_data, _, _ = load_data()
# `test_ind` holds index labels taken from `data.index`, so select by label.
raw_data = raw_data.loc[test_ind]

In [None]:
p_test = pipeline(raw_data, label_encoders, rf)

In [None]:
p_test.head()

In [None]:
accuracy_score(y_test, np.argmax(p_test.values, axis=1))

### Permutation importance

In [None]:
def permutation_importance(predict_function, X, y, loss_function, random_state=None):
    """Measure how much the loss grows when each column of `X` is shuffled.

    Parameters
    ----------
    predict_function : callable
        Called as `predict_function(frame)`; its result is passed to
        `loss_function`.
    X : pandas.DataFrame
        Input frame; it is copied before each shuffle and never modified.
    y : array-like
        Ground truth passed as first argument of `loss_function`.
    loss_function : callable
        Called as `loss_function(y, prediction)` and returning a number.
    random_state : int, optional
        Seed for the shuffles. When omitted, the global NumPy random state
        is used (the historical behaviour), so results vary between runs
        unless `np.random.seed` was called.

    Returns
    -------
    pandas.Series
        Named 'permutation_importance', indexed by column name, holding
        `loss(shuffled) - loss(baseline)` for each column.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    baseline = loss_function(y, predict_function(X))
    importance = {}
    for feature_name in X.columns:
        X_permute = X.copy()
        X_permute[feature_name] = rng.permutation(X_permute[feature_name])
        importance[feature_name] = loss_function(y, predict_function(X_permute)) - baseline
    return pd.Series(importance, name='permutation_importance')

In [None]:
def classif_error(y_true, y_pred):
    """Return the misclassification rate (1 - accuracy).

    `y_pred` is a DataFrame of per-class scores; the predicted class of a
    row is the position of its largest value.
    """
    predicted_class = y_pred.values.argmax(axis=1)
    accuracy = accuracy_score(y_true, predicted_class)
    return 1 - accuracy

In [None]:
def predict_function(X):
    """Return the `pipeline` predictions for the raw frame `X`.

    Binds the notebook-level `label_encoders` and the `rf` model so that
    the helper can be called with the data as its only argument.
    """
    return pipeline(X, label_encoders=label_encoders, model=rf)

In [None]:
feature_importance = permutation_importance(predict_function, raw_data, y_test, classif_error)

In [None]:
feature_importance.sort_values().plot(kind='barh')

### Partial dependence

#### Age

In [None]:
def partial_dependence(X, column, values, predict_function):
    """Average the predictions obtained when `column` is forced to each value.

    For every entry of `values`, a copy of `X` gets `column` overwritten
    with that entry, `predict_function` is applied and its output is
    averaged over rows. The result has one row per value and one column per
    output of `predict_function`. `X` itself is never modified.
    """
    def _mean_prediction(forced_value):
        frame = X.copy()
        frame[column] = forced_value
        return predict_function(frame).mean(axis=0)

    averages = {value: _mean_prediction(value) for value in values}
    return pd.DataFrame(averages).T

In [None]:
age_steps = [raw_data.age.dropna().quantile(q) for q in np.arange(0.1, 1, 0.05)]

In [None]:
age_steps

In [None]:
pd_age = partial_dependence(raw_data, 'age', age_steps, predict_function)

In [None]:
for c in pd_age.columns:
    pd_age[c].plot(title=c)
    plt.show()

#### Signup method

In [None]:
signup_method_val = raw_data.signup_method.unique()

In [None]:
signup_method_val

In [None]:
pd_signup_method = partial_dependence(raw_data, 'signup_method', signup_method_val, predict_function)

In [None]:
for c in pd_signup_method.columns:
    pd_signup_method[c].plot(kind='bar',title=c)
    plt.show()