## Random Forest Classifier
I used an ensemble model to reduce overfitting, since many trees 'decide' the outcome together (individual trees are prone to overfitting). This model was also chosen for convenience: numerical data do not need to be standardized, which would otherwise require reusing the $\mu$ and $\sigma$ determined from the training data when standardizing the cross-validation and test sets.

To find good parameters, I did 5-fold cross validation on the training data and searched a **very small** region of the parameter space (to save running time). The evaluation metric is the f1 score (which combines precision and recall); it is more appropriate for skewed labels than the accuracy score, since predicting 'no' for every y would still give a high accuracy.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
def model_findparams(features, labels, model, params):
    """
    Grid-search the parameter grid `params` for `model`.

    Every candidate is evaluated with 5-fold cross validation and ranked by
    the binary f1 score instead of accuracy, because the labels are skewed
    towards y=0.

    Returns the fitted GridSearchCV object; the winning configuration is
    available through its best_params_ and best_estimator_ attributes.
    """
    # f1 on the positive class is the selection criterion
    f1_scorer = make_scorer(f1_score, average='binary')
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=f1_scorer,
        cv=5,
    )
    # run the search on the supplied data
    search.fit(features, labels)
    return search

def model_predict(estimator, features, labels):
    """
    Predict with `estimator` on `features` and score against `labels`.

    Returns a tuple (predictions, binary f1 score of the predictions).
    """
    predicted = estimator.predict(features)
    return predicted, f1_score(y_true=labels, y_pred=predicted, average='binary')

We discard 'duration', 'pdays' (due to extreme skewness), and the external factors. I did not consider the external factors since it is not indicated when they were taken or how they are connected to each individual client (data-wise).

Initially, we get an f1 score of 0.31 and 88.8% classification accuracy (we expect the classification accuracy to be higher because the labels are skewed).

In [3]:
if __name__ == "__main__":
    # Load the data set (semicolon-separated file).
    df = pd.read_csv('bank-additional-full.csv', sep=';')

    # Discard 'duration', 'pdays' and the external-factor columns.
    # drop() keeps the remaining columns in their original order.
    df = df.drop(columns=[
        'duration',
        'pdays',
        'emp.var.rate',
        'cons.price.idx',
        'cons.conf.idx',
        'euribor3m',
        'nr.employed',
    ])

    # log transform numerical data
    df['age'] = np.log(df.age)

    # Encode categorical data
    # combine 'retired', 'student' as 'outside' (i.e. outside the working sector)
    # combine 'housemaid', 'services' as 'services'
    # combine 'admin.', 'management' as 'admin'
    # combine 'entrepreneur', 'self-employed' as 'self-employed'
    old_job_keys = ['unknown', 'unemployed', 'retired', 'student', 'blue-collar', 'technician',
                    'housemaid', 'services', 'admin.', 'management', 'entrepreneur', 'self-employed']
    new_job_keys = ['unknown', 'unemployed', 'outside', 'outside', 'blue-collar', 'technician',
                    'services', 'services', 'admin', 'admin', 'self-employed', 'self-employed']
    job_grp_dict = dict(zip(old_job_keys, new_job_keys))
    df['job'] = df['job'].map(job_grp_dict)
    # one-hot encode the grouped job categories and append them after the existing columns
    dummy_job = pd.get_dummies(df[['job']])
    df = pd.concat([df, dummy_job], axis=1)
    del df['job']

    # one-hot encode marital status the same way
    dummy_marital = pd.get_dummies(df[['marital']])
    df = pd.concat([df, dummy_marital], axis=1)
    del df['marital']

    # combine 'basic.4y', 'basic.6y', 'basic.9y' categories into 'basic'
    old_educ_keys = ['unknown', 'illiterate', 'basic.4y', 'basic.6y', 'basic.9y',
                     'high.school', 'professional.course', 'university.degree']
    new_educ_keys = ['unknown', 'illiterate', 'basic', 'basic', 'basic',
                     'high.school', 'professional.course', 'university.degree']
    educ_grp_dict = dict(zip(old_educ_keys, new_educ_keys))
    df['education'] = df['education'].map(educ_grp_dict)
    # Ordinal codes for education. NOTE: 'basic' appears three times in
    # new_educ_keys and dict() keeps the last pair, so the resulting codes are
    # 1, 2, 5, 6, 7, 8 -- ordered but not contiguous.
    educ_dict = dict(zip(new_educ_keys, range(1, len(new_educ_keys) + 1)))
    df['education'] = df['education'].map(educ_dict)

    # Three-valued columns are coded no=-1, unknown=0, yes=1; the inv_* dicts
    # map the numeric codes back to the original labels.
    default_keys = ['no', 'unknown', 'yes']
    default_dict = dict(zip(default_keys, [-1, 0, 1]))
    df['default'] = df['default'].map(default_dict)
    inv_default_dict = {val: key for key, val in default_dict.items()}

    housing_keys = ['no', 'unknown', 'yes']
    housing_dict = dict(zip(housing_keys, [-1, 0, 1]))
    df['housing'] = df['housing'].map(housing_dict)
    inv_housing_dict = {val: key for key, val in housing_dict.items()}

    loan_keys = ['no', 'unknown', 'yes']
    loan_dict = dict(zip(loan_keys, [-1, 0, 1]))
    df['loan'] = df['loan'].map(loan_dict)
    inv_loan_dict = {val: key for key, val in loan_dict.items()}

    cont_keys = ['telephone', 'cellular']
    cont_dict = dict(zip(cont_keys, [0, 1]))
    df['contact'] = df['contact'].map(cont_dict)
    inv_cont_dict = {val: key for key, val in cont_dict.items()}

    # months and weekdays are coded by their calendar position, starting at 1
    month_keys = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                  'jul', 'aug', 'sep', 'oct', 'nov', 'dec']  # df['month'].unique()
    month_dict = dict(zip(month_keys, range(1, len(month_keys) + 1)))
    df['month'] = df['month'].map(month_dict)
    inv_month_dict = {val: key for key, val in month_dict.items()}

    week_keys = ['mon', 'tue', 'wed', 'thu', 'fri']
    weekday_dict = dict(zip(week_keys, range(1, len(week_keys) + 1)))
    df['day_of_week'] = df['day_of_week'].map(weekday_dict)
    inv_weekday_dict = {val: key for key, val in weekday_dict.items()}

    pout_keys = ['failure', 'nonexistent', 'success']
    pout_dict = dict(zip(pout_keys, [-1, 0, 1]))
    df['poutcome'] = df['poutcome'].map(pout_dict)
    inv_pout_dict = {val: key for key, val in pout_dict.items()}

    # target label: no=0, yes=1
    y_keys = ['no', 'yes']
    y_dict = dict(zip(y_keys, [0, 1]))
    df['y'] = df['y'].map(y_dict)
    # Stored under its own name: this inverse mapping used to be assigned to
    # inv_cont_dict, which overwrote the inverse contact mapping built above.
    inv_y_dict = {val: key for key, val in y_dict.items()}

    # delete very skewed dummy features
    del df['marital_unknown']

    # Split the data set (80% train / 20% test, fixed seed)
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    feature_cols = df.columns[df.columns != 'y']  # every column except the target
    X_train, y_train = df_train[feature_cols], df_train['y']
    X_test, y_test = df_test[feature_cols], df_test['y']

    # instantiate a random forest classifier
    rfc = RandomForestClassifier(random_state=42)

    # search parameter space for optimal parameters
    # (5 evenly spaced integer values per parameter -> 25 candidates)
    parameters = {
        'n_estimators': np.linspace(100, 1000, num=5, dtype=int),
        'max_depth': np.linspace(10, 101, num=5, dtype=int),
    }
    # best_model is the fitted GridSearchCV object returned by model_findparams
    best_model = model_findparams(X_train, y_train, rfc, parameters)

    # predict and evaluate the model performance on the held-out test set
    preds, score = model_predict(best_model, X_test, y_test)
    accuracy_score = np.mean(preds == y_test)  # fraction of correct predictions
    print(score)
    print(accuracy_score)

0.3067669172932331
0.8880796309783928
