# Project 7: Implement a scoring model.

*Pierre-Eloi Ragetly*

This project is part of the Data Scientist path offered by OpenClassrooms.

In [1]:
# File system management
import os

# Get execution time to compare models
import time

# Import numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
plt.rcParams.update({'axes.edgecolor': 'white',
                     'axes.facecolor': 'white',
                     'axes.linewidth': 2.0,
                     'figure.facecolor': 'white'})

# Where to save the figures
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure in the ``charts`` folder.

    The folder is created on first use. The file is written to
    ``charts/<fig_id>.<fig_extension>``.

    Parameters
    ----------
    fig_id : str
        File name of the figure, without extension.
    tight_layout : bool, default True
        If True, call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        Used both as the file suffix and as the ``format`` passed to
        ``plt.savefig``.
    resolution : int, default 300
        Value passed as ``dpi`` to ``plt.savefig``.
    """
    folder_path = os.path.join("charts")
    # exist_ok=True replaces the separate isdir() check: no window between
    # the test and the creation, and no error when the folder already exists.
    os.makedirs(folder_path, exist_ok=True)
    # Reuse folder_path so the folder name is written in a single place.
    path = os.path.join(folder_path, fig_id + "." + fig_extension)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Get all functions required to prepare data
from functions.data_preparation import *

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prepare-the-data" data-toc-modified-id="Prepare-the-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prepare the data</a></span><ul class="toc-item"><li><span><a href="#Read-in-data" data-toc-modified-id="Read-in-data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Read in data</a></span></li><li><span><a href="#Transform-data" data-toc-modified-id="Transform-data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Transform data</a></span></li><li><span><a href="#Handle-Categorical-features" data-toc-modified-id="Handle-Categorical-features-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Handle Categorical features</a></span><ul class="toc-item"><li><span><a href="#Fill-in-missing-values" data-toc-modified-id="Fill-in-missing-values-1.3.1"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Fill in missing values</a></span></li><li><span><a href="#Categorical-features-pipeline" data-toc-modified-id="Categorical-features-pipeline-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Categorical features pipeline</a></span></li></ul></li><li><span><a href="#Handle-numerical-features" data-toc-modified-id="Handle-numerical-features-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Handle numerical features</a></span><ul class="toc-item"><li><span><a href="#Ordinal-features-pipeline" data-toc-modified-id="Ordinal-features-pipeline-1.4.1"><span class="toc-item-num">1.4.1&nbsp;&nbsp;</span>Ordinal features pipeline</a></span></li><li><span><a href="#Sparse-features-pipeline" data-toc-modified-id="Sparse-features-pipeline-1.4.2"><span class="toc-item-num">1.4.2&nbsp;&nbsp;</span>Sparse features pipeline</a></span></li><li><span><a href="#Dense-features-pipeline" data-toc-modified-id="Dense-features-pipeline-1.4.3"><span class="toc-item-num">1.4.3&nbsp;&nbsp;</span>Dense features pipeline</a></span></li></ul></li><li><span><a href="#Transformation-Pipelines" 
data-toc-modified-id="Transformation-Pipelines-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Transformation Pipelines</a></span></li></ul></li><li><span><a href="#Shortlist-Promising-Models" data-toc-modified-id="Shortlist-Promising-Models-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Shortlist Promising Models</a></span><ul class="toc-item"><li><span><a href="#Select-a-Performance-Measure" data-toc-modified-id="Select-a-Performance-Measure-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Select a Performance Measure</a></span></li><li><span><a href="#Establish-a-performance-baseline-with-a-dummy-classifier" data-toc-modified-id="Establish-a-performance-baseline-with-a-dummy-classifier-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Establish a performance baseline with a dummy classifier</a></span></li><li><span><a href="#Train-quick-and-dirty-models-and-compare-their-performance" data-toc-modified-id="Train-quick-and-dirty-models-and-compare-their-performance-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Train quick and dirty models and compare their performance</a></span></li></ul></li><li><span><a href="#Fine-Tune-the-System" data-toc-modified-id="Fine-Tune-the-System-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Fine-Tune the System</a></span><ul class="toc-item"><li><span><a href="#Use-as-much-data-as-possible-by-merging-all-tables" data-toc-modified-id="Use-as-much-data-as-possible-by-merging-all-tables-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Use as much data as possible by merging all tables</a></span></li><li><span><a href="#Feature-Selection" data-toc-modified-id="Feature-Selection-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Feature Selection</a></span></li><li><span><a href="#Data-augmentation-with-SMOTE" data-toc-modified-id="Data-augmentation-with-SMOTE-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Data augmentation with SMOTE</a></span></li><li><span><a 
href="#Fine-Tune-the-hyperparameters-using-cross-validation" data-toc-modified-id="Fine-Tune-the-hyperparameters-using-cross-validation-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Fine-Tune the hyperparameters using cross-validation</a></span></li><li><span><a href="#Try-ensemble-methods-and-select-the-final-model" data-toc-modified-id="Try-ensemble-methods-and-select-the-final-model-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Try ensemble methods and select the final model</a></span></li></ul></li></ul></div>

## Prepare the data

### Read in data

In [2]:
# Folder holding the raw CSV files
DATA_DIR = "data"

# Show what is available in the data folder (case-insensitive alphabetical order)
list_files = sorted(os.listdir(DATA_DIR), key=str.lower)
for rank, filename in enumerate(list_files, start=1):
    print("{}) {}".format(rank, filename))


def _read_table(filename):
    """Read one CSV file located in DATA_DIR and return it as a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, filename))


# Load every table by its explicit file name rather than by its position in
# the directory listing: an extra entry in the folder (hidden file, checkpoint
# directory, new export...) would shift the positions and silently load the
# wrong file into the wrong variable.
app_test = _read_table("application_test.csv")
app_train = _read_table("application_train.csv")
bureau = _read_table("bureau.csv")
b_b = _read_table("bureau_balance.csv")
cc_balance = _read_table("credit_card_balance.csv")
ins_payments = _read_table("installments_payments.csv")
pos_cash = _read_table("POS_CASH_balance.csv")
prev_app = _read_table("previous_application.csv")

1) application_test.csv
2) application_train.csv
3) bureau.csv
4) bureau_balance.csv
5) credit_card_balance.csv
6) HomeCredit_columns_description.csv
7) installments_payments.csv
8) POS_CASH_balance.csv
9) previous_application.csv
10) sample_submission.csv


### Transform data

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [4]:
# Keep the features only: the row identifier and the label are removed
X = app_train.drop(columns=['SK_ID_CURR', 'TARGET'])

# ---------------------------------------------------------------------------
# Attribute groups
# ---------------------------------------------------------------------------
# Categorical attributes are the columns stored with the object dtype
cat_att = X.select_dtypes('object').columns.tolist()

# Replacement values handed to impute_cat_att: first index entry of
# value_counts() computed on the filtered categorical columns.
# NOTE(review): on a DataFrame, value_counts() counts whole rows, so this is
# presumably the most frequent *combination* of categories rather than the
# per-column mode -- confirm this is what impute_cat_att expects.
values = drop_na_att(X[cat_att]).value_counts().index[0]

# Numerical attributes, split in three families:
#   - ordinal: fewer than 6 distinct values
#   - sparse : not ordinal, and more than half of the rows equal to 0
#   - dense  : everything else
num_att = X.select_dtypes(['int', 'float']).columns.tolist()
n_distinct = X[num_att].nunique()
ord_att = n_distinct.index[n_distinct < 6].tolist()

half_n_rows = 0.5 * len(X)
sparse_att = []
dense_att = []
for col in num_att:
    if col in ord_att:
        continue
    if (X[col] == 0).sum() > half_n_rows:
        sparse_att.append(col)
    else:
        dense_att.append(col)

# Names given to add_polynomial_att: the dense columns kept by drop_na_att,
# followed by 'DAYS_EMPLOYED_ANOM'
filtered_dense_att = [*drop_na_att(X[dense_att]), 'DAYS_EMPLOYED_ANOM']


# ---------------------------------------------------------------------------
# Pipelines
# ---------------------------------------------------------------------------
def _na_filter():
    """Return a new transformer wrapping drop_na_att (first step of each pipeline)."""
    return FunctionTransformer(drop_na_att)


# Categorical features: filter, fill in missing values, then one-hot encode.
# drop='if_binary' drops the first category of features having two categories.
cat_pipeline = Pipeline(steps=[
    ('filter', _na_filter()),
    ('imputer', FunctionTransformer(impute_cat_att, kw_args={'values': values})),
    ('encoder', OneHotEncoder(drop='if_binary')),
])

# Ordinal numerical features: filter, then impute with the most frequent value
ord_pipeline = Pipeline(steps=[
    ('filter', _na_filter()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Sparse features with at least 6 distinct values: MaxAbsScaler is used for
# the final scaling step
sparse_pipeline = Pipeline(steps=[
    ('filter', _na_filter()),
    ('cleaner', FunctionTransformer(fix_sparse_anomalies)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', MaxAbsScaler()),
])

# Dense features with at least 6 distinct values: cleaned, imputed with the
# SimpleImputer default strategy, enriched with polynomial and domain
# attributes, passed through tr_skew_att, then standardized
dense_pipeline = Pipeline(steps=[
    ('filter', _na_filter()),
    ('cleaner', FunctionTransformer(fix_dense_anomalies)),
    ('imputer', SimpleImputer()),
    ('poly_adder', FunctionTransformer(add_polynomial_att,
                                       kw_args={'names': filtered_dense_att})),
    ('domain_adder', FunctionTransformer(add_domain_att)),
    ('skew_transformer', FunctionTransformer(tr_skew_att)),
    ('scaler', StandardScaler()),
])

# Full preparation: each pipeline is applied to its own group of columns
full_pipeline = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, cat_att),
    ('ordinal', ord_pipeline, ord_att),
    ('sparse', sparse_pipeline, sparse_att),
    ('dense', dense_pipeline, dense_att),
])

In [5]:
# Prepare data
X_tr = full_pipeline.fit_transform(X)


def _feature_names(transformer, input_features):
    """Return the output feature names of a fitted transformer as a list.

    Uses ``get_feature_names_out`` when the installed scikit-learn provides it
    and falls back to the older ``get_feature_names`` otherwise, so the cell
    runs on both old and recent scikit-learn releases.
    """
    if hasattr(transformer, "get_feature_names_out"):
        names = transformer.get_feature_names_out(input_features)
    else:
        names = transformer.get_feature_names(input_features)
    # Always a plain list: the names are concatenated with `+` below
    return list(names)


# Get the name of onehot encoded features
# (a separate encoder is fitted on the same filtered/imputed columns)
kept_cat_att = list(drop_na_att(X[cat_att]))
encoder = OneHotEncoder(drop='if_binary')
encoder.fit(impute_cat_att(X[kept_cat_att], values=values))
onehot_att = _feature_names(encoder, kept_cat_att)

# Get the name of polynomial attributes
poly_att = ['EXT_SOURCE_2', 'EXT_SOURCE_3']
poly_transformer = PolynomialFeatures(degree=2, include_bias=False)
poly_transformer.fit(X[poly_att].fillna(X[poly_att].mean()))
n = len(poly_att)
# The first n names are the input features themselves: keep the new terms only
poly_att = _feature_names(poly_transformer, poly_att)[n:]

# Get the name of domain attributes
domain_att = ['DAYS_EMPLOYED_PERC', 'CREDIT_INCOME_PERC', 'INCOME_PER_PERSON',
              'ANNUITY_INCOME_PERC', 'CREDIT_TERM']

# Get the name of all attributes.
# ColumnTransformer stacks its outputs in the order of its transformers list
# ('cat', 'ordinal', 'sparse', 'dense'), NOT in the original column order of X.
# The numerical names are therefore built group by group, applying the same
# drop_na_att filter as each pipeline, so every label matches its column.
kept_num_att = (list(drop_na_att(X[ord_att]))
                + list(drop_na_att(X[sparse_att]))
                + list(drop_na_att(X[dense_att])))
# NOTE(review): assumes the dense pipeline appends DAYS_EMPLOYED_ANOM, then the
# polynomial terms, then the domain attributes after its filtered input
# columns -- confirm against functions.data_preparation.
extra_att = ['DAYS_EMPLOYED_ANOM'] + poly_att + domain_att
final_att = onehot_att + kept_num_att + extra_att

assert X_tr.shape[1] == len(final_att), (
    "{} prepared columns but {} names".format(X_tr.shape[1], len(final_att)))

df = pd.DataFrame(X_tr, columns=final_att)
df.head()

Unnamed: 0,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,CODE_GENDER_XNA,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,...,AMT_REQ_CREDIT_BUREAU_YEAR,DAYS_EMPLOYED_ANOM,EXT_SOURCE_2^2,EXT_SOURCE_2 EXT_SOURCE_3,EXT_SOURCE_3^2,DAYS_EMPLOYED_PERC,CREDIT_INCOME_PERC,INCOME_PER_PERSON,ANNUITY_INCOME_PERC,CREDIT_TERM
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.5176655,-0.468635,-1.350227,-1.632233,-1.608672,-0.685451,-0.755852,1.548683,-0.629679,0.326909
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.092866,-0.468635,0.501725,0.369212,-0.180027,-0.652211,0.56797,0.912393,-0.510993,-1.178242
2,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,-1.092866,-0.468635,0.046658,0.99335,1.424589,-1.222743,-0.761159,-0.175347,-0.88814,-0.155923
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-3.831603e-16,-0.468635,0.710676,0.471723,-0.180027,0.151233,-0.558658,-0.175347,0.463536,1.830833
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.092866,-0.468635,-1.146322,-0.719692,-0.180027,0.086094,0.359119,0.747053,0.028661,-0.490159


## Shortlist Promising Models

### Select a Performance Measure

### Establish a performance baseline with a dummy classifier

### Train quick and dirty models and compare their performance

## Fine-Tune the System

### Use as much data as possible by merging all tables

### Feature Selection

### Data augmentation with SMOTE

### Fine-Tune the hyperparameters using cross-validation

### Try ensemble methods and select the final model