# Ejemplo CART, RandomForest, AdaBoost y GradientBoost: Car Insurance Prediction

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier

from mlxtend.classifier import EnsembleVoteClassifier


from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split, GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sn

from joblib import load, dump

seed = 11235813

In [5]:
df_train = pd.read_csv('caso 4.csv')

In [6]:
df_train.isna().sum()

Id                     0
Age                    0
Job                   19
Marital                0
Education            169
Default                0
Balance                0
HHInsurance            0
CarLoan                0
Communication        902
LastContactDay         0
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             3042
CallStart              0
CallEnd                0
CarInsurance           0
dtype: int64

In [2]:
df_train = pd.read_csv('carinsurance/carInsurance_train.csv')
df_test = pd.read_csv('carinsurance/carInsurance_test.csv')

df_train.sample(5)

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance
2563,2564,35,management,married,secondary,0,724,1,0,cellular,14,jul,2,415,1,failure,15:07:33,15:12:20,1
2439,2440,28,blue-collar,single,secondary,0,1112,1,0,,16,may,2,-1,0,,09:53:33,10:07:20,0
3179,3180,46,technician,married,secondary,0,1167,1,0,cellular,21,nov,2,-1,0,,16:41:32,17:07:12,1
3944,3945,45,entrepreneur,married,,0,3133,1,1,cellular,10,jul,1,-1,0,,16:39:50,16:53:14,1
2822,2823,31,technician,single,secondary,0,53,1,1,cellular,6,may,1,-1,0,,13:22:44,13:31:34,1


In [42]:
df_train.describe()

Unnamed: 0,Id,Age,Default,Balance,HHInsurance,CarLoan,LastContactDay,NoOfContacts,DaysPassed,PrevAttempts,CarInsurance
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,2000.5,41.21475,0.0145,1532.93725,0.49275,0.133,15.72125,2.60725,48.7065,0.7175,0.401
std,1154.844867,11.550194,0.119555,3511.452489,0.50001,0.339617,8.425307,3.064204,106.685385,2.078647,0.490162
min,1.0,18.0,0.0,-3058.0,0.0,0.0,1.0,1.0,-1.0,0.0,0.0
25%,1000.75,32.0,0.0,111.0,0.0,0.0,8.0,1.0,-1.0,0.0,0.0
50%,2000.5,39.0,0.0,551.5,0.0,0.0,16.0,2.0,-1.0,0.0,0.0
75%,3000.25,49.0,0.0,1619.0,1.0,0.0,22.0,3.0,-1.0,0.0,1.0
max,4000.0,95.0,1.0,98417.0,1.0,1.0,31.0,43.0,854.0,58.0,1.0


In [3]:
df_train.isna().sum()

Id                     0
Age                    0
Job                   19
Marital                0
Education            169
Default                0
Balance                0
HHInsurance            0
CarLoan                0
Communication        902
LastContactDay         0
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             3042
CallStart              0
CallEnd                0
CarInsurance           0
dtype: int64

----
## Diccionario de Datos
<img src= 'carinsurance/data_dict.png'>

**El objetivo será predecir si la persona que fue contactada comprará un seguro automotriz o no.**

----

# Análisis Descriptivo y Exploratorio

In [3]:
df_train.dtypes

Id                   int64
Age                  int64
Job                 object
Marital             object
Education           object
Default              int64
Balance              int64
HHInsurance          int64
CarLoan              int64
Communication       object
LastContactDay       int64
LastContactMonth    object
NoOfContacts         int64
DaysPassed           int64
PrevAttempts         int64
Outcome             object
CallStart           object
CallEnd             object
CarInsurance         int64
dtype: object

In [134]:
ProfileReport(df_train)



### Notas e ideas:
* Transformar `CallStart` y `CallEnd` a datetime y calcular la diferencia en minutos, usar este tiempo de duración de la llamada como atributo en lugar de estas dos columnas.


* La columna `DaysPassed` codifica como $-1$ cuando el cliente no fue contactado anteriormente y por lo tanto no ha pasado un cantidad de días hasta el moemnto del contacto, puesto que dicha columna no usa $0$ dentro de la codificación voy a recodificar esos valores como $0$ y la información de si ha sido contactado antes o no la representaré en otra columna nueva.


* El mes y el dia en el que fue el último contacto (`LastContactMonth` y `LastContactDay`) no me parecen relevantes, creo que es más informativa la cantidad de dias desde el último contacto por lo que las eliminaré.

-----
# Feature Engineering

### Data-type changes 
> * `Default` --> `category`
> * `HHInsurance` --> `category`
> * `Carloan` --> `category`
> * `CallStart` --> `DateTime`
> * `CallEnd` --> `DateTime`
> * `CarInsurance` --> `category`
> * `Job` --> `category`
> * `Marital` --> `category`
> * `Education` --> `category`
> * `Communication` --> `category`
> * `Outcome` --> `category`

In [3]:
## Data-type changes for Train sample

df_train.Default = pd.Categorical(df_train.Default.replace({0: 'no', 1:'yes'}))
df_train.HHInsurance = pd.Categorical(df_train.HHInsurance.replace({0: 'no', 1:'yes'}))
df_train.CarLoan = pd.Categorical(df_train.CarLoan.replace({0: 'no', 1:'yes'}))
df_train.CarInsurance = pd.Categorical(df_train.CarInsurance.replace({0: 'no', 1:'yes'}))
df_train.Default = pd.Categorical(df_train.Default.replace({0: 'no', 1:'yes'}))
df_train.Job = pd.Categorical(df_train.Job)
df_train.Marital = pd.Categorical(df_train.Marital)
df_train.Education = pd.Categorical(df_train.Education)
df_train.Communication = pd.Categorical(df_train.Communication)
df_train.Outcome = pd.Categorical(df_train.Outcome)

In [4]:
## Data-type changes for Test sample

df_test.Default = pd.Categorical(df_test.Default.replace({0: 'no', 1:'yes'}))
df_test.HHInsurance = pd.Categorical(df_test.HHInsurance.replace({0: 'no', 1:'yes'}))
df_test.CarLoan = pd.Categorical(df_test.CarLoan.replace({0: 'no', 1:'yes'}))
df_test.CarInsurance = pd.Categorical(df_test.CarInsurance.replace({0: 'no', 1:'yes'}))
df_test.Default = pd.Categorical(df_test.Default.replace({0: 'no', 1:'yes'}))
df_test.Job = pd.Categorical(df_test.Job)
df_test.Marital = pd.Categorical(df_test.Marital)
df_test.Education = pd.Categorical(df_test.Education)
df_test.Communication = pd.Categorical(df_test.Communication)
df_test.Outcome = pd.Categorical(df_test.Outcome)

In [5]:
## Call duration for train

tmp_timediff = pd.to_datetime(df_train.CallEnd)-pd.to_datetime(df_train.CallStart)
df_train['CallDuration'] = (tmp_timediff / pd.Timedelta(minutes = 1)).round(0).apply(int)


In [6]:
## Call duration for test

tmp_timediff = pd.to_datetime(df_test.CallEnd)-pd.to_datetime(df_test.CallStart)
df_test['CallDuration'] = (tmp_timediff / pd.Timedelta(minutes = 1)).round(0).apply(int)


In [7]:
## DaysPassed refactoring for Train

df_train['PrevContacted'] = pd.Categorical(np.where(df_train.DaysPassed == -1, 'no', 'yes'))

df_train.DaysPassed = np.where(df_train.DaysPassed == -1, 0, df_train.DaysPassed)

In [8]:
## DaysPassed refactoring for Test

df_test['PrevContacted'] = pd.Categorical(np.where(df_test.DaysPassed == -1, 'no', 'yes'))

df_test.DaysPassed = np.where(df_test.DaysPassed == -1, 0, df_test.DaysPassed)

In [9]:
## Drop unused columns for Train

df_train.drop(['Id', 'LastContactMonth', 'LastContactDay', 'CallEnd', 'CallStart'], axis = 1, inplace = True)

In [10]:
## Drop unused columns for Test

df_test.drop(['Id', 'LastContactMonth', 'LastContactDay', 'CallEnd', 'CallStart'], axis = 1, inplace = True)

In [32]:
df_train.dtypes

Age                 int64
Job              category
Marital          category
Education        category
Default          category
Balance             int64
HHInsurance      category
CarLoan          category
Communication    category
NoOfContacts        int64
DaysPassed          int64
PrevAttempts        int64
Outcome          category
CarInsurance     category
CallDuration        int64
PrevContacted    category
dtype: object

In [33]:
df_train.dropna().shape

(907, 16)

In [34]:
df_train.shape

(4000, 16)

Se pierden demasiados datos al eliminar todos los `nan`, por lo que al momento de binarizar voy a crear una nueva columna para estos.

## Codificación de varibales categoricas

In [11]:
X = pd.get_dummies(df_train.drop('CarInsurance', axis = 1), dummy_na = True, drop_first = True)
Y = df_train.CarInsurance

In [12]:
X_test = pd.get_dummies(df_test.drop('CarInsurance', axis = 1), dummy_na = True, drop_first = True)

In [13]:
X.shape

(4000, 36)

In [14]:
X_test.shape

(1000, 36)

----
# Modelamiento

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = .3, random_state = 11235813)

# Decision tree

In [43]:
%%time
tree_params = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': np.linspace(.1, 1.0, 10).round(2),
    'min_samples_leaf': np.linspace(.1,.5, 10).round(2),
    'max_features': [.1, .3, .6, .8, 1.0]
}


grid = GridSearchCV(DecisionTreeClassifier(random_state = 11235813),
                    param_grid = tree_params,
                    cv=10,
                    n_jobs = -1).fit(x_train, y_train)        

CPU times: user 14.8 s, sys: 1.16 s, total: 16 s
Wall time: 56.1 s


In [44]:
grid.best_score_

0.7571428571428571

In [45]:
tree_preds = grid.best_estimator_.predict(x_test)
print(classification_report(y_test, tree_preds))

              precision    recall  f1-score   support

          no       0.73      0.79      0.76       702
         yes       0.67      0.59      0.63       498

    accuracy                           0.71      1200
   macro avg       0.70      0.69      0.69      1200
weighted avg       0.71      0.71      0.71      1200



In [47]:
#dump(grid.best_estimator_, 'tree_clf.joblib')

['tree_clf.joblib']

------

# Random Forest Regressor

## Ajustando hiperparámetros:

> * `max_features`: ['None', 'log', 'sqrt']
> * `n_estimators`: Rango de $100$ a $1000$ en pasos de $100$

Reportaremos la tasa de OOB media



In [49]:
%%time
parametros = {
    'max_features': [None, 'log2', 'sqrt'],
    'n_estimators': range(100,1000, 100)
}

grid_rf = GridSearchCV(RandomForestClassifier(oob_score = True, random_state = seed),
                    param_grid = parametros,
                    n_jobs = -1,).fit(x_train, y_train)



CPU times: user 2.8 s, sys: 245 ms, total: 3.05 s
Wall time: 1min 29s


In [50]:
print(classification_report(y_test, grid_rf.best_estimator_.predict(x_test)))

              precision    recall  f1-score   support

          no       0.84      0.84      0.84       702
         yes       0.78      0.78      0.78       498

    accuracy                           0.82      1200
   macro avg       0.81      0.81      0.81      1200
weighted avg       0.82      0.82      0.82      1200



In [51]:
#dump(grid_rf.best_estimator_, 'rf_clf.joblib')

['rf_clf.joblib']

-----
## Gradient Boost Classifier

In [52]:
gradient_params = {
    'learning_rate':[0.01, 0.1, 0.5],
    'n_estimators': [50, 100, 500, 1000, 2000],
    'subsample': [0.1, 0.5, 0.9]
}

gradient_grid = GridSearchCV(GradientBoostingClassifier(),
                             cv = 5,
                            param_grid=gradient_params,
                            n_jobs = -1).fit(x_train, y_train)

In [53]:
print(classification_report(y_test, gradient_grid.best_estimator_.predict(x_test)))

              precision    recall  f1-score   support

          no       0.85      0.84      0.85       702
         yes       0.78      0.80      0.79       498

    accuracy                           0.82      1200
   macro avg       0.82      0.82      0.82      1200
weighted avg       0.82      0.82      0.82      1200



In [54]:
#dump(gradient_grid.best_estimator_, 'gradientBoosting_clf.joblib')

['gradientBoosting_clf.joblib']

-----
## Adaboost

In [55]:
adaboost_params = {
    'learning_rate':[0.01, 0.1, 0.5],
    'n_estimators': [50, 100, 500, 1000, 2000],
}

adaboost_grid = GridSearchCV(AdaBoostClassifier(),
                             cv = 5,
                            param_grid=adaboost_params,
                            n_jobs = -1).fit(x_train, y_train)

In [56]:
print(classification_report(y_test, adaboost_grid.best_estimator_.predict(x_test)))

              precision    recall  f1-score   support

          no       0.81      0.87      0.84       702
         yes       0.80      0.70      0.75       498

    accuracy                           0.80      1200
   macro avg       0.80      0.79      0.79      1200
weighted avg       0.80      0.80      0.80      1200



In [57]:
#dump(adaboost_grid.best_estimator_, 'adaboost_clf.joblib')

['adaboost_clf.joblib']

----
Podemos obtener la probabilidad de asignación de cada clase a un registro usando el método `predict_proba` de un estimador ya entrenado:

In [56]:
adaboost_grid.best_estimator_.predict_proba(X_test)

array([[0.60051267, 0.39948733],
       [0.5827035 , 0.4172965 ],
       [0.56017683, 0.43982317],
       ...,
       [0.56507238, 0.43492762],
       [0.53699355, 0.46300645],
       [0.44107079, 0.55892921]])

In [58]:
tmp_pr = adaboost_grid.best_estimator_.predict_proba(X_test)

predicted_proba = X_test.filter(regex = 'Job_*', axis = 1)

In [60]:
predicted_proba['pr_0'] = [i[0] for i in tmp_pr]
predicted_proba['pr_1'] = [i[1] for i in tmp_pr]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [68]:
# generamos tres listas vacios
p0, p1, job = [], [], []

# para cada estado ingresado en el nuevo conjunto de datos
for colname, serie in predicted_proba.loc[:, :'Job_nan'].iteritems():
    # seleccionamos los registros existentes
    tmp_df = predicted_proba[serie == 1]
    # calculamos la media de no comprar y agregamos a la lista vacía
    p0.append(tmp_df['pr_0'].mean())
    
    # calculamos la media de comprar y agregamos a la lista vacía
    p1.append(tmp_df['pr_1'].mean())
    
    # agregamos el nombre del estado
    job.append(colname)

# convertimos a dataframe
store_pr = pd.DataFrame({'job': job, 'Prob. Not buying':p0, 'Prob. Buying': p1}).set_index('job')

In [69]:
store_pr

Unnamed: 0_level_0,Prob. Not buying,Prob. Buying
job,Unnamed: 1_level_1,Unnamed: 2_level_1
Job_blue-collar,0.525554,0.474446
Job_entrepreneur,0.517601,0.482399
Job_housemaid,0.525102,0.474898
Job_management,0.51606,0.48394
Job_retired,0.49105,0.50895
Job_self-employed,0.518734,0.481266
Job_services,0.523736,0.476264
Job_student,0.495907,0.504093
Job_technician,0.519104,0.480896
Job_unemployed,0.507951,0.492049


In [66]:
df_train.CarInsurance.value_counts()

no     2396
yes    1604
Name: CarInsurance, dtype: int64

-----
# SVM

In [18]:
%%time
svm_params = {
    'C': [0.01, 0.1, 1.0, 1e1],
    'kernel': ['rbf', 'linear']
}

grid_svm = GridSearchCV(SVC(gamma = 'auto', random_state = seed),
                        param_grid=svm_params,
                        cv = 5,
                        n_jobs = -1).fit(x_train, y_train)

KeyboardInterrupt: 

In [41]:
svm = SVC(gamma = 'auto', random_state = seed).fit(x_train, y_train)

In [43]:
print(classification_report(y_test, svm.predict(x_test)))

              precision    recall  f1-score   support

          no       0.60      0.91      0.73       702
         yes       0.55      0.15      0.23       498

    accuracy                           0.60      1200
   macro avg       0.58      0.53      0.48      1200
weighted avg       0.58      0.60      0.52      1200



In [None]:
dump(svm_grid.best_estimator_, 'svm_clf.joblib')

-------
# Logistic Regression

In [27]:
logit_params = {
    'C':[1e-2, 1e-1, 1e0, 1e1, 1e2],
    'l1_ratio': [0, 1e-2, 1e-1, 1e0]   
}

logit_grid = GridSearchCV(LogisticRegression(random_state = seed, penalty = 'elasticnet', solver = 'saga'), 
                           cv = 5, 
                           param_grid = logit_params,
                           n_jobs = -1).fit(x_train, y_train)



In [28]:
print(classification_report(y_test, logit_grid.best_estimator_.predict(x_test)))

              precision    recall  f1-score   support

          no       0.59      0.91      0.72       702
         yes       0.46      0.11      0.18       498

    accuracy                           0.58      1200
   macro avg       0.53      0.51      0.45      1200
weighted avg       0.54      0.58      0.49      1200



In [29]:
dump(logit_grid.best_estimator_, 'logit_clf.joblib')

['logit_clf.joblib']

------
# Quadratic Discriminant Analysis

In [31]:
qda_params = {
    
}

qda_grid = GridSearchCV(QuadraticDiscriminantAnalysis(),
                       cv = 5,
                       n_jobs = -1,
                       ).fit(x_train, y_train)

TypeError: __init__() missing 1 required positional argument: 'param_grid'

In [None]:
print(classification_report(y_test, qda_grid.best_estimator_.predict(x_test)))


In [None]:
dump(qda_grid.best_estimator_, 'qda_clf.joblib')

------

# Voting Classifier

In [None]:
def plot_importance(estimator):
    tmp_df = pd.DataFrame([estimator.get_features(), estimator.feature_importance_]).T
    tmp_df.columns = ['Feature', 'Importance']
    sn.bar()

In [32]:
tree_clf = load('tree_clf.joblib')
rf_clf = load('rf_clf.joblib')
gradientboost_clf = load('gradientBoosting_clf.joblib')
adaboost_clf = load('adaboost_clf.joblib')
#logistic_clf = load('logit_clf.joblib')

In [56]:
models = [tree_clf, rf_clf, gradientboost_clf, adaboost_clf, svm]
voting_clf = EnsembleVoteClassifier(models, weights=[.1, .1, .35, .35, .1], voting = 'hard', refit = False)
voting_clf.fit(x_train, y_train);

In [57]:
voting_preds = voting_clf.predict(x_test)

In [58]:
print(classification_report(y_test, voting_preds))

              precision    recall  f1-score   support

          no       0.81      0.87      0.84       702
         yes       0.79      0.71      0.75       498

    accuracy                           0.80      1200
   macro avg       0.80      0.79      0.79      1200
weighted avg       0.80      0.80      0.80      1200

