# Laboratorio 04 - Clasificación con Clases Desbalanceadas
Coding Bootcamps ESPOL, Machine Learning and Predictions, Cohorte II

Instructor: Galo Castillo López


En este laboratorio exploraremos la implementación de modelos de ML en escenarios de clases desbalanceadas usando `scikit-learn` e `imblearn`, para predecir la ocurrencia de churn (abandono de clientes) en el sector bancario. La variable objetivo es binaria e indica si el cliente cerró su cuenta luego de cierto periodo.

La consigna en este laboratorio es contestar todos los **TODOs** a lo largo del notebook.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Apply seaborn's theme globally so every later plot shares the same look.
sns.set_theme(color_codes=True)

In [None]:
# Load the bank churn dataset from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer='bank_churn.csv')
df.head(n=5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [None]:
# Relative frequency of each target class; shows how imbalanced churn is.
df['churn'].value_counts(normalize=True)

churn
0    0.7963
1    0.2037
Name: proportion, dtype: float64

In [None]:
# Absolute number of customers per gender category.
df['gender'].value_counts()

gender
Male      5457
Female    4543
Name: count, dtype: int64

In [None]:
# Absolute number of customers per country category.
df['country'].value_counts()

country
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [None]:
# One-hot encode the categorical `country` column; every other column is
# carried over unchanged.
df_encoded = pd.get_dummies(data=df, columns=['country'])
df_encoded.head(n=5)

Unnamed: 0,customer_id,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_France,country_Germany,country_Spain
0,15634602,619,Female,42,2,0.0,1,1,1,101348.88,1,True,False,False
1,15647311,608,Female,41,1,83807.86,1,0,1,112542.58,0,False,False,True
2,15619304,502,Female,42,8,159660.8,3,1,0,113931.57,1,True,False,False
3,15701354,699,Female,39,1,0.0,2,0,0,93826.63,0,True,False,False
4,15737888,850,Female,43,2,125510.82,1,1,1,79084.1,0,False,False,True


In [None]:
# Turn the one-hot country indicators into 0/1 integers with a single cast,
# instead of replacing True/False by strings and parsing them back to numbers.
country_columns = ['country_France', 'country_Germany', 'country_Spain']
df_encoded[country_columns] = df_encoded[country_columns].astype('int64')

# Encode gender as a binary integer (Female -> 0, Male -> 1).
# Any label outside the mapping becomes NaN, which makes the int64 cast fail
# loudly rather than letting an unexpected category through. This cell
# expects the raw 'Female'/'Male' labels, so re-run the get_dummies cell
# before executing it a second time.
df_encoded['gender'] = (
    df_encoded['gender'].map({'Female': 0, 'Male': 1}).astype('int64')
)

In [None]:
# Preview the frame after encoding to check the new 0/1 columns.
df_encoded.head()

Unnamed: 0,customer_id,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_France,country_Germany,country_Spain
0,15634602,619,0,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,15647311,608,0,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,15619304,502,0,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,15701354,699,0,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,15737888,850,0,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [None]:
# Confirm that every column is numeric before modelling.
df_encoded.dtypes

customer_id           int64
credit_score          int64
gender                int64
age                   int64
tenure                int64
balance             float64
products_number       int64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
country_France        int64
country_Germany       int64
country_Spain         int64
dtype: object

In [None]:
# Remove the customer identifier from the modelling frame; `drop` returns a
# new frame, so df_encoded itself is left untouched.
df_processed = df_encoded.drop(columns='customer_id')

In [None]:
# Preview the final modelling frame (customer_id removed).
df_processed.head()

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_France,country_Germany,country_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report

# Separate the target from the predictors.
y = df_processed['churn']
X = df_processed.drop(columns='churn')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified; with imbalanced classes consider
# passing stratify=y so both partitions keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training partition only, then apply
# the same transformation to both partitions so no test statistics leak in.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn import svm

# svm1: baseline SVC. svm2: same model with class_weight='balanced' to
# compensate for the minority (churn) class.
svm1 = svm.SVC(random_state=0).fit(X_train_scaled, y_train)
svm2 = svm.SVC(class_weight='balanced', random_state=0).fit(X_train_scaled, y_train)

# Predict on the held-out, scaled test partition.
y_pred_svm1, y_pred_svm2 = (
    svm1.predict(X_test_scaled),
    svm2.predict(X_test_scaled),
)

In [None]:
# Baseline SVC report first, class-weighted SVC report second.
print(
    classification_report(y_test, y_pred_svm1),
    classification_report(y_test, y_pred_svm2),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.85      0.99      0.91      1595
           1       0.88      0.29      0.44       405

    accuracy                           0.85      2000
   macro avg       0.86      0.64      0.68      2000
weighted avg       0.85      0.85      0.82      2000

              precision    recall  f1-score   support

           0       0.93      0.76      0.84      1595
           1       0.45      0.76      0.57       405

    accuracy                           0.76      2000
   macro avg       0.69      0.76      0.70      2000
weighted avg       0.83      0.76      0.78      2000



### **######### Todo #########**
Interprete los resultados en el reporte de clasificación.

* ¿Cómo difieren las métricas de clasificación para ambos modelos?
* ¿Existe alguna relación entre ciertas métricas al compararlas en ambos modelos?

### **######### Todo #########**
Implemente de forma similar dos modelos basados en Random Forest y utilice en uno de ellos el hiperparámetro `class_weight`. Imprima el resultado de `classification_report` e interprete nuevamente los resultados.


In [None]:
from sklearn.ensemble import RandomForestClassifier

# The report below needs y_pred_rf1 / y_pred_rf2, which were never defined.
# rf1: baseline forest. rf2: same forest with class_weight='balanced', the
# same setting used for the class-weighted SVC.
rf1 = RandomForestClassifier(random_state=0)
rf2 = RandomForestClassifier(random_state=0, class_weight='balanced')

rf1.fit(X_train_scaled, y_train)
rf2.fit(X_train_scaled, y_train)

y_pred_rf1 = rf1.predict(X_test_scaled)
y_pred_rf2 = rf2.predict(X_test_scaled)

# Baseline report first, class-weighted report second.
print(classification_report(y_test, y_pred_rf1))
print(classification_report(y_test, y_pred_rf2))

In [None]:
# Install XGBoost into the notebook environment (provides XGBClassifier).
!pip install xgboost



In [None]:
from xgboost import XGBClassifier

# xgb1: baseline gradient-boosted classifier.
xgb1 = XGBClassifier(random_state=0)

# Weight of the positive (churn) class = negatives / positives.
# The ratio is computed from the training labels only, so the class balance
# of the held-out test rows does not influence a model hyperparameter.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
SCALE_POS_WEIGHT = n_negative / n_positive

# xgb2: same model, with errors on the positive class up-weighted.
xgb2 = XGBClassifier(random_state=0, scale_pos_weight=SCALE_POS_WEIGHT)

xgb1.fit(X_train_scaled, y_train)
xgb2.fit(X_train_scaled, y_train)

y_pred_xgb1 = xgb1.predict(X_test_scaled)
y_pred_xgb2 = xgb2.predict(X_test_scaled)

In [None]:
# Baseline XGBoost report first, scale_pos_weight variant second.
print(
    classification_report(y_test, y_pred_xgb1),
    classification_report(y_test, y_pred_xgb2),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1595
           1       0.70      0.51      0.59       405

    accuracy                           0.86      2000
   macro avg       0.79      0.73      0.75      2000
weighted avg       0.85      0.86      0.85      2000

              precision    recall  f1-score   support

           0       0.91      0.85      0.88      1595
           1       0.52      0.65      0.58       405

    accuracy                           0.81      2000
   macro avg       0.71      0.75      0.73      2000
weighted avg       0.83      0.81      0.82      2000



### **######### Todo #########**
Interprete los resultados en el reporte de clasificación.

* ¿Los resultados son lo que esperaba? ¿Por qué?

In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [None]:

from imblearn.over_sampling import RandomOverSampler

# Build the over-sampler and display its configuration attributes.
ros = RandomOverSampler(random_state=0)
ros.__dict__

{'sampling_strategy': 'auto', 'random_state': 0, 'shrinkage': None}

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly over-sample the scaled training partition only; the test
# partition is never passed to the sampler.
ros = RandomOverSampler(random_state=0)
(X_train_resampled1,
 y_train_resampled1) = ros.fit_resample(X_train_scaled, y_train)

In [None]:
# Class counts in the original training labels (before over-sampling).
y_train.value_counts()

churn
0    6368
1    1632
Name: count, dtype: int64

In [None]:
# Class counts after random over-sampling.
y_train_resampled1.value_counts()

churn
0    6368
1    6368
Name: count, dtype: int64

In [None]:
# xgb3: default XGBoost trained on the randomly over-sampled training set,
# evaluated on the untouched (scaled) test partition.
xgb3 = XGBClassifier(random_state=0).fit(X_train_resampled1, y_train_resampled1)
y_pred_xgb3 = xgb3.predict(X_test_scaled)

In [None]:
# Reports in order: baseline, scale_pos_weight, random over-sampling.
print(
    classification_report(y_test, y_pred_xgb1),
    classification_report(y_test, y_pred_xgb2),
    classification_report(y_test, y_pred_xgb3),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1595
           1       0.70      0.51      0.59       405

    accuracy                           0.86      2000
   macro avg       0.79      0.73      0.75      2000
weighted avg       0.85      0.86      0.85      2000

              precision    recall  f1-score   support

           0       0.91      0.85      0.88      1595
           1       0.52      0.65      0.58       405

    accuracy                           0.81      2000
   macro avg       0.71      0.75      0.73      2000
weighted avg       0.83      0.81      0.82      2000

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1595
           1       0.56      0.65      0.60       405

    accuracy                           0.82      2000
   macro avg       0.73      0.76      0.74      2000
weighted avg       0.84      0.82      0.83      2000



### **######### Todo #########**
Interprete y compare los resultados de los tres reportes de clasificación (especialmente compare los resultados del modelo xgb3 con los otros modelos).


In [None]:
from imblearn.under_sampling import TomekLinks

# Under-sample the scaled training partition with Tomek links; the test
# partition is never passed to the sampler.
tl = TomekLinks()
(X_train_resampled2,
 y_train_resampled2) = tl.fit_resample(X_train_scaled, y_train)

In [None]:
# Class counts in the original training labels (before under-sampling).
y_train.value_counts()

churn
0    6368
1    1632
Name: count, dtype: int64

In [None]:
# Class counts after Tomek-links under-sampling.
y_train_resampled2.value_counts()

churn
0    5964
1    1632
Name: count, dtype: int64

In [None]:
# xgb4: default XGBoost trained on the Tomek-links under-sampled training
# set, evaluated on the untouched (scaled) test partition.
xgb4 = XGBClassifier(random_state=0).fit(X_train_resampled2, y_train_resampled2)
y_pred_xgb4 = xgb4.predict(X_test_scaled)

In [None]:
# Over-sampling (xgb3) report first, Tomek-links (xgb4) report second.
print(
    classification_report(y_test, y_pred_xgb3),
    classification_report(y_test, y_pred_xgb4),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1595
           1       0.56      0.65      0.60       405

    accuracy                           0.82      2000
   macro avg       0.73      0.76      0.74      2000
weighted avg       0.84      0.82      0.83      2000

              precision    recall  f1-score   support

           0       0.89      0.92      0.90      1595
           1       0.63      0.56      0.59       405

    accuracy                           0.84      2000
   macro avg       0.76      0.74      0.75      2000
weighted avg       0.84      0.84      0.84      2000



### **######### Todo #########**
Interprete los resultados obtenidos.

### **######### Todo (OPCIONAL) #########**
Utilice la clase `SMOTE` de `imblearn` (https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html) para entrenar un nuevo modelo basado en XGBoost. Imprima el reporte de clasificación e interprete los resultados. Compare los resultados con los modelos anteriormente entrenados.