In [76]:
import pandas as pd
import numpy as np

from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# librerías para crear el modelo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import tree

# para calcular las métricas
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

In [3]:
df_yesno_scaled = pd.read_csv('data/df_yesno_scaled.csv')

In [47]:
df_yesno_scaled[df_yesno_scaled['Offer Accepted']== 'No'].shape

(16469, 14)

# Imbalance
There is a great imbalance between the Yes and No in 'Offer Accepted' column. To deal with this we'll combine upsampling and downsampling techniques that generate synthetic data points of minority class samples in the neighborhood of minority class samples and then removing samples which do not agree “enough” with their neighboorhood.

In [4]:
df_yesno_scaled.columns

Index(['Offer Accepted', '# Credit Cards Held', 'Household Size',
       'Reward_Air Miles', 'Reward_Cash Back', 'Reward_Points',
       'Mailer Type_Letter', 'Mailer Type_Postcard', 'Income Level_High',
       'Income Level_Low', 'Income Level_Medium', 'Credit Rating_High',
       'Credit Rating_Low', 'Credit Rating_Medium'],
      dtype='object')

In [5]:
X = df_yesno_scaled[['# Credit Cards Held', 'Household Size',
       'Reward_Air Miles', 'Reward_Cash Back', 'Reward_Points',
       'Mailer Type_Letter', 'Mailer Type_Postcard', 'Income Level_High',
       'Income Level_Low', 'Income Level_Medium', 'Credit Rating_High',
       'Credit Rating_Low', 'Credit Rating_Medium']]
y = df_yesno_scaled['Offer Accepted']

In [54]:
smt = SMOTETomek()
X_resampled, y_resampled = smt.fit_resample(X, y)

In [55]:
y_resampled.value_counts()

Yes    16469
No     16469
Name: Offer Accepted, dtype: int64

# Model

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.2, 
                                                    random_state = 80)

In [61]:
y_train.describe()

count     26350
unique        2
top          No
freq      13219
Name: Offer Accepted, dtype: object

In [62]:
y_test.describe()

count     6588
unique       2
top        Yes
freq      3338
Name: Offer Accepted, dtype: object

In [80]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    
    # para el test
    accuracy_test = accuracy_score(clases_reales_test, clases_predichas_test)
    precision_test = precision_score(clases_reales_test, clases_predichas_test)
    recall_test = recall_score(clases_reales_test, clases_predichas_test)
    f1_test = f1_score(clases_reales_test, clases_predichas_test)
    kappa_test = cohen_kappa_score(clases_reales_test, clases_predichas_test)

    # para el train
    accuracy_train = accuracy_score(clases_reales_train, clases_predichas_train)
    precision_train = precision_score(clases_reales_train, clases_predichas_train)
    recall_train = recall_score(clases_reales_train, clases_predichas_train)
    f1_train = f1_score(clases_reales_train, clases_predichas_train)
    kappa_train = cohen_kappa_score(clases_reales_train, clases_predichas_train)
    

    
    df = pd.DataFrame({"accuracy": [accuracy_test, accuracy_train], 
                       "precision": [precision_test, precision_train],
                       "recall": [recall_test, recall_train], 
                       "f1": [f1_test, f1_train],
                       "kapppa": [kappa_test, kappa_train],
                       "set": ["test", "train"]})
    
    df["modelo"] = modelo
    return df

### Logistic Regression

In [63]:
lr = LogisticRegression(max_iter = 100000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=100000)

In [64]:
y_pred_lr_test = lr.predict(X_test)
y_pred_lr_train = lr.predict(X_train)

Let's check the accuracy of our model on the test set.

In [67]:
def matriz_confusion(clases_reales, clases_predichas, total_filas_test):
    mat_lr = confusion_matrix(clases_reales, clases_predichas)

    df = pd.DataFrame(mat_lr, columns = ["Yes", "No"], index = ["Yes", "No"])
    df = ((df / total_filas_test) * 100).round(2)
    return df

In [68]:
matriz_logistica = matriz_confusion(y_test, y_pred_lr_test, y_test.shape[0])
matriz_logistica

Unnamed: 0,Yes,No
Yes,35.53,13.8
No,13.52,37.14


In [69]:
accuracy = accuracy_score(y_test, y_pred_lr_test)
accuracy

0.726775956284153

In [70]:
precision = precision_score(y_test, y_pred_lr_test)
precision

ValueError: pos_label=1 is not a valid label. It should be one of ['No', 'Yes']

In [71]:
recall = recall_score(y_test, y_pred_lr_test)
recall

ValueError: pos_label=1 is not a valid label. It should be one of ['No', 'Yes']

In [72]:
f1 = f1_score(y_test, y_pred_lr_test)
f1

ValueError: pos_label=1 is not a valid label. It should be one of ['No', 'Yes']

In [73]:
kappa = cohen_kappa_score(y_test, y_pred_lr_test)
kappa

0.45341449087678243

### Decision Tree

In [83]:
arbol = DecisionTreeClassifier()
arbol.fit(X_train, y_train)

DecisionTreeClassifier()

In [77]:
max_features = np.sqrt(len(X_train.columns))
max_features

3.605551275463989

In [86]:
y_pred_train_dt = arbol.predict(X_train)
y_pred_test_dt = arbol.predict(X_test)

In [None]:
results_decission_tree1 = metricas(y_test, y_pred_test_dt,y_train, y_pred_train_dt, "Decission Tree I" )
results_decission_tree1

In [92]:
accuracy = accuracy_score(y_test, y_pred_test_dt)
accuracy

0.8539769277474195

In [96]:
precision = precision_score(y_test, y_pred_test_dt)
precision

ValueError: pos_label=1 is not a valid label. It should be one of ['No', 'Yes']

In [91]:
kappa = cohen_kappa_score(y_test, y_pred_test_dt)
kappa

0.7077761043232302

In [95]:
kappa = cohen_kappa_score(y_train, y_pred_train_dt)
kappa

0.7291496964887612

In [97]:
matriz_logistica = matriz_confusion(y_test, y_pred_test_dt, y_test.shape[0])
matriz_logistica

Unnamed: 0,Yes,No
Yes,41.23,8.11
No,6.5,44.17


In [94]:
print(arbol.tree_.max_depth)

21


In [99]:
param = {"max_depth": [2,3, 4,6,8,10],
         "min_samples_split": [10, 50, 100],
         "max_features": [1,2,3,4]}

arbol2 = GridSearchCV(
        estimator = DecisionTreeClassifier(), 
        param_grid = param, 
        verbose = 3, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [None]:
arbol2.fit(X_train, y_train)

In [101]:
arbol2.best_params_

{'max_depth': 2, 'max_features': 1, 'min_samples_split': 10}

In [102]:
arbol3 = DecisionTreeClassifier( max_depth = 2, max_features = 1, min_samples_split=10)
arbol3.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=2, max_features=1, min_samples_split=10)

In [103]:
y_pred_train_dt = arbol3.predict(X_train)
y_pred_test_dt = arbol3.predict(X_test)

In [104]:
accuracy = accuracy_score(y_test, y_pred_test_dt)
accuracy

0.660595021250759

In [105]:
kappa = cohen_kappa_score(y_test, y_pred_test_dt)
kappa

0.321520201891796

In [106]:
kappa = cohen_kappa_score(y_train, y_pred_train_dt)
kappa

0.3169324970371502

In [107]:
matriz_logistica = matriz_confusion(y_test, y_pred_test_dt, y_test.shape[0])
matriz_logistica

Unnamed: 0,Yes,No
Yes,33.61,15.73
No,18.21,32.45


### Random Forest

In [108]:
random_forest = GridSearchCV(
        estimator = RandomForestClassifier(), 
        param_grid = param, 
        verbose = 3, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [None]:
random_forest.fit(X_train, y_train)

In [110]:
random_forest.best_params_

{'max_depth': 2, 'max_features': 1, 'min_samples_split': 10}

In [111]:
clf=RandomForestClassifier(min_samples_split= 10,
                           max_features=1,
                           max_depth=2)
clf.fit(X_train,y_train)

RandomForestClassifier(max_depth=2, max_features=1, min_samples_split=10)

In [112]:
y_pred_clf_test= clf.predict(X_test)
y_pred_clf_train= clf.predict(X_train)

In [113]:
matriz_logistica = matriz_confusion(y_test, y_pred_clf_test, y_test.shape[0])
matriz_logistica

Unnamed: 0,Yes,No
Yes,33.08,16.26
No,13.25,37.42


In [114]:
accuracy = accuracy_score(y_test, y_pred_test_dt)
accuracy

0.660595021250759

In [115]:
kappa = cohen_kappa_score(y_test, y_pred_test_dt)
kappa

0.321520201891796

In [116]:
kappa = cohen_kappa_score(y_train, y_pred_train_dt)
kappa

0.3169324970371502