In [9]:
import pandas as pd
import numpy as np

from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# librerías para crear el modelo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import tree

# para calcular las métricas
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

In [10]:
df_yesno_scaled = pd.read_csv('data/df_yesno_scaled2.csv')

In [11]:
df_yesno_scaled[df_yesno_scaled['Offer Accepted']== 'No'].shape

(0, 18)

# Imbalance
There is a great imbalance between the Yes and No in 'Offer Accepted' column. To deal with this we'll combine upsampling and downsampling techniques that generate synthetic data points of minority class samples in the neighborhood of minority class samples and then removing samples which do not agree “enough” with their neighboorhood.

In [12]:
df_yesno_scaled.columns

Index(['Offer Accepted', 'Reward_Air Miles', 'Reward_Cash Back',
       'Reward_Points', 'Mailer Type_Letter', 'Mailer Type_Postcard',
       'Income Level_High', 'Income Level_Low', 'Income Level_Medium',
       'Overdraft Protection_No', 'Overdraft Protection_Yes',
       'Credit Rating_High', 'Credit Rating_Low', 'Credit Rating_Medium',
       'Own Your Home_No', 'Own Your Home_Yes', 'PC1', 'PC2'],
      dtype='object')

In [13]:
X = df_yesno_scaled[['Reward_Air Miles', 'Reward_Cash Back',
       'Reward_Points', 'Mailer Type_Letter', 'Mailer Type_Postcard',
       'Income Level_High', 'Income Level_Low', 'Income Level_Medium',
       'Overdraft Protection_No', 'Overdraft Protection_Yes',
       'Credit Rating_High', 'Credit Rating_Low', 'Credit Rating_Medium',
       'Own Your Home_No', 'Own Your Home_Yes', 'PC1', 'PC2']]
y = df_yesno_scaled['Offer Accepted']

In [14]:
smt = SMOTETomek()
X_resampled, y_resampled = smt.fit_resample(X, y)

In [15]:
y_resampled.value_counts()

1    15970
0    15970
Name: Offer Accepted, dtype: int64

# Model

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.2, 
                                                    random_state = 80)

In [17]:
y_train.describe()

count    25552.000000
mean         0.499100
std          0.500009
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: Offer Accepted, dtype: float64

In [18]:
y_test.describe()

count    6388.000000
mean        0.503601
std         0.500026
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Offer Accepted, dtype: float64

In [19]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    
    # para el test
    accuracy_test = accuracy_score(clases_reales_test, clases_predichas_test)
    precision_test = precision_score(clases_reales_test, clases_predichas_test)
    recall_test = recall_score(clases_reales_test, clases_predichas_test)
    f1_test = f1_score(clases_reales_test, clases_predichas_test)
    kappa_test = cohen_kappa_score(clases_reales_test, clases_predichas_test)

    # para el train
    accuracy_train = accuracy_score(clases_reales_train, clases_predichas_train)
    precision_train = precision_score(clases_reales_train, clases_predichas_train)
    recall_train = recall_score(clases_reales_train, clases_predichas_train)
    f1_train = f1_score(clases_reales_train, clases_predichas_train)
    kappa_train = cohen_kappa_score(clases_reales_train, clases_predichas_train)
    

    
    df = pd.DataFrame({"accuracy": [accuracy_test, accuracy_train], 
                       "precision": [precision_test, precision_train],
                       "recall": [recall_test, recall_train], 
                       "f1": [f1_test, f1_train],
                       "kapppa": [kappa_test, kappa_train],
                       "set": ["test", "train"]})
    
    df["modelo"] = modelo
    return df

### Logistic Regression

In [20]:
lr = LogisticRegression(max_iter = 100000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=100000)

In [21]:
y_pred_lr_test = lr.predict(X_test)
y_pred_lr_train = lr.predict(X_train)

Let's check the accuracy of our model on the test set.

In [22]:
def matriz_confusion(clases_reales, clases_predichas, total_filas_test):
    mat_lr = confusion_matrix(clases_reales, clases_predichas)

    df = pd.DataFrame(mat_lr, columns = ["No", "Yes"], index = ["No", "Yes"])
    df = ((df / total_filas_test) * 100).round(2)
    return df

In [23]:
results_logistic_regression = metricas(y_test, y_pred_lr_test, y_train, y_pred_lr_train, "Logistic Regression" )
results_logistic_regression

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.82154,0.831683,0.80945,0.820416,0.643131,test,Logistic Regression
1,0.831833,0.836785,0.823728,0.830205,0.663656,train,Logistic Regression


In [24]:
matriz_logistica = matriz_confusion(y_test, y_pred_lr_test, y_test.shape[0])
matriz_logistica

Unnamed: 0,No,Yes
No,41.39,8.25
Yes,9.6,40.76


### Decision Tree

In [25]:
arbol = DecisionTreeClassifier()
arbol.fit(X_train, y_train)

DecisionTreeClassifier()

In [26]:
max_features = np.sqrt(len(X_train.columns))
max_features

4.123105625617661

In [27]:
y_pred_train_dt = arbol.predict(X_train)
y_pred_test_dt = arbol.predict(X_test)

In [28]:
results_decission_tree1 = metricas(y_test, y_pred_test_dt,y_train, y_pred_train_dt, "Decission Tree I" )
results_decission_tree1

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.90263,0.894977,0.913895,0.904337,0.80522,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I


In [29]:
matriz_logistica = matriz_confusion(y_test, y_pred_test_dt, y_test.shape[0])
matriz_logistica

Unnamed: 0,No,Yes
No,44.24,5.4
Yes,4.34,46.02


In [30]:
print(arbol.tree_.max_depth)

31


In [31]:
param = {"max_depth": [15,16,17,18,19,20,21],
         "min_samples_split": [10, 50, 100],
         "max_features": [10, 11, 12, 13, 14, 15]}

arbol2 = GridSearchCV(
        estimator = DecisionTreeClassifier(), 
        param_grid = param, 
        verbose = 3, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [32]:
arbol2.fit(X_train, y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.077, test=-0.121) total time=   0.0s
[CV 2/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.074, test=-0.125) total time=   0.0s
[CV 3/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.073, test=-0.116) total time=   0.0s
[CV 4/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.082, test=-0.119) total time=   0.0s
[CV 5/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.076, test=-0.125) total time=   0.0s
[CV 6/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.074, test=-0.116) total time=   0.0s
[CV 7/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.075, test=-0.132) total time=   0.0s
[CV 8/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.071,

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [16, 17, 18, 19, 20, 21],
                         'max_features': [10, 11, 12, 13, 14, 15],
                         'min_samples_split': [10, 50, 100]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [33]:
arbol2.best_params_

{'max_depth': 21, 'max_features': 10, 'min_samples_split': 10}

In [116]:
arbol3 = DecisionTreeClassifier( max_depth = 10, max_features = 10, min_samples_split=10)
arbol3.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=10, max_features=10, min_samples_split=10)

In [117]:
y_pred_train_dt = arbol3.predict(X_train)
y_pred_test_dt = arbol3.predict(X_test)

In [118]:
results_decission_tree2 = metricas(y_test, y_pred_test_dt, y_train, y_pred_train_dt, "Decission Tree II" )
results_decission_tree2

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.827332,0.787697,0.899596,0.839936,0.654291,test,Decission Tree II
1,0.841265,0.798435,0.912256,0.851559,0.68261,train,Decission Tree II


In [37]:
matriz_logistica = matriz_confusion(y_test, y_pred_test_dt, y_test.shape[0])
matriz_logistica

Unnamed: 0,No,Yes
No,43.35,6.29
Yes,4.56,45.8


### Random Forest

In [38]:
random_forest = GridSearchCV(
        estimator = RandomForestClassifier(), 
        param_grid = param, 
        verbose = 3, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [39]:
random_forest.fit(X_train, y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.053, test=-0.088) total time=   3.2s
[CV 2/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.052, test=-0.095) total time=   3.2s
[CV 3/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.053, test=-0.085) total time=   3.0s
[CV 4/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.055, test=-0.088) total time=   2.9s
[CV 5/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.053, test=-0.090) total time=   3.2s
[CV 6/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.052, test=-0.092) total time=   3.5s
[CV 7/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.054, test=-0.091) total time=   3.6s
[CV 8/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.053,

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [16, 17, 18, 19, 20, 21],
                         'max_features': [10, 11, 12, 13, 14, 15],
                         'min_samples_split': [10, 50, 100]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [40]:
random_forest.best_params_

{'max_depth': 21, 'max_features': 10, 'min_samples_split': 10}

In [88]:
clf=RandomForestClassifier(min_samples_split= 10,
                           max_features=10,
                           max_depth=9)
clf.fit(X_train,y_train)

RandomForestClassifier(max_depth=9, max_features=10, min_samples_split=10)

In [89]:
y_pred_clf_test= clf.predict(X_test)
y_pred_clf_train= clf.predict(X_train)

In [90]:
results_random_forest = metricas(y_test, y_pred_clf_test, y_train, y_pred_clf_train, "Random Forest" )
results_random_forest

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.834377,0.805375,0.884986,0.843306,0.668499,test,Random Forest
1,0.852771,0.81795,0.906845,0.860107,0.705598,train,Random Forest


In [44]:
matriz_logistica = matriz_confusion(y_test, y_pred_clf_test, y_test.shape[0])
matriz_logistica

Unnamed: 0,No,Yes
No,40.01,9.63
Yes,3.9,46.46


In [45]:
df_all = pd.concat([results_logistic_regression, results_decission_tree1, results_decission_tree2, results_random_forest])

In [46]:
df_all

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.82154,0.831683,0.80945,0.820416,0.643131,test,Logistic Regression
1,0.831833,0.836785,0.823728,0.830205,0.663656,train,Logistic Regression
0,0.90263,0.894977,0.913895,0.904337,0.80522,test,Decission Tree I
1,1.0,1.0,1.0,1.0,1.0,train,Decission Tree I
0,0.891515,0.879207,0.909543,0.894118,0.782965,test,Decission Tree II
1,0.948145,0.934591,0.963538,0.948844,0.896295,train,Decission Tree II
0,0.864746,0.828356,0.922599,0.872941,0.729255,test,Random Forest
1,0.884275,0.844348,0.941739,0.890388,0.768598,train,Random Forest
