In [212]:
import pandas as pd
import numpy as np

from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# librerías para crear el modelo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import tree

# para calcular las métricas
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score 
from sklearn.metrics import cohen_kappa_score


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

In [250]:
df_yesno_scaled = pd.read_csv('data/df_yesno_scaled.csv')

In [251]:
df_yesno_scaled[df_yesno_scaled['Offer Accepted']== 'No'].shape

(0, 14)

# Imbalance
There is a great imbalance between the Yes and No in 'Offer Accepted' column. To deal with this we'll combine upsampling and downsampling techniques that generate synthetic data points of minority class samples in the neighborhood of minority class samples and then removing samples which do not agree “enough” with their neighboorhood.

In [252]:
df_yesno_scaled.columns

Index(['Offer Accepted', '# Credit Cards Held', 'Household Size',
       'Reward_Air Miles', 'Reward_Cash Back', 'Reward_Points',
       'Mailer Type_Letter', 'Mailer Type_Postcard', 'Income Level_High',
       'Income Level_Low', 'Income Level_Medium', 'Credit Rating_High',
       'Credit Rating_Low', 'Credit Rating_Medium'],
      dtype='object')

In [253]:
X = df_yesno_scaled[['# Credit Cards Held', 'Household Size',
       'Reward_Air Miles', 'Reward_Cash Back', 'Reward_Points',
       'Mailer Type_Letter', 'Mailer Type_Postcard', 'Income Level_High',
       'Income Level_Low', 'Income Level_Medium', 'Credit Rating_High',
       'Credit Rating_Low', 'Credit Rating_Medium']]
y = df_yesno_scaled['Offer Accepted']

In [254]:
smt = SMOTETomek()
X_resampled, y_resampled = smt.fit_resample(X, y)

In [255]:
y_resampled.value_counts()

1    16469
0    16469
Name: Offer Accepted, dtype: int64

# Model

In [256]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.2, 
                                                    random_state = 80)

In [220]:
y_train.describe()

count    26350.000000
mean         0.498330
std          0.500007
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: Offer Accepted, dtype: float64

In [221]:
y_test.describe()

count    6588.000000
mean        0.506679
std         0.499993
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Offer Accepted, dtype: float64

In [222]:
def metricas(clases_reales_test, clases_predichas_test, clases_reales_train, clases_predichas_train, modelo):
    
    # para el test
    accuracy_test = accuracy_score(clases_reales_test, clases_predichas_test)
    precision_test = precision_score(clases_reales_test, clases_predichas_test)
    recall_test = recall_score(clases_reales_test, clases_predichas_test)
    f1_test = f1_score(clases_reales_test, clases_predichas_test)
    kappa_test = cohen_kappa_score(clases_reales_test, clases_predichas_test)

    # para el train
    accuracy_train = accuracy_score(clases_reales_train, clases_predichas_train)
    precision_train = precision_score(clases_reales_train, clases_predichas_train)
    recall_train = recall_score(clases_reales_train, clases_predichas_train)
    f1_train = f1_score(clases_reales_train, clases_predichas_train)
    kappa_train = cohen_kappa_score(clases_reales_train, clases_predichas_train)
    

    
    df = pd.DataFrame({"accuracy": [accuracy_test, accuracy_train], 
                       "precision": [precision_test, precision_train],
                       "recall": [recall_test, recall_train], 
                       "f1": [f1_test, f1_train],
                       "kapppa": [kappa_test, kappa_train],
                       "set": ["test", "train"]})
    
    df["modelo"] = modelo
    return df

### Logistic Regression

In [223]:
lr = LogisticRegression(max_iter = 100000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=100000)

In [224]:
y_pred_lr_test = lr.predict(X_test)
y_pred_lr_train = lr.predict(X_train)

Let's check the accuracy of our model on the test set.

In [261]:
def matriz_confusion(clases_reales, clases_predichas, total_filas_test):
    mat_lr = confusion_matrix(clases_reales, clases_predichas)

    df = pd.DataFrame(mat_lr, columns = ["No", "Yes"], index = ["No", "Yes"])
    df = ((df / total_filas_test) * 100).round(2)
    return df

In [226]:
results_logistic_regression = metricas(y_test, y_pred_lr_test, y_train, y_pred_lr_train, "Logistic Regression" )
results_logistic_regression

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.724347,0.726084,0.732175,0.729117,0.448534,test,Logistic Regression
1,0.731044,0.725455,0.740538,0.732919,0.462119,train,Logistic Regression


In [227]:
matriz_logistica = matriz_confusion(y_test, y_pred_lr_test, y_test.shape[0])
matriz_logistica

Unnamed: 0,Yes,No
Yes,35.34,14.0
No,13.57,37.1


### Decision Tree

In [257]:
arbol = DecisionTreeClassifier()
arbol.fit(X_train, y_train)

DecisionTreeClassifier()

In [258]:
max_features = np.sqrt(len(X_train.columns))
max_features

3.605551275463989

In [259]:
y_pred_train_dt = arbol.predict(X_train)
y_pred_test_dt = arbol.predict(X_test)

In [260]:
results_decission_tree1 = metricas(y_test, y_pred_test_dt,y_train, y_pred_train_dt, "Decission Tree I" )
results_decission_tree1

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.854888,0.844618,0.874476,0.859288,0.709585,test,Decission Tree I
1,0.864554,0.844949,0.891859,0.867771,0.729155,train,Decission Tree I


In [262]:
matriz_logistica = matriz_confusion(y_test, y_pred_test_dt, y_test.shape[0])
matriz_logistica

Unnamed: 0,No,Yes
No,41.18,8.15
Yes,6.36,44.31


In [233]:
print(arbol.tree_.max_depth)

20


In [274]:
param = {"max_depth": [16, 17, 18, 19, 20, 21],
         "min_samples_split": [10, 50, 100],
         "max_features": [10, 11, 12, 13, 14, 15]}

arbol2 = GridSearchCV(
        estimator = DecisionTreeClassifier(), 
        param_grid = param, 
        verbose = 3, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [275]:
arbol2.fit(X_train, y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.140, test=-0.140) total time=   0.0s
[CV 2/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.138, test=-0.146) total time=   0.0s
[CV 3/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.138, test=-0.145) total time=   0.0s
[CV 4/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.137, test=-0.162) total time=   0.0s
[CV 5/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.138, test=-0.149) total time=   0.0s
[CV 6/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.137, test=-0.148) total time=   0.0s
[CV 7/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.137, test=-0.152) total time=   0.0s
[CV 8/10] END max_depth=16, max_features=10, min_samples_split=10;, score=(train=-0.137,

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mituc\anaconda3\envs\py39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mituc\anaconda3\envs\py39\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "c:\Users\mituc\anaconda3\envs\py39\lib\site-packages\sklearn\tree\_classes.py", line 308, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

 -0.14960152 -0.16907021 -0.19301708 -0.14872865 -0.168235

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [16, 17, 18, 19, 20, 21],
                         'max_features': [10, 11, 12, 13, 14, 15],
                         'min_samples_split': [10, 50, 100]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [276]:
arbol2.best_params_

{'max_depth': 19, 'max_features': 13, 'min_samples_split': 10}

In [277]:
arbol3 = DecisionTreeClassifier( max_depth = 19, max_features = 13, min_samples_split=10)
arbol3.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=19, max_features=13, min_samples_split=10)

In [278]:
y_pred_train_dt = arbol3.predict(X_train)
y_pred_test_dt = arbol3.predict(X_test)

In [279]:
results_decission_tree2 = metricas(y_test, y_pred_test_dt, y_train, y_pred_train_dt, "Decission Tree II" )
results_decission_tree2

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.854281,0.843642,0.874476,0.858782,0.708365,test,Decission Tree II
1,0.864023,0.844246,0.89163,0.867291,0.728093,train,Decission Tree II


In [280]:
matriz_logistica = matriz_confusion(y_test, y_pred_test_dt, y_test.shape[0])
matriz_logistica

Unnamed: 0,No,Yes
No,41.12,8.21
Yes,6.36,44.31


### Random Forest

In [241]:
random_forest = GridSearchCV(
        estimator = RandomForestClassifier(), 
        param_grid = param, 
        verbose = 3, 
        cv = 10, 
        return_train_score = True, 
        scoring = "neg_mean_squared_error")

In [242]:
random_forest.fit(X_train, y_train)

Fitting 10 folds for each of 72 candidates, totalling 720 fits
[CV 1/10] END max_depth=2, max_features=1, min_samples_split=10;, score=(train=-0.298, test=-0.291) total time=   0.2s
[CV 2/10] END max_depth=2, max_features=1, min_samples_split=10;, score=(train=-0.291, test=-0.293) total time=   0.2s
[CV 3/10] END max_depth=2, max_features=1, min_samples_split=10;, score=(train=-0.294, test=-0.288) total time=   0.2s
[CV 4/10] END max_depth=2, max_features=1, min_samples_split=10;, score=(train=-0.294, test=-0.303) total time=   0.2s
[CV 5/10] END max_depth=2, max_features=1, min_samples_split=10;, score=(train=-0.297, test=-0.288) total time=   0.2s
[CV 6/10] END max_depth=2, max_features=1, min_samples_split=10;, score=(train=-0.299, test=-0.315) total time=   0.2s
[CV 7/10] END max_depth=2, max_features=1, min_samples_split=10;, score=(train=-0.286, test=-0.299) total time=   0.1s
[CV 8/10] END max_depth=2, max_features=1, min_samples_split=10;, score=(train=-0.294, test=-0.309) tota

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 3, 4, 6, 8, 10],
                         'max_features': [1, 2, 3, 4],
                         'min_samples_split': [10, 50, 100]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [243]:
random_forest.best_params_

{'max_depth': 10, 'max_features': 1, 'min_samples_split': 10}

In [244]:
clf=RandomForestClassifier(min_samples_split= 10,
                           max_features=1,
                           max_depth=10)
clf.fit(X_train,y_train)

RandomForestClassifier(max_depth=10, max_features=1, min_samples_split=10)

In [245]:
y_pred_clf_test= clf.predict(X_test)
y_pred_clf_train= clf.predict(X_train)

In [246]:
results_random_forest = metricas(y_test, y_pred_clf_test, y_train, y_pred_clf_train, "Random Forest" )
results_random_forest

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.835914,0.792585,0.915818,0.849757,0.671077,test,Random Forest
1,0.842011,0.792728,0.924758,0.853668,0.684194,train,Random Forest


In [247]:
matriz_logistica = matriz_confusion(y_test, y_pred_clf_test, y_test.shape[0])
matriz_logistica

Unnamed: 0,Yes,No
Yes,37.19,12.14
No,4.27,46.4


In [281]:
df_all = pd.concat([results_logistic_regression, results_decission_tree1, results_decission_tree2, results_random_forest])

In [282]:
df_all

Unnamed: 0,accuracy,precision,recall,f1,kapppa,set,modelo
0,0.724347,0.726084,0.732175,0.729117,0.448534,test,Logistic Regression
1,0.731044,0.725455,0.740538,0.732919,0.462119,train,Logistic Regression
0,0.854888,0.844618,0.874476,0.859288,0.709585,test,Decission Tree I
1,0.864554,0.844949,0.891859,0.867771,0.729155,train,Decission Tree I
0,0.854281,0.843642,0.874476,0.858782,0.708365,test,Decission Tree II
1,0.864023,0.844246,0.89163,0.867291,0.728093,train,Decission Tree II
0,0.835914,0.792585,0.915818,0.849757,0.671077,test,Random Forest
1,0.842011,0.792728,0.924758,0.853668,0.684194,train,Random Forest
