<a href="https://colab.research.google.com/github/PabloJRW/titanic-classifier/blob/main/notebooks/Model_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

import matplotlib.pyplot as plt

In [21]:
# Load the raw training set and separate the target column from the features.
df_train = pd.read_csv('../datasets/raw/train.csv')

y = df_train['Survived']
X = df_train.drop(columns='Survived')

In [30]:
# Class balance: number of positive (Survived == 1) rows per negative
# (Survived == 0) row in the training data.
y_positive = (df_train['Survived'] == 1).sum()
y_negative = (df_train['Survived'] == 0).sum()

(y_positive / y_negative).round(2)

0.62

In [22]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Shapes of the full frame and of the two feature partitions, one per line.
print(df_train.shape, X_train.shape, X_val.shape, sep='\n')


(891, 12)
(668, 11)
(223, 11)


In [23]:
# Load the preprocessing pipeline that was serialized with joblib.
# The path is relative, so it is resolved against the kernel's working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
preprocessing_pipeline = joblib.load('preprocessing_pipeline.joblib')

In [24]:
# Fit the loaded preprocessing pipeline on the training split and transform
# that same split in one step.
X_train_pro = preprocessing_pipeline.fit_transform(X_train)
# Bare expression: shows the transformed array as the cell output.
X_train_pro

array([[ 0.51413858,  1.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.87644803,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.20866423,  0.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.23475286,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.88585626,  1.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.44206958,  1.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [25]:
# Search for the best-scoring model
# ===================================================================================
# Each candidate is scored on 10-fold cross-validated predictions over the
# training split, so no row is predicted by a model that was fitted on it.

models = {
    'svc': SVC(),
    'logistic': LogisticRegression(),
    # Seeded so the comparison is reproducible after a kernel restart.
    'random_forest': RandomForestClassifier(random_state=42),
    'knn': KNeighborsClassifier(),
}

n_models = len(models)

# One record per model. The metrics are kept as numbers: packing them into a
# single np.array together with the model name would coerce everything to
# strings, and sorting by F1 would then be lexicographic instead of numeric.
scores = []
for name, model in models.items():
    y_train_pred = cross_val_predict(model, X_train_pro, y_train, cv=10)
    scores.append({
        'Model': name,
        'Precision': round(precision_score(y_train, y_train_pred), 2),
        'Recall': round(recall_score(y_train, y_train_pred), 2),
        'F1': round(f1_score(y_train, y_train_pred), 2),
    })

scores_df = pd.DataFrame(
    scores, columns=['Model', 'Precision', 'Recall', 'F1']
).set_index('Model')

# Best F1 first.
scores_df.sort_values(by='F1', ascending=False)

Unnamed: 0_level_0,Precision,Recall,F1
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
logistic,0.74,0.68,0.71
random_forest,0.73,0.67,0.7
svc,0.83,0.59,0.69
knn,0.73,0.64,0.68


In [None]:
# Score report for the Random Forest model
# ==============================================================================
rforest_clf = RandomForestClassifier(random_state=42)
# Out-of-fold class predictions on the preprocessed training features
# (`X_train_pro`, the matrix produced by the preprocessing pipeline).
# cv=5 is scikit-learn's default, spelled out here for clarity.
y_pred = cross_val_predict(rforest_clf, X_train_pro, y_train, cv=5)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-fold probability of the positive class (Survived == 1) for every
# training row. `method='predict_proba'` is required here: the default
# ('predict') returns hard 0/1 labels, which collapse the ROC curve to a single
# operating point. Column 1 is the positive class because the probability
# columns follow the sorted class labels (0, 1).
y_scores = cross_val_predict(
    rforest_clf, X_train_pro, y_train, cv=5, n_jobs=-1, method='predict_proba'
)[:, 1]

# Labels obtained by cutting the probabilities at the chosen decision threshold.
threshold = 0.5
y_with_thres = (y_scores > threshold)

# False/true positive rates over every candidate threshold.
fpr, tpr, thresholds = roc_curve(y_train, y_scores)

# ROC curve plotting helper
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current matplotlib axes.

    Parameters
    ----------
    fpr : array-like
        False positive rates (x values).
    tpr : array-like
        True positive rates (y values), same length as `fpr`.
    label : str, optional
        Legend entry for the curve. When given, a legend is drawn so the
        label is actually visible; when omitted, no legend is added.

    The figure is not shown here; call `plt.show()` (or end the cell) after
    plotting.
    """
    plt.plot(fpr, tpr, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal used as a reference line
    plt.axis([0, 1, 0, 1])
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if label is not None:
        plt.legend(loc='lower right')
    
# Plot the cross-validated ROC curve from the rates computed above and render it.
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
# Area under the ROC curve
# Computed from `y_scores` — the same values the ROC curve is built from —
# rather than the hard class labels in `y_pred`: thresholded labels reduce the
# curve to a single operating point and misstate the AUC.
roc_auc_score(y_train, y_scores)