<a href="https://colab.research.google.com/github/PabloJRW/titanic-classifier/blob/main/notebooks/Model_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

import matplotlib.pyplot as plt

In [2]:
# Read the raw training set, then split it into the target column and the features
df_train = pd.read_csv('../datasets/raw/train.csv')

y = df_train['Survived']
X = df_train.drop(columns='Survived')

In [3]:
# Class balance check: number of survivors (1) relative to non-survivors (0)
survived_col = df_train['Survived']
y_positive = (survived_col == 1).sum()
y_negative = (survived_col == 0).sum()

(y_positive / y_negative).round(2)

0.62

In [4]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Show the size of the full frame followed by each feature split
for frame in (df_train, X_train, X_val):
    print(frame.shape)


(891, 12)
(668, 11)
(223, 11)


In [5]:
# Load the preprocessing pipeline that was serialized with joblib.
# NOTE(review): the recorded run of this cell failed with
#   AttributeError: Can't get attribute 'replace_values' on <module '__main__'>
# The saved pipeline apparently references a `replace_values` function that lived in
# the `__main__` of the session that saved it; presumably it has to be defined or
# imported in this notebook before the load can succeed — TODO confirm and fix.
# NOTE(review): the path is relative to the working directory — confirm the file location.
preprocessing_pipeline = joblib.load('preprocessing_pipeline.joblib')

AttributeError: Can't get attribute 'replace_values' on <module '__main__'>

In [None]:
# Fit the loaded preprocessing pipeline on the training split and transform it;
# the bare last expression displays the transformed training data.
X_train_pro = preprocessing_pipeline.fit_transform(X_train)
X_train_pro

In [None]:
# Search for the best-scoring model
# ===================================================================================
# Every candidate is scored by F1 on the out-of-fold predictions produced by
# 10-fold cross-validation on the preprocessed training data.

# The tree ensembles are randomized, so they get a fixed seed to keep the ranking
# reproducible from run to run.
models = {'svc': SVC(),
          'logistic': LogisticRegression(),
          'random_forest': RandomForestClassifier(random_state=42),
          'knn': KNeighborsClassifier(),
          'gradient_b': GradientBoostingClassifier(random_state=42),
          'naive_bayes': GaussianNB()}

n_models = len(models)
scores = []
for name, model in models.items():
    y_train_pred = cross_val_predict(model, X_train_pro, y_train, cv=10)
    # f1_score already returns a single number for a binary target
    f1 = round(float(f1_score(y_train, y_train_pred)), 2)

    # Keep (name, score) as a tuple: packing both into one NumPy array would coerce
    # the score to a string and leave the F1 column non-numeric.
    scores.append((name, f1))

scores_df = pd.DataFrame(scores, columns=['Model', 'F1']).set_index('Model')
scores_df.sort_values(by='F1', ascending=False)

In [None]:
# Score report for the Random Forest model
# ==============================================================================
# Out-of-fold predictions from cross-validation feed both reports below.
rforest_clf = RandomForestClassifier(random_state=42)
y_pred = cross_val_predict(estimator=rforest_clf, X=X_train_pro, y=y_train)

# Confusion matrix first, then the per-class precision/recall/F1 table
for metric in (confusion_matrix, classification_report):
    print(metric(y_train, y_pred))

In [None]:
# Out-of-fold probability estimates for the ROC analysis.
# roc_curve needs continuous scores: the default method='predict' returns hard 0/1
# labels, which collapses the curve to a single operating point and makes the
# threshold comparison below meaningless.
y_probas = cross_val_predict(rforest_clf, X_train_pro, y_train, cv=5,
                             n_jobs=-1, method='predict_proba')
y_scores = y_probas[:, 1]  # column 1 = estimated probability of the positive class (Survived == 1)

# Class predictions at the chosen decision threshold
threshold = 0.5
y_with_thres = (y_scores > threshold)

fpr, tpr, thresholds = roc_curve(y_train, y_scores)

# Helper that draws a ROC curve
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve with the pyplot interface.

    Parameters
    ----------
    fpr : array-like
        False positive rates, used as x values.
    tpr : array-like
        True positive rates, used as y values.
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is given.
    """
    plt.plot(fpr, tpr, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as the chance-level reference
    plt.axis([0, 1, 0, 1])
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if label is not None:
        # Without a legend the label passed to plt.plot is never shown
        plt.legend(loc='lower right')
    
# Draw the ROC curve from the fpr/tpr arrays and render the figure
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
# Area under the ROC curve.
# Score with `y_scores`, the same values the ROC curve was drawn from, so the
# reported AUC corresponds to the plotted curve.
roc_auc_score(y_train, y_scores)