# Índice

# 3. Entrenamiento de los modelos

## 3.1 Importación de las librerías

In [None]:
# General import and load data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Estimators
from sklearn.svm import SVC

# Evaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Optimization
from sklearn.model_selection import GridSearchCV

## 3.2 Carga de los datos

In [None]:
# Load the pre-formatted train split from CSV.
url = "formated/train_exportado.csv"
df_train = pd.read_csv(url)

# `url` is reused for the second path; after this cell it points at the test file.
url = "formated/test_exportado.csv"
df_test = pd.read_csv(url)

# Columns used as model inputs.
# NOTE(review): they are handed to the estimator as raw `.values`, so they are
# presumably already numeric in the exported CSVs — confirm.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Feature matrices and target vectors as NumPy arrays.
# NOTE(review): the target column is 'Accept' although the feature names look
# like the Titanic dataset (whose label is usually 'Survived') — confirm the
# exported files really contain an 'Accept' column.
X_train = df_train[features].values
y_train = df_train['Accept'].values

X_test = df_test[features].values
y_test = df_test['Accept'].values

## 3.3 Modelos seleccionados

### 3.3.1 Clasificador X

#### - Definición

In [None]:
# Baseline hyperparameters. `kernel` and `gamma` were referenced without ever
# being defined, which raised NameError; the values below are scikit-learn's
# current SVC defaults. `gamma` is tuned further down with a grid search.
kernel = 'rbf'
gamma = 'scale'
# probability=True enables predict_proba, which the ROC/AUC cells rely on.
model = SVC(kernel=kernel, probability=True, gamma=gamma)

#### - Entrenamiento y evaluación

In [None]:
# Ground-truth labels the predictions are compared against.
expected = y_test

# Train on the training split (fit returns the estimator itself), then
# predict hard class labels for the held-out test split.
predicted = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Fraction of test samples whose predicted label equals the true label.
metrics.accuracy_score(expected, predicted)

#### - Null accuracy

In [None]:
# Wrap the test labels in a pandas Series to get the per-class counts.
# (`Series` was never imported on its own; it is reached through `pd`.)
s_y_test = pd.Series(y_test)
s_y_test.value_counts()

In [None]:
# Mean of the test labels.
# NOTE(review): this is the share of positive samples only if the labels are
# encoded as 0/1 — confirm.
y_test.mean()

In [None]:
# Complement of the label mean (share of the other class under 0/1 encoding).
# Not displayed by the notebook, since it is not the cell's last expression.
1 - y_test.mean() 

# Null accuracy: accuracy of always predicting the most frequent class
# (valid as written only for binary 0/1 labels).
max(y_test.mean(), 1 - y_test.mean())

In [None]:
# Null accuracy computed from class counts: value_counts() sorts by count in
# descending order, so head(1) is the majority class; dividing by the number
# of test samples gives its share.
s_y_test.value_counts().head(1) / len(y_test)

#### - Matriz de confusión y F-score

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
print(metrics.confusion_matrix(expected, predicted))

In [None]:
# Per-class precision, recall, F1-score and support for the test predictions.
print(classification_report(expected, predicted))

#### - ROC y AUC

In [None]:
# Score every test sample with the predicted probability of the class in
# column 1 of predict_proba (i.e. model.classes_[1]), then compute the ROC
# curve points over all decision thresholds.
y_pred_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Titanic')
# The false positive rate is 1 - specificity; recall is the TPR itself, so
# the previous "(1 - Recall)" axis label was wrong.
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()

In [None]:
# Display the decision thresholds returned by roc_curve.
thresholds

In [None]:
# Pair each predicted probability with its true label ...
dprob = pd.DataFrame({'probability': y_pred_prob, 'actual': y_test})
# ... and draw one probability histogram per true class on shared axes.
dprob['probability'].hist(by=dprob['actual'], sharex=True, sharey=True)

In [None]:
def evaluate_threshold(threshold, tpr_values=None, fpr_values=None,
                       threshold_values=None):
    """Print the sensitivity and specificity obtained at a probability cut-off.

    Args:
        threshold: Probability cut-off to evaluate.
        tpr_values: True positive rates as returned by ``roc_curve``.
            Defaults to the notebook-level ``tpr``.
        fpr_values: False positive rates as returned by ``roc_curve``.
            Defaults to the notebook-level ``fpr``.
        threshold_values: Thresholds as returned by ``roc_curve``.
            Defaults to the notebook-level ``thresholds``.
    """
    # Fall back to the ROC arrays of the most recently executed ROC cell.
    if tpr_values is None:
        tpr_values = tpr
    if fpr_values is None:
        fpr_values = fpr
    if threshold_values is None:
        threshold_values = thresholds

    # roc_curve returns thresholds in decreasing order, so the last entry
    # still above the cut-off is the operating point closest to it.
    above = np.asarray(threshold_values) > threshold
    if above.any():
        sensitivity = np.asarray(tpr_values)[above][-1]
        false_positive_rate = np.asarray(fpr_values)[above][-1]
    else:
        # No score exceeds the cut-off: nothing is predicted positive, so
        # both rates are zero (previously this raised IndexError).
        sensitivity = 0.0
        false_positive_rate = 0.0

    print('Sensitivity:', sensitivity)
    # 1 - FPR is the specificity (true negative rate), not the recall.
    print('Specificity:', 1 - false_positive_rate)

In [None]:
# Print the TPR and 1 - FPR reached at a 0.74 probability cut-off.
evaluate_threshold(0.74)

In [None]:
# AUC must be computed from continuous scores; feeding the hard 0/1
# predictions collapses the ROC curve to a single operating point.
print(roc_auc_score(expected, y_pred_prob))

#### - Optimización

In [None]:
# Inspect the estimator's current hyperparameters before tuning.
model.get_params()

In [None]:
# Ten candidate gamma values, log-spaced between 1e-6 and 1e-1.
gammas = np.logspace(-6, -1, num=10)
# Exhaustive search over gamma using GridSearchCV's default CV and scoring.
gs = GridSearchCV(model, param_grid={'gamma': gammas})
gs.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the gamma value that achieved it.
print("Best score: ", gs.best_score_)
print("Best params: ", gs.best_params_)

#### - Entrenamiento y evaluación con hiperparámetros optimizados

In [None]:
# Build the tuned model from the grid-search result. Previously this cell
# reused the baseline `kernel`/`gamma` names, so the "optimised" model was
# identical to the baseline and the search result was ignored.
model_opt = SVC(
    kernel=model.kernel,
    probability=True,  # needed for predict_proba in the ROC/AUC cells
    gamma=gs.best_params_['gamma'],
)

In [None]:
# Ground-truth labels the predictions are compared against.
expected = y_test

# Train the tuned model (fit returns the estimator itself) and predict hard
# class labels for the held-out test split.
predicted = model_opt.fit(X_train, y_train).predict(X_test)

In [None]:
# Test accuracy of the tuned model (compare against the null accuracy).
metrics.accuracy_score(expected, predicted)

Hablar de la comparación con el null accuracy

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
print(metrics.confusion_matrix(expected, predicted))

In [None]:
# Per-class precision, recall, F1-score and support for the tuned model.
print(classification_report(expected, predicted))

In [None]:
# Score every test sample with the tuned model's predicted probability of the
# class in column 1 of predict_proba (i.e. model_opt.classes_[1]), then
# compute the ROC curve points over all decision thresholds.
y_pred_prob = model_opt.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Titanic')
# The false positive rate is 1 - specificity; recall is the TPR itself, so
# the previous "(1 - Recall)" axis label was wrong.
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()

In [None]:
# Display the decision thresholds returned by roc_curve for the tuned model.
thresholds

In [None]:
# Pair each predicted probability with its true label ...
dprob = pd.DataFrame({'probability': y_pred_prob, 'actual': y_test})
# ... and draw one probability histogram per true class on shared axes.
dprob['probability'].hist(by=dprob['actual'], sharex=True, sharey=True)

In [None]:
def evaluate_threshold(threshold, tpr_values=None, fpr_values=None,
                       threshold_values=None):
    """Print the sensitivity and specificity obtained at a probability cut-off.

    Args:
        threshold: Probability cut-off to evaluate.
        tpr_values: True positive rates as returned by ``roc_curve``.
            Defaults to the notebook-level ``tpr``.
        fpr_values: False positive rates as returned by ``roc_curve``.
            Defaults to the notebook-level ``fpr``.
        threshold_values: Thresholds as returned by ``roc_curve``.
            Defaults to the notebook-level ``thresholds``.
    """
    # Fall back to the ROC arrays of the most recently executed ROC cell.
    if tpr_values is None:
        tpr_values = tpr
    if fpr_values is None:
        fpr_values = fpr
    if threshold_values is None:
        threshold_values = thresholds

    # roc_curve returns thresholds in decreasing order, so the last entry
    # still above the cut-off is the operating point closest to it.
    above = np.asarray(threshold_values) > threshold
    if above.any():
        sensitivity = np.asarray(tpr_values)[above][-1]
        false_positive_rate = np.asarray(fpr_values)[above][-1]
    else:
        # No score exceeds the cut-off: nothing is predicted positive, so
        # both rates are zero (previously this raised IndexError).
        sensitivity = 0.0
        false_positive_rate = 0.0

    print('Sensitivity:', sensitivity)
    # 1 - FPR is the specificity (true negative rate), not the recall.
    print('Specificity:', 1 - false_positive_rate)

In [None]:
# Print the TPR and 1 - FPR reached at a 0.74 probability cut-off.
evaluate_threshold(0.74)

In [None]:
# AUC must be computed from continuous scores; feeding the hard 0/1
# predictions collapses the ROC curve to a single operating point.
print(roc_auc_score(expected, y_pred_prob))

### 3.3.2 Clasificador Y

#### - Definición

In [None]:
# Baseline hyperparameters. `kernel` and `gamma` were referenced without ever
# being defined, which raised NameError; the values below are scikit-learn's
# current SVC defaults. `gamma` is tuned further down with a grid search.
kernel = 'rbf'
gamma = 'scale'
# probability=True enables predict_proba, which the ROC/AUC cells rely on.
model = SVC(kernel=kernel, probability=True, gamma=gamma)

#### - Entrenamiento y evaluación

In [None]:
# Ground-truth labels the predictions are compared against.
expected = y_test

# Train on the training split (fit returns the estimator itself), then
# predict hard class labels for the held-out test split.
predicted = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Fraction of test samples whose predicted label equals the true label.
metrics.accuracy_score(expected, predicted)

#### - Null accuracy

In [None]:
# Wrap the test labels in a pandas Series to get the per-class counts.
# (`Series` was never imported on its own; it is reached through `pd`.)
s_y_test = pd.Series(y_test)
s_y_test.value_counts()

In [None]:
# Mean of the test labels.
# NOTE(review): this is the share of positive samples only if the labels are
# encoded as 0/1 — confirm.
y_test.mean()

In [None]:
# Complement of the label mean (share of the other class under 0/1 encoding).
# Not displayed by the notebook, since it is not the cell's last expression.
1 - y_test.mean() 

# Null accuracy: accuracy of always predicting the most frequent class
# (valid as written only for binary 0/1 labels).
max(y_test.mean(), 1 - y_test.mean())

In [None]:
# Null accuracy computed from class counts: value_counts() sorts by count in
# descending order, so head(1) is the majority class; dividing by the number
# of test samples gives its share.
s_y_test.value_counts().head(1) / len(y_test)

#### - Matriz de confusión y F-score

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
print(metrics.confusion_matrix(expected, predicted))

In [None]:
# Per-class precision, recall, F1-score and support for the test predictions.
print(classification_report(expected, predicted))

#### - ROC y AUC

In [None]:
# Score every test sample with the predicted probability of the class in
# column 1 of predict_proba (i.e. model.classes_[1]), then compute the ROC
# curve points over all decision thresholds.
y_pred_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Titanic')
# The false positive rate is 1 - specificity; recall is the TPR itself, so
# the previous "(1 - Recall)" axis label was wrong.
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()

In [None]:
# Display the decision thresholds returned by roc_curve.
thresholds

In [None]:
# Pair each predicted probability with its true label ...
dprob = pd.DataFrame({'probability': y_pred_prob, 'actual': y_test})
# ... and draw one probability histogram per true class on shared axes.
dprob['probability'].hist(by=dprob['actual'], sharex=True, sharey=True)

In [None]:
def evaluate_threshold(threshold, tpr_values=None, fpr_values=None,
                       threshold_values=None):
    """Print the sensitivity and specificity obtained at a probability cut-off.

    Args:
        threshold: Probability cut-off to evaluate.
        tpr_values: True positive rates as returned by ``roc_curve``.
            Defaults to the notebook-level ``tpr``.
        fpr_values: False positive rates as returned by ``roc_curve``.
            Defaults to the notebook-level ``fpr``.
        threshold_values: Thresholds as returned by ``roc_curve``.
            Defaults to the notebook-level ``thresholds``.
    """
    # Fall back to the ROC arrays of the most recently executed ROC cell.
    if tpr_values is None:
        tpr_values = tpr
    if fpr_values is None:
        fpr_values = fpr
    if threshold_values is None:
        threshold_values = thresholds

    # roc_curve returns thresholds in decreasing order, so the last entry
    # still above the cut-off is the operating point closest to it.
    above = np.asarray(threshold_values) > threshold
    if above.any():
        sensitivity = np.asarray(tpr_values)[above][-1]
        false_positive_rate = np.asarray(fpr_values)[above][-1]
    else:
        # No score exceeds the cut-off: nothing is predicted positive, so
        # both rates are zero (previously this raised IndexError).
        sensitivity = 0.0
        false_positive_rate = 0.0

    print('Sensitivity:', sensitivity)
    # 1 - FPR is the specificity (true negative rate), not the recall.
    print('Specificity:', 1 - false_positive_rate)

In [None]:
# Print the TPR and 1 - FPR reached at a 0.74 probability cut-off.
evaluate_threshold(0.74)

In [None]:
# AUC must be computed from continuous scores; feeding the hard 0/1
# predictions collapses the ROC curve to a single operating point.
print(roc_auc_score(expected, y_pred_prob))

#### - Optimización

In [None]:
# Inspect the estimator's current hyperparameters before tuning.
model.get_params()

In [None]:
# Ten candidate gamma values, log-spaced between 1e-6 and 1e-1.
gammas = np.logspace(-6, -1, num=10)
# Exhaustive search over gamma using GridSearchCV's default CV and scoring.
gs = GridSearchCV(model, param_grid={'gamma': gammas})
gs.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score and the gamma value that achieved it.
print("Best score: ", gs.best_score_)
print("Best params: ", gs.best_params_)

#### - Entrenamiento y evaluación con hiperparámetros optimizados

In [None]:
# Build the tuned model from the grid-search result. Previously this cell
# reused the baseline `kernel`/`gamma` names, so the "optimised" model was
# identical to the baseline and the search result was ignored.
model_opt = SVC(
    kernel=model.kernel,
    probability=True,  # needed for predict_proba in the ROC/AUC cells
    gamma=gs.best_params_['gamma'],
)

In [None]:
# Ground-truth labels the predictions are compared against.
expected = y_test

# Train the tuned model (fit returns the estimator itself) and predict hard
# class labels for the held-out test split.
predicted = model_opt.fit(X_train, y_train).predict(X_test)

In [None]:
# Test accuracy of the tuned model (compare against the null accuracy).
metrics.accuracy_score(expected, predicted)

Hablar de la comparación con el null accuracy

In [None]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
print(metrics.confusion_matrix(expected, predicted))

In [None]:
# Per-class precision, recall, F1-score and support for the tuned model.
print(classification_report(expected, predicted))

In [None]:
# Score every test sample with the tuned model's predicted probability of the
# class in column 1 of predict_proba (i.e. model_opt.classes_[1]), then
# compute the ROC curve points over all decision thresholds.
y_pred_prob = model_opt.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Titanic')
# The false positive rate is 1 - specificity; recall is the TPR itself, so
# the previous "(1 - Recall)" axis label was wrong.
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()

In [None]:
# Display the decision thresholds returned by roc_curve for the tuned model.
thresholds

In [None]:
# Pair each predicted probability with its true label ...
dprob = pd.DataFrame({'probability': y_pred_prob, 'actual': y_test})
# ... and draw one probability histogram per true class on shared axes.
dprob['probability'].hist(by=dprob['actual'], sharex=True, sharey=True)

In [None]:
def evaluate_threshold(threshold, tpr_values=None, fpr_values=None,
                       threshold_values=None):
    """Print the sensitivity and specificity obtained at a probability cut-off.

    Args:
        threshold: Probability cut-off to evaluate.
        tpr_values: True positive rates as returned by ``roc_curve``.
            Defaults to the notebook-level ``tpr``.
        fpr_values: False positive rates as returned by ``roc_curve``.
            Defaults to the notebook-level ``fpr``.
        threshold_values: Thresholds as returned by ``roc_curve``.
            Defaults to the notebook-level ``thresholds``.
    """
    # Fall back to the ROC arrays of the most recently executed ROC cell.
    if tpr_values is None:
        tpr_values = tpr
    if fpr_values is None:
        fpr_values = fpr
    if threshold_values is None:
        threshold_values = thresholds

    # roc_curve returns thresholds in decreasing order, so the last entry
    # still above the cut-off is the operating point closest to it.
    above = np.asarray(threshold_values) > threshold
    if above.any():
        sensitivity = np.asarray(tpr_values)[above][-1]
        false_positive_rate = np.asarray(fpr_values)[above][-1]
    else:
        # No score exceeds the cut-off: nothing is predicted positive, so
        # both rates are zero (previously this raised IndexError).
        sensitivity = 0.0
        false_positive_rate = 0.0

    print('Sensitivity:', sensitivity)
    # 1 - FPR is the specificity (true negative rate), not the recall.
    print('Specificity:', 1 - false_positive_rate)

In [None]:
# Print the TPR and 1 - FPR reached at a 0.74 probability cut-off.
evaluate_threshold(0.74)

In [None]:
# AUC must be computed from continuous scores; feeding the hard 0/1
# predictions collapses the ROC curve to a single operating point.
print(roc_auc_score(expected, y_pred_prob))