In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

%matplotlib inline
plt.style.use('seaborn')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Upload DATA
train_set = pd.read_csv('drive/My Drive/DATA/titanic/train.csv')
test_set = pd.read_csv('drive/My Drive/DATA/titanic/test.csv')
test_labels = pd.read_csv('drive/My Drive/DATA/titanic/gender_submission.csv')

In [5]:
train_set.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

###Type of variables
* Pclass: Categorial Attribute
* Sex: Categorical Attribute > Binary
* Age: "Numerical Attribute" > Standardize (Missing values 19.9%)
* SibSp: Numerical Attribute
* Parch: Numerical Attribute
* Fare: Numerical Attribute > could be categorical (with bins)
* Embarked: Categorial attribute (Missing values)

In [0]:
def get_target(df, y_index, label=''):
  X = df.drop(columns=[label])
  y = df.iloc[:, y_index]
  return X, y

In [0]:
X_train, y_train = get_target(train_set, y_index=1, label='Survived')

In [0]:
# Set the columns of data Frame
onehot_column = ['Sex']
ordinal_column = ['Pclass']
num_column = ['Age', 'SibSp', 'Parch', 'Fare']

In [0]:
# Pipeline for each type of category and numerical variables
catOrd_pipeline = Pipeline([('ord_impute', SimpleImputer(strategy='most_frequent')),
                            ('ord_encoder', OrdinalEncoder())])
cat1hot_pipeline = Pipeline([('ord_impute', SimpleImputer(strategy='most_frequent')),
                             ('onehot_encoder', OneHotEncoder())])
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                        ('scale', StandardScaler())])

# Column transform for each pipeline
full_pipeline = ColumnTransformer([('cat_ord', catOrd_pipeline, ordinal_column),
                                   ('cat_1hot', cat1hot_pipeline, onehot_column),
                                   ('num', num_pipeline, num_column)])

In [0]:
# transform the X matrix
X_train = full_pipeline.fit_transform(X_train)

##Selecting the best model that fit the data

In [0]:
def score_models(estimators, X, y_true, cv=3):
  """Return the score for precision|recall|f1 score
      parameters
      ----------
      estimator: list of tuples -> estimators
      X: Matrix with attributes
      y_true: target"""
  for name, estimator in estimators:
    y_predict = cross_val_predict(estimator, X, y_true, cv=cv)
    precision = precision_score(y_true, y_predict)
    recall = recall_score(y_true, y_predict)
    f1 = f1_score(y_true, y_predict)
    accuracy = cross_val_score(estimator, X, y_true, scoring="accuracy", cv=cv)
    
    print(f"{name}:")
    print(f"Precision Score: {precision}")
    print(f"Recall Score: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Mean Accuracy: {accuracy.mean()}")
    print("\n")

In [61]:
estimators = [('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
              ('Logistic Regression', LogisticRegression(solver='lbfgs',random_state=42)),
              ('Decision Tree', DecisionTreeClassifier(random_state=42))]

score_models(estimators, X_train, y_train, cv=20)

Random Forest:
Precision Score: 0.7767584097859327
Recall Score: 0.7426900584795322
F1 Score: 0.7593423019431988
Mean Accuracy: 0.8195970575318402


Logistic Regression:
Precision Score: 0.7430340557275542
Recall Score: 0.7017543859649122
F1 Score: 0.7218045112781954
Mean Accuracy: 0.7923781291172596


Decision Tree:
Precision Score: 0.7159420289855073
Recall Score: 0.7222222222222222
F1 Score: 0.7190684133915576
Mean Accuracy: 0.7837110232762408




In [58]:
cross_val_score(forest_clf, X_train, y_train, scoring='accuracy', cv=20).mean()

0.8095191040843215