In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 

from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

#Models
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline
plt.style.use('seaborn')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Upload DATA
train_set = pd.read_csv('drive/My Drive/DATA/titanic/train.csv')
test_set = pd.read_csv('drive/My Drive/DATA/titanic/test.csv')
test_labels = pd.read_csv('drive/My Drive/DATA/titanic/gender_submission.csv')

In [4]:
train_set.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

###Type of variables
* Pclass: Categorial Attribute
* Sex: Categorical Attribute > Binary
* Age: "Numerical Attribute" > Standardize (Missing values 19.9%)
* SibSp: Numerical Attribute
* Parch: Numerical Attribute
* Fare: Numerical Attribute > could be categorical (with bins)
* Embarked: Categorial attribute (Missing values)

In [0]:
def get_target(df, y_index, label=''):
  X = df.drop(columns=[label])
  y = df.iloc[:, y_index]
  return X, y

In [0]:
# Split X and y for training set
X_train, y_train_prepared = get_target(train_set, y_index=1, label='Survived')

In [0]:
# Split X and y for testing set
X_test = test_set
y_test_prepared = test_labels[['Survived']]

**Constructing a Pipeline to prepared the data**

In [0]:
# Set the columns of data Frame
onehot_column = ['Sex']
ordinal_column = ['Pclass']
num_column = ['Age', 'SibSp', 'Parch', 'Fare']

In [0]:
# Pipeline for each type of category and numerical variables
catOrd_pipeline = Pipeline([('ord_impute', SimpleImputer(strategy='most_frequent')),
                            ('ord_encoder', OrdinalEncoder())])
cat1hot_pipeline = Pipeline([('ord_impute', SimpleImputer(strategy='most_frequent')),
                             ('onehot_encoder', OneHotEncoder())])
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                        ('scale', StandardScaler())])

# Column transform for each pipeline
full_pipeline = ColumnTransformer([('cat_ord', catOrd_pipeline, ordinal_column),
                                   ('cat_1hot', cat1hot_pipeline, onehot_column),
                                   ('num', num_pipeline, num_column)])

In [0]:
# transform the X matrix
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.fit_transform(X_test)

##Selecting the best model that fit the data

In [0]:
# To evaluate differents defaults models performance
def score_models(estimators, X, y_true, cv=3):
  """Return the score for precision|recall|f1 score
      parameters
      ----------
      estimator: list of tuples -> estimators
      X: Matrix with attributes
      y_true: target"""
  df = pd.DataFrame(columns=['Precision Score', 'Recall Score', 'F1 Score', 'Mean Accuracy'])
  
  for name, estimator in estimators:
    y_predict = cross_val_predict(estimator, X, y_true, cv=cv)
    
    precision = precision_score(y_true, y_predict)
    recall = recall_score(y_true, y_predict)
    f1 = f1_score(y_true, y_predict)
    accuracy = cross_val_score(estimator, X, y_true, scoring="accuracy", cv=cv).mean()

    df.loc[len(df)] = [precision, recall, f1, accuracy]
    
    df.round(decimals=4)
  df.set_index(pd.Index([name[0] for name in estimators]), inplace=True)
  return df.sort_values('Mean Accuracy', ascending=False)

In [0]:
# This fuction is to evaluate Best Models performance later
def testing_scores(estimator_fitted, X_test, y_test_true):
  """
  estimator_fitted: estimator fitted with train DATA
  X_test: Test data
  y_test_true: test target
  """
  y_predict = estimator_fitted.predict(X_test)
  
  precision = precision_score(y_test_true, y_predict)
  recall = recall_score(y_test_true, y_predict)
  f1 = f1_score(y_test_true, y_predict)
  accuracy = accuracy_score(y_test_true, y_predict)
  
  print(f"{estimator_fitted.__class__.__name__}")
  print(f"Precision: {precision}")
  print(f"Recall: {recall}")
  print(f"F1 Score: {f1}")
  print(f"Accuracy Score: {accuracy}")

In [13]:
estimators = [('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
              ('Logistic Regression', LogisticRegression(solver='lbfgs',random_state=42)),
              ('Decision Tree', DecisionTreeClassifier(random_state=42)),
              ('Support Vector Classifier', SVC(random_state=42, gamma='scale')),
              ('Gaussan NB', GaussianNB()),
              ('K neighbors', KNeighborsClassifier())]

score_models(estimators, X_train_prepared, y_train_prepared, cv=7)

Unnamed: 0,Precision Score,Recall Score,F1 Score,Mean Accuracy
Support Vector Classifier,0.813115,0.725146,0.766615,0.830539
Random Forest,0.782609,0.736842,0.759036,0.820485
K neighbors,0.740964,0.719298,0.72997,0.79587
Gaussan NB,0.744548,0.69883,0.720965,0.792381
Logistic Regression,0.744479,0.690058,0.716237,0.790167
Decision Tree,0.706052,0.716374,0.711176,0.776765


### Logistic Regression

Setup some parameters and scoring methods for tunning hyperparameters

In [0]:
params_log_grid = [{'C': [0.03, 0.3, 0.7, 1, 2, 4, 6],
                'penalty': ['l2'],
                'random_state': [42],
                'solver': ['sag', 'saga'],
                'max_iter': [200, 300, 350, 400]},
               {'C': [0.03, 0.3, 0.7, 1, 2, 4, 6],
                'penalty': ['l1'],
                'random_state': [42],
                'solver': ['liblinear', 'saga'],
                'max_iter': [200, 300, 350, 400]},
               {'C': [0.03, 0.3, 0.7, 1, 2, 4, 6],
                'penalty': ['elasticnet'],
                'random_state': [42],
                'solver': ['saga'],
                'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                'max_iter': [200, 300, 350, 400]}]

scoring = {'Accuracy': make_scorer(accuracy_score), 
           'Precision': make_scorer(precision_score), 
           'F1 Score': make_scorer(f1_score)}

Search for the best combination of hyperparameters

In [0]:
log_search = GridSearchCV(LogisticRegression(), 
                          param_grid=params_log_grid, 
                          scoring=scoring, 
                          cv=7, 
                          return_train_score=True,
                          refit='Accuracy')

In [0]:
log_search.fit(X_train_prepared, y_train_prepared)

In [17]:
# Best hyperparameters
log_search.best_params_

{'C': 0.3,
 'max_iter': 200,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'sag'}

In [0]:
log_best = log_search.best_estimator_

In [19]:
score_models([('Log Regression', log_best)], X_train_prepared, y_train_prepared, cv=25)

Unnamed: 0,Precision Score,Recall Score,F1 Score,Mean Accuracy
Log Regression,0.751592,0.690058,0.719512,0.793739


In [20]:
log_best.fit(X_train_prepared, y_train_prepared)

LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

###Random Forest

In [0]:
params_forest_grid = [{'n_estimators': range(5, 250, 15),
                      'random_state': [42],
                      'max_depth': range(2,8)}]

forest_grid = GridSearchCV(RandomForestClassifier(),
                           param_grid=params_forest_grid,
                           scoring=scoring,
                           refit='Accuracy',
                           cv=7,
                           return_train_score=True)

In [22]:
forest_grid.fit(X_train_prepared, y_train_prepared)

GridSearchCV(cv=7, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=Fa...
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid=

In [23]:
# Best hyperparameters
forest_grid.best_params_

{'max_depth': 7, 'n_estimators': 35, 'random_state': 42}

In [24]:
forest_best = forest_grid.best_estimator_
score_models([('Random Forest CLF', forest_best)], X_train_prepared, y_train_prepared, cv=7)

Unnamed: 0,Precision Score,Recall Score,F1 Score,Mean Accuracy
Random Forest CLF,0.837288,0.722222,0.77551,0.839556


In [25]:
forest_best.fit(X_train_prepared, y_train_prepared)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=35,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

### Support Vector Classifier

In [0]:
param_svc_grid = [{'C': [0.0001, 0.001, 0.01, 0.1, 0.7, 1, 1.2, 3, 5, 7],
                  'gamma': ['scale', 'auto'],
                  'random_state': [42],
                  'decision_function_shape': ['ovo', 'ovr']}]

svc_grid = GridSearchCV(estimator=SVC(),
                        param_grid=param_svc_grid,
                        scoring=scoring,
                        cv=7,
                        refit='Accuracy',
                        return_train_score=True)

In [0]:
svc_grid.fit(X_train_prepared, y_train_prepared)

In [29]:
# Best hyper-parameters
svc_grid.best_params_

{'C': 1,
 'decision_function_shape': 'ovo',
 'gamma': 'scale',
 'random_state': 42}

In [0]:
svc_best = svc_grid.best_estimator_

In [31]:
score_models([('SVC', svc_best)], X_train_prepared, y_train_prepared, cv=7)

Unnamed: 0,Precision Score,Recall Score,F1 Score,Mean Accuracy
SVC,0.813115,0.725146,0.766615,0.830539


In [32]:
svc_best.fit(X_train_prepared, y_train_prepared)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

### Submit

In [35]:
submit_prediction = svc_best.predict(X_test_prepared)
submit_prediction

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [40]:
path = '/content/drive/My Drive/GITHUB REPO/Titanic_ML_project/titanic_data'
submit_df = pd.read_csv(f"{path}/gender_submission.csv", )
submit_df['Survived'] = submit_prediction

submit_df.to_csv(path_or_buf=f"{path}/submission.csv", index=False)
submit_df.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
