<a href="https://www.kaggle.com/code/vicmangiltafolla/titanic-gridsearchcv-voting-classifier-0-8?scriptVersionId=144611433" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Titanic - Machine Learning from Disaster**


#  Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import RobustScaler, Normalizer, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.gridspec as gridspec
import seaborn as sns
import time
from joblib import dump, load

# Functions 

Define the functions that will help later in the notebook to simplify processes and make cleaner code

In [None]:
def countNan(df):
    """Print a missing-value report for every column of ``df`` that has NaNs.

    For each column containing at least one missing value, the column name,
    the number of missing values and the number of present values are
    printed. Columns without missing values are skipped. A closing separator
    line is always printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    None
        The report is written to standard output.
    """
    total_rows = len(df)
    for x in df:
        # Summing the boolean mask counts the NaNs directly. Looking up
        # ``value_counts()[0]`` instead breaks when a column is entirely NaN,
        # because the ``False`` bucket does not exist in that case.
        Nan_count = int(df[x].isna().sum())
        No_Nan_count = total_rows - Nan_count
        if Nan_count > 0:
            print("Elements in {}\n".format(x))
            print("There are {} Nan values".format(Nan_count))
            print("There are {} No Nan values".format(No_Nan_count))
            print("---------------------------")
    print("___________________________")
    
class FinalDataProcessor(BaseEstimator, TransformerMixin):
    '''Impute, scale and encode selected columns through the fit/transform API.

    Columns listed in ``to_scal_cols`` go through ``imputer`` and then
    ``scaler``; columns listed in ``to_encod_cols`` go through ``imputer`` and
    then ``encoder``. Both groups are combined with a ``ColumnTransformer``.

    Parameters
    ----------
    encoder : transformer
        Applied to the columns in ``to_encod_cols`` after imputation.
    imputer : transformer
        Applied first to both column groups.
    scaler : transformer
        Applied to the columns in ``to_scal_cols`` after imputation.
    to_scal_cols : list
        Columns sent through the imputer + scaler pipeline.
    to_encod_cols : list
        Columns sent through the imputer + encoder pipeline.
    '''
    def __init__(self,
                encoder=None,
                imputer=None,
                scaler=None,
                to_scal_cols=None,
                to_encod_cols=None):
        # Only store the parameters here. The pipeline is assembled in fit()
        # so that values changed later through set_params() are honoured.
        self.encoder = encoder
        self.imputer = imputer
        self.scaler = scaler
        self.to_scal_cols = to_scal_cols
        self.to_encod_cols = to_encod_cols

    def _build_pipeline(self):
        '''Assemble the ColumnTransformer from the current parameters.'''
        num_pipeline = Pipeline([
            ("imputer", self.imputer),
            ("scaler", self.scaler)
        ])
        cat_pipeline = Pipeline([
            ("imputer", self.imputer),
            ("encoder", self.encoder)
        ])
        # sparse_threshold=0 forces a dense result, which transform() needs
        # in order to build a regular DataFrame.
        return ColumnTransformer([
            ("num", num_pipeline, self.to_scal_cols),
            ("cat", cat_pipeline, self.to_encod_cols)
        ], sparse_threshold=0)

    def fit(self, X, y=None):
        '''Fit the imputing/scaling/encoding pipelines on ``X``; returns self.'''
        self._final_pipeline = self._build_pipeline()
        self._final_pipeline.fit(X)
        return self

    def transform(self, X, y=None):
        '''Return the transformed columns of ``X`` as a DataFrame.

        Column names come from the fitted ColumnTransformer, and the row index
        of ``X`` is kept when ``X`` has one.
        '''
        # Local import: the name is not part of the file-level imports.
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self, "_final_pipeline")
        values = self._final_pipeline.transform(X)
        # Keeping the input index makes later index-aligned pd.concat calls
        # safe even when X does not use a default RangeIndex.
        return pd.DataFrame(values,
                            columns=self._final_pipeline.get_feature_names_out(),
                            index=getattr(X, "index", None))

    
def TextProcess(row):
    """Return the first known title found in a passenger name.

    The value is converted to text, lower-cased and stripped of periods, then
    split on whitespace. The first token that equals one of the known titles
    is returned.

    Parameters
    ----------
    row : object
        A single cell of the ``Name`` column. Non-string values (such as NaN)
        are converted with ``str`` first.

    Returns
    -------
    str or None
        The matching lower-case title, or ``None`` when no token matches.
    """
    titles = {"master", "mr", "miss", "mrs", "ms", "sir", "lady", "dr", "rev", "fr", "capt", "col", "mme",
              "major", "mlle", "jonkheer", "countess", "uruchurtu", "dona"}
    # Convert to text before calling string methods, so a missing value does
    # not raise AttributeError on .lower().
    cleaned = str(row).lower().replace('.', '')
    # NOTE(review): only periods are removed, so a token that still carries a
    # comma (e.g. "uruchurtu,") can never match an entry of `titles`; "don" is
    # also not listed. Confirm whether those rows are meant to return None.
    for name in cleaned.split():
        if name in titles:
            return name
    return None

def predict(X_train,y_train,X_val, y_val,model):
    """Print precision, accuracy, F1 and recall of ``model`` on two datasets.

    The model predicts both the training and the validation features, and
    each set of predictions is scored against its labels. Nothing is
    returned; the scores are written to standard output.
    """
    train_guess = model.predict(X_train)
    val_guess = model.predict(X_val)

    # (line template, scoring function) pairs, in the order they are printed
    score_lines = (
        ("\t-Precision Score: {:.4f}\n", precision_score),
        ("\t-Accuracy Score: {:.4f}\n", accuracy_score),
        ("\t-F1 Score: {:.4f}\n", f1_score),
        ("\t-Recall Score: {:.4f}\n", recall_score),
    )
    sections = (
        ("---Training set---\n", y_train, train_guess),
        ("---Validation set---\n", y_val, val_guess),
    )
    for header, truth, guess in sections:
        print(header)
        for template, scorer in score_lines:
            print(template.format(scorer(truth, guess)))

def CreateSub(origindf, preddf, model ,filename, out_dir='/kaggle/working/'):
    """Write the model's predictions to ``<out_dir>/<filename>.csv``.

    Parameters
    ----------
    origindf : pandas.DataFrame
        Frame holding the ``PassengerId`` column, in the same row order as
        ``preddf``.
    preddf : array-like
        Processed features passed to ``model.predict``.
    model : object
        Fitted estimator exposing ``predict``.
    filename : str
        File name without the ``.csv`` extension.
    out_dir : str, optional
        Target directory. Defaults to the Kaggle working directory.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids.
    """
    import os  # local import: `os` is not among the file-level imports

    preds = np.asarray(model.predict(preddf)).ravel()
    # Pair ids and predictions by position. An index-aligned pd.concat would
    # produce NaN rows whenever origindf does not use a default RangeIndex.
    df_pred = pd.DataFrame({
        "PassengerId": origindf["PassengerId"].to_numpy(),
        "Survived": preds,
    })
    df_pred.to_csv(os.path.join(out_dir, filename + ".csv"), index=False)

def PlotSurvCorrelation(df,column):
    """Plot, for each unique value of ``column``, how many passengers survived.

    A grouped bar chart is drawn with one 'Survived' and one 'Not Survived'
    bar per category. Missing values of ``column`` are left out.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a ``Survived`` column coded as 1 / 0.
    column : str
        Name of the column whose categories are compared.
    """
    #Get the names of the categories
    names = tuple(sorted(df[column].dropna().unique()))
    survived = df["Survived"] == 1
    not_survived = df["Survived"] == 0

    #Count survivors and non-survivors for each category with combined masks
    surv_dict = {'Survived': [], 'Not Survived': []}
    for name in names:
        in_category = df[column] == name
        surv_dict['Survived'].append(int((in_category & survived).sum()))
        surv_dict['Not Survived'].append(int((in_category & not_survived).sum()))

    x = np.arange(len(names))
    width = 0.25
    fig, ax = plt.subplots(figsize=(20,10))
    #Creates the bars setting the height as the count from before
    for position, (surv, count) in enumerate(surv_dict.items()):
        rects = ax.bar(x + width * position, count, width, label=surv)
        ax.bar_label(rects, padding=3)
    ax.set_ylabel('Count')
    ax.set_title('Survival count by {}'.format(column))
    #Place each tick at the centre of its group of bars; with two bars drawn
    #at x and x + width, that centre is x + width / 2
    ax.set_xticks(x + width * (len(surv_dict) - 1) / 2, names)
    ax.legend(loc='upper center')

# Dataset reading

Use some basic pandas built-in methods to gain some insights about the data

In [None]:
df_train = pd.read_csv("/kaggle/input/titanic/train.csv")
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
df_train.describe()

In [None]:
df_train.info()

In [None]:
#Counting of NaN elements in our dataset entries
countNan(df_train)
countNan(df_test)

# Data visualization

Get some graphical information about how some features correlate with each other 

### Fare vs. Survived 

In [None]:
fig , ax= plt.subplots(figsize=(20,15))
ax.scatter(df_train["PassengerId"][df_train["Survived"] == 1],
            df_train["Fare"][df_train["Survived"] == 1], label="Survived")
ax.scatter( df_train["PassengerId"][df_train["Survived"] == 0],
            df_train["Fare"][df_train["Survived"] == 0], label="Did not survived")
ax.set_ylabel("Fare", fontsize=25)
ax.legend()
plt.show()


#### Fare distribution

In [None]:
plt.figure(figsize=(20,10))
# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is its replacement. The labels are passed to the plot itself
# and a legend is drawn, so the two groups can be told apart.
sns.histplot(df_train["Fare"][df_train["Survived"] == 1], kde=True, stat="density",
             alpha=0.4, label="Survived")
sns.histplot(df_train["Fare"][df_train['Survived'] == 0], kde=True, stat="density",
             alpha=0.4, label="Did not Survived")
plt.legend()
plt.show()

### Fare vs Pclass 

In [None]:
fig , ax= plt.subplots(figsize=(15,10))
ax.scatter(df_train["Fare"],df_train["Pclass"])
ax.set_xlabel("Fare",fontsize=20)
ax.set_ylabel("Pclass",fontsize=20)
plt.show()

Now compare the unique values in each attribute and see how they relate to the survival of the passenger

### Pclass vs. Survived

In [None]:
PlotSurvCorrelation(df_train,"Pclass")

### SibSp vs. Survived

In [None]:
PlotSurvCorrelation(df_train,"SibSp")

### Parch vs Survived 

In [None]:
PlotSurvCorrelation(df_train,"Parch")

### Age vs Survived 

In [None]:
fig , ax= plt.subplots(figsize=(20,15))
ax.scatter(df_train["PassengerId"][df_train["Survived"] == 1],
            df_train["Age"][df_train["Survived"] == 1], label="Survived")
ax.scatter( df_train["PassengerId"][df_train["Survived"] == 0],
            df_train["Age"][df_train["Survived"] == 0], label="Did not survived")
ax.set_ylabel("Age", fontsize=25)
ax.legend()
plt.show()


In [None]:
PlotSurvCorrelation(df_train.head(400),"Age")

In [None]:
PlotSurvCorrelation(df_train, "Embarked")

# Dataset processing and cleaning

Use the custom transformer to process the dataset in a simple way 
###### 
Separate the label from the rest of the features into the 'y' and 'X' datasets

In [None]:
#Set an instance of the data processor with the transformers and the columns
# - imputer: fills missing values with the most frequent value of the column
# - encoder: one-hot encodes Pclass, Sex and Embarked; categories unseen during
#   fit are ignored at transform time (handle_unknown="ignore")
# - scaler: applied to Age, SibSp, Parch and Fare
# NOTE(review): Normalizer rescales each row (sample) to unit norm rather than
# each column; confirm this is intended, since RobustScaler is imported but unused.
processor = FinalDataProcessor(imputer=SimpleImputer(strategy="most_frequent"),
                              scaler=Normalizer(),
                              encoder=OneHotEncoder(handle_unknown="ignore"),
                              to_encod_cols=["Pclass", "Sex", 'Embarked'],
                              to_scal_cols = ["Age", "SibSp", "Parch", "Fare"])

In [None]:
#Transform the datasets and saving the label as 'y'
X = processor.fit_transform(df_train)
X_test = processor.transform(df_test)
y = df_train["Survived"].copy()

In [None]:
X

In [None]:
X_test

In [None]:
countNan(X)
countNan(X_test)

## Process the Name attribute

As the 'Name' attribute is more complex it requires a deeper processing 

In [None]:
#Applies the TextProcess function to simplify the Name attribute
X = pd.concat([X, pd.DataFrame(df_train["Name"].map(TextProcess))],axis="columns")
X_test = pd.concat([X_test, pd.DataFrame(df_test["Name"].map(TextProcess))],axis="columns")

In [None]:
X

In [None]:
X_test

In [None]:
PlotSurvCorrelation(pd.concat([pd.DataFrame(df_train["Name"].map(TextProcess)), y], axis="columns"), "Name")

In [None]:
#Apply encoding to the Name attribute
processed_name = processor.encoder.fit_transform(np.array(X["Name"]).reshape(-1,1)).toarray()
X.drop(["Name"], inplace=True,axis=1)
X = pd.concat([X, pd.DataFrame(processed_name, columns=processor.encoder.get_feature_names_out())],axis="columns")
X

In [None]:
processed_name = processor.encoder.transform(np.array(X_test["Name"]).reshape(-1,1)).toarray()
X_test.drop("Name",inplace=True,axis=1)
X_test = pd.concat([X_test, pd.DataFrame(processed_name, columns=processor.encoder.get_feature_names_out())],axis="columns")
X_test

### Exclude columns with too few positive values

In [None]:
#Select the columns with less than 41 positive values to simplify the dataset and prevent overfitting
#Each encoded title column only holds 0/1, so its sum is the number of positive
#values; unlike value_counts()[1], this also works for a column without any 1
columns = [name for name in processor.encoder.get_feature_names_out()
           if X[name].sum() < 41]

columns

In [None]:
X.drop(columns, axis=1, inplace=True)
X

In [None]:
X_test.drop(columns, axis=1, inplace=True)
X_test

## Process Ticket attribute 

Same thing with the Ticket attribute
###### 
As it is unique for each passenger, we will need to find patterns that could be useful to our model
###### 
Starting with the prefix in the ticket

In [None]:
#Matches everything up to (and including) the last space of the ticket, i.e.
#the textual prefix that comes before the ticket number
ticket_prefix_pattern = re.compile(r'\w+/*.* ')


def count_ticket_prefixes(tickets):
    """Return a dict mapping each ticket prefix to its number of occurrences.

    Tickets without a prefix (no match for the pattern) are skipped.
    """
    counts = {}
    for ticket in tickets:
        match = ticket_prefix_pattern.search(ticket)
        if match is not None:
            counts[match[0]] = counts.get(match[0], 0) + 1
    return counts


#Use regular expressions to get the prefix of the ticket and create a dictionary with the count.
#Each dataset is processed on its own: zipping the two series together stops at
#the shorter one and silently leaves part of the longer one uncounted.
train_tickets_prefixes = count_ticket_prefixes(df_train['Ticket'].unique())
test_tickets_prefixes = count_ticket_prefixes(df_test['Ticket'])

print('Train ticket prefixes: ')
print(train_tickets_prefixes)
print('Count: ', len(train_tickets_prefixes))
print()
print('Test ticket prefixes: ')
print(test_tickets_prefixes)
print('Count: ', len(test_tickets_prefixes))

In [None]:
df_prefix = pd.DataFrame(data=train_tickets_prefixes, index=['Count']).transpose()
df_prefix

In [None]:
df_prefix.plot(kind='bar', figsize = (10,6) )

In [None]:
#Create a Data frame to compare the survived attribute with the prefix existence
df_ticket = df_train[['Survived', 'Ticket']].copy()

#Flag (1/0) the tickets whose prefix is one of the known train prefixes.
#The whole column is assigned at once: chained assignment such as
#df['col'][i] = 1 may write to a temporary copy and leave the frame unchanged.
prefix_pattern = re.compile(r'\w+/*.* ')
prefix_matches = [prefix_pattern.search(ticket) for ticket in df_ticket['Ticket']]
df_ticket['Ticket prefix'] = [
    int(match is not None and match[0] in train_tickets_prefixes)
    for match in prefix_matches
]

In [None]:
df_ticket

In [None]:
PlotSurvCorrelation(df_ticket, 'Ticket prefix')

Then evaluate the first digit of the ticket number and its length

In [None]:
df_ticket['Ticket Len'] = 0
df_ticket['Ticket start'] = 0
df_ticket

In [None]:
#Collect the first digit and the length of the ticket number to contrast them with the Survived attribute.
#The ticket number is the first run of two or more digits; tickets without one keep 0 in both columns.
#Whole columns are assigned at once instead of df['col'][i] = ..., which may
#write to a temporary copy and leave the frame unchanged.
number_pattern = re.compile(r'\d\d+')
number_matches = [number_pattern.search(ticket) for ticket in df_ticket['Ticket']]
df_ticket['Ticket Len'] = [len(match[0]) if match else 0 for match in number_matches]
df_ticket['Ticket start'] = [int(match[0][0]) if match else 0 for match in number_matches]

In [None]:
df_ticket

In [None]:
PlotSurvCorrelation(df_ticket, 'Ticket Len')

In [None]:
PlotSurvCorrelation(df_ticket, 'Ticket start')

In [None]:
#Add the created features to the X dataset
#One-hot encode the first digit of the ticket number. This re-fits the shared
#encoder, so from here on it describes the 'Ticket start' categories.
ticket_start_encoded = processor.encoder.fit_transform(np.array(df_ticket[['Ticket start']])).toarray()
#Name the columns after the categories the encoder actually learned instead of
#assuming that all ten digits are present
ticket_start = pd.DataFrame(ticket_start_encoded,
                            columns=['Ticket_{}'.format(int(digit)) for digit in processor.encoder.categories_[0]])
#Passing the lengths as a single row makes the Normalizer divide the whole
#column by its norm
ticket_len = processor.scaler.transform(np.array(df_ticket['Ticket Len']).reshape(1, -1))

In [None]:
X['Ticket len']=ticket_len.reshape(-1,1)
X = pd.concat([X, ticket_start], axis='columns')

In [None]:
X

In [None]:
#All the previous ticket processing applied to the test dataset
df_ticket = df_test[['Ticket']].copy()

prefix_pattern = re.compile(r'\w+/*.* ')
number_pattern = re.compile(r'\d\d+')

#Whole columns are assigned at once: chained assignment (df['col'][i] = ...)
#may write to a temporary copy and leave the frame unchanged.
prefix_matches = [prefix_pattern.search(ticket) for ticket in df_ticket['Ticket']]
df_ticket['Ticket prefix'] = [
    int(match is not None and match[0] in train_tickets_prefixes)
    for match in prefix_matches
]

#First run of two or more digits; tickets without one keep 0 in both columns
number_matches = [number_pattern.search(ticket) for ticket in df_ticket['Ticket']]
df_ticket['Ticket Len'] = [len(match[0]) if match else 0 for match in number_matches]
df_ticket['Ticket start'] = [int(match[0][0]) if match else 0 for match in number_matches]

#Encode with the encoder already fitted on the train 'Ticket start' values and
#name the columns after the categories it learned
ticket_start = pd.DataFrame(processor.encoder.transform(np.array(df_ticket[['Ticket start']])).toarray(),
                            columns=['Ticket_{}'.format(int(digit)) for digit in processor.encoder.categories_[0]])

#Scale the test lengths with the norm of the TRAIN lengths. Normalising the
#test column by its own norm would put train and test on different scales,
#because the two columns have a different number of rows.
train_ticket_lens = np.array(
    [len(match[0]) if match else 0
     for match in (number_pattern.search(ticket) for ticket in df_train['Ticket'])],
    dtype=float)
train_len_norm = np.linalg.norm(train_ticket_lens)
ticket_len = df_ticket['Ticket Len'].to_numpy(dtype=float)
if train_len_norm > 0:
    ticket_len = ticket_len / train_len_norm

X_test['Ticket len'] = ticket_len
X_test = pd.concat([X_test, ticket_start], axis='columns')

### Exclude columns with too few positive values

In [None]:
#Select the one-hot 'Ticket_<digit>' columns with less than 200 positive values.
#The columns hold 0/1, so the sum is the number of positives; this also works
#for a column without any 1 and for digits that never produced a column.
columns = [name for name in X.columns
           if str(name).startswith('Ticket_') and X[name].sum() < 200]

columns

In [None]:
X.drop(columns, axis=1, inplace=True)
X

In [None]:
X_test.drop(columns, axis=1, inplace=True)
X_test

# Divide Dataset 

In [None]:
#Divide the dataset into train-validation sets to reduce overfitting and have a better evaluation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
print("Training Set lenght: {}".format(len(X_train)))
print("Validation Set lenght: {}".format(len(X_val)))

In [None]:
X_train.info()

In [None]:
X_train

In [None]:
X_test

# Model construction 

We will train and test some popular models to compare them and choose the best ones for our voting classifier
###### 
Also, to improve performance, we're going to use GridSearchCV to find the hyperparameters that improve the score the most.


## Random Forest

In [None]:
#Train a Random Forest classifier with default parameters
clf_rnd = RandomForestClassifier( 
                                 n_jobs=-1, 
                                 random_state=42)
clf_rnd.fit(X_train, y_train)

In [None]:
predict(X_train, y_train, X_val, y_val, clf_rnd)

One of the benefits of the Random Forest model is that it allows us to see which features are likely to be the most important in the dataset

In [None]:
#Now we can get the most important features of our dataset
feature_importances = {name: score for name, score in zip(list(X_train), clf_rnd.feature_importances_)}

In [None]:
feature_importances_sorted = pd.Series(feature_importances).sort_values(ascending=False)
feature_importances_sorted.head(10)

In [None]:
#Create a reduced version of the datasets with the most important features
columns =  [feature for feature in feature_importances_sorted.index if feature_importances_sorted[feature] > 0.02]
X_train_reduced = X_train[columns].copy()
X_val_reduced = X_val[columns].copy()
X_test_reduced = X_test[columns].copy()

In [None]:
X_train_reduced

### Search for the best hyperparameters with GridSearchCV

In [None]:
# Search the best hiperparameters for the dataset
param_grid = {"n_estimators": [50, 75, 100], 
               "max_leaf_nodes":[16, 32, None], 
               "min_samples_leaf":[2, 4, 8],
               "max_features": [None, "log2", "sqrt"],
               "max_depth": [16, 32, None], 
               "min_samples_split": [2, 4, 8],
               "criterion": ["gini", "entropy", "log_loss"]} # Hiperparameter combinations

clf_rnd = RandomForestClassifier(random_state=42) # Model instance
grid_rnd = GridSearchCV(clf_rnd, param_grid=param_grid, cv=5, n_jobs=-1) #Grid Search instance
start_time = time.time()
grid_rnd.fit(X_train, y_train) # Model training
end_time = time.time()
final_time = end_time-start_time #Timing the training time
print("The training lasted: {} : {} : {:.2f}".format(final_time//3600, (final_time%3600)//60, final_time%3600%60))

In [None]:
#Show the parameteres that got the best score
grid_rnd.best_params_

In [None]:
predict(X_train,y_train,X_val, y_val, grid_rnd)

In [None]:
grid_rnd.best_score_

In [None]:
#Feature importance with the new model
feature_importances = {name: score for name, score in zip(list(X_train), grid_rnd.best_estimator_.feature_importances_)}

In [None]:
feature_importances_sorted = pd.Series(feature_importances).sort_values(ascending=False)
feature_importances_sorted.head(20)

In [None]:
important_features = [feature for feature, value in zip(feature_importances_sorted.index, feature_importances_sorted) if value >0.05]
important_features

In [None]:
X_train_reduced = X_train[important_features].copy()
X_val_reduced = X_val[important_features].copy()
X_test_reduced = X_test[important_features].copy()

## Logistic Regression

In [None]:
# Train a Logistic Regression model with default parameters
clf_lr = LogisticRegression()
clf_lr.fit(X_train,y_train)

In [None]:
predict(X_train,y_train, X_val, y_val, clf_lr)

### Search for the best hyperparameters with GridSearchCV

In [None]:
# Hyperparameter combinations (each dict pairs solvers with the penalties they are searched with)
param_grid = [{"solver": ['lbfgs', 'newton-cg', 'sag'],
               "penalty": ["l2"],
               "fit_intercept": [True, False],
               "C": [0.1, 0.5, 1, 5, 10]
              },
              {'solver': ['liblinear'],
               'penalty': ['l1', 'l2'],
               "fit_intercept": [True, False],
               "C": [0.1, 0.5, 1, 5, 10]
              },
              {'solver': ['saga'],
               'penalty': ['elasticnet'],
               # 0.25 was listed twice, which only repeated the same fit;
               # 0.75 completes the evidently intended low/mid/high mix
               'l1_ratio': [0.25, 0.5, 0.75]
              }]

clf_lr = LogisticRegression(random_state=42, max_iter=int(1e3)) # Model instance
grid_lr = GridSearchCV(clf_lr, param_grid=param_grid, cv=5, n_jobs=-1) # Grid Search instance
start_time = time.time()
grid_lr.fit(X_train, y_train) # Model training
end_time = time.time()
final_time = end_time-start_time # Timing the training time
print("The training lasted: {} : {} : {:.2f}".format(final_time//3600, (final_time%3600)//60, final_time%3600%60))

In [None]:
# Show the parameteres that got the best score
grid_lr.best_params_

In [None]:
predict(X_train,y_train, X_val, y_val, grid_lr)

In [None]:
grid_lr.best_score_

## Poly SVM

In [None]:
# Train a SVC model with a linear kernel with default parameters
clf_svm = SVC(kernel="linear", random_state=42)
clf_svm.fit(X_train,y_train)

In [None]:
predict(X_train ,y_train, X_val, y_val, clf_svm)

### Search for the best hyperparameters with GridSearchCV

In [None]:
# Hiperparameter combinations 
param_grid = {"kernel": ["poly"], 
              "degree": [2, 3], 
              "coef0": [0, 0.1, 1, 5], 
              "C": [0.1, 1, 5], 
              "gamma": [5, 10, 15]}

clf_svm = SVC(random_state=42)# Model instance
grid_svm = GridSearchCV(clf_svm, param_grid=param_grid, n_jobs=-1, cv=5, return_train_score=True) # Grid Search instance
start_time = time.time()
grid_svm.fit(X_train, y_train) # Model training
end_time = time.time()
final_time = end_time-start_time # Timing the training time
print("The training lasted: {} : {} : {:.2f}".format(final_time//3600, (final_time%3600)//60, final_time%3600%60))

In [None]:
# Show the parameteres that got the best score
grid_svm.best_params_

In [None]:
predict(X_train,y_train, X_val, y_val, grid_svm)

In [None]:
grid_svm.best_score_

## Gaussian SVM

In [None]:
# Train a SVC model with 'rbf' kernel with default parameters
clf_rbf = SVC(kernel="rbf", random_state=42)
clf_rbf.fit(X_train, y_train)

In [None]:
predict(X_train,y_train, X_val, y_val, clf_rbf)

### Search for the best hyperparameters with GridSearchCV

In [None]:
# Hiperparameter combinations
param_grid = {"kernel": ["rbf"], 
            "gamma": [0.01, 0.1, 1, 5, 10, 20, 'scale', 'auto'], 
              "C": [0.01, 0.1, 1, 4, 5, 10, 20],
             "random_state": [42]}

clf_rbf = SVC() # Model instance
grid_rbf = GridSearchCV(clf_rbf, param_grid=param_grid, n_jobs=-1, cv=5, return_train_score=True) # Grid Search instance
start_time = time.time()
grid_rbf.fit(X_train, y_train) # Model training
end_time = time.time()
final_time = end_time-start_time # Timing the training time
print("The training lasted: {} : {} : {:.2f}".format(final_time//3600, (final_time%3600)//60, final_time%3600%60)) 

In [None]:
# Show the parameteres that got the best score
grid_rbf.best_params_

In [None]:
predict(X_train,y_train, X_val, y_val, grid_rbf)

In [None]:
grid_rbf.best_score_

## Naive Bayes 

In [None]:
# Train a Naive Bayes model with default parameters
clf_nb = BernoulliNB()
clf_nb.fit(X_train ,y_train)

In [None]:
predict(X_train,y_train, X_val, y_val, clf_nb)

### Search for the best hyperparameters with GridSearchCV

In [None]:
param_grid = {"alpha":[1.0e-10, 5e-10, 0, 1, 5]} # Hiperparameter combinations


clf_nb = BernoulliNB() # Model instance
grid_nb = GridSearchCV(clf_nb, param_grid=param_grid, n_jobs=-1, cv=5) # Grid Search instance
start_time = time.time() 
grid_nb.fit(X_train, y_train) # Model training
end_time = time.time()
final_time = end_time-start_time # Timing the training time
print("The training lasted: {} : {} : {:.2f}".format(final_time//3600, (final_time%3600)//60, final_time%3600%60))

In [None]:
# Show the parameteres that got the best score
grid_nb.best_params_

In [None]:
predict(X_train,y_train, X_val, y_val, grid_nb)

In [None]:
grid_nb.best_score_

## Ridge Classifier

In [None]:
# Train a Ridge classifier with default parameters
rdg_clf = RidgeClassifier(random_state=42)
rdg_clf.fit(X_train, y_train)

In [None]:
predict(X_train, y_train, X_val, y_val, rdg_clf)

### Search for the best hyperparameters with GridSearchCV

In [None]:
# Hiperparameter combinations
param_grid = [{'alpha':[0.5, 1, 2, 4], 
              'solver': ['svd', 'cholesky', 'sparse_cg', 'lsqr', 'sag', 'auto'], 
              'tol': [1e-4, 1e-3, 1e-2], 
              'positive':[False]}, 
              {'alpha':[0.5, 1, 2 , 4], 
              'solver':['lbfgs', 'auto'], 
              'positive': [True], 
              'tol': [1e-4, 1e-3, 1e-2],
              }]

clf_rdg = RidgeClassifier(random_state=42) # Model instance
grid_rdg=GridSearchCV(clf_rdg, param_grid=param_grid, cv=5, n_jobs=-1) # Grid Search instance
start_time = time.time()
grid_rdg.fit(X_train, y_train) # Model training
end_time = time.time()
final_time = end_time-start_time # Timing the training time
print("The training lasted: {} : {} : {:.2f}".format(final_time//3600, (final_time%3600)//60, final_time%3600%60))

In [None]:
# Show the parameteres that got the best score
grid_rdg.best_params_

In [None]:
predict(X_train, y_train, X_val, y_val, grid_rdg)

In [None]:
grid_rdg.best_score_

## Perceptron

In [None]:
# Train a Perceptron model with default parameters
clf_pct = Perceptron(random_state=42)
clf_pct.fit(X_train, y_train)

In [None]:
predict(X_train, y_train, X_val, y_val, clf_pct)

### Search for the best hyperparameters with GridSearchCV

In [None]:
# Hyperparameter combinations (l1_ratio is only searched together with the elasticnet penalty)
param_grid = [{'penalty': ['l1', 'l2', None],
               'alpha': [0, 0.01, 0.001, 0.0001, 0.1, 1, 5, 10],
               'eta0': [0.1, 1, 5, 10],
               'early_stopping': [True, False]},
              {'penalty': ['elasticnet'],
               'alpha': [0, 0.01, 0.001, 0.0001, 0.1, 1, 5, 10],
               'eta0': [0.1, 1, 5, 10],
               'early_stopping': [True, False],
               # 0.25 was listed twice, which only repeated the same fits;
               # 0.75 completes the evidently intended low/mid/high mix
               'l1_ratio': [0.25, 0.5, 0.75]}
             ]

clf_pct = Perceptron(random_state=42) # Model instance
grid_pct = GridSearchCV(clf_pct, param_grid=param_grid, cv=5, n_jobs=-1) # Grid Search instance
start_time = time.time()
grid_pct.fit(X_train, y_train) # Model training
end_time = time.time()
final_time = end_time-start_time # Timing the training time
print("The training lasted: {} : {} : {:.2f}".format(final_time//3600, (final_time%3600)//60, final_time%3600%60))

In [None]:
# Show the parameteres that got the best score
grid_pct.best_params_

In [None]:
predict(X_train, y_train, X_val, y_val, grid_pct)

In [None]:
grid_pct.best_score_

## Ensembles: Voting classifier

Now that all the models are trained and we have checked their scores, we can choose some of them and combine them in a Voting classifier

In [None]:
#Select some classifiers, combine and train them 
clf_vote = VotingClassifier(estimators=[('Random Forest', grid_rnd.best_estimator_),
                                        ('RBF SVM', grid_rbf.best_estimator_)])
clf_vote.fit(X, y)

In [None]:
# clf_vote was fitted on the full X, which contains every row of X_val, so
# scoring it directly on X_val would report an optimistic validation score.
# Evaluate an unfitted copy trained on X_train only; clf_vote is left untouched.
from sklearn.base import clone
clf_vote_eval = clone(clf_vote).fit(X_train, y_train)
predict(X_train, y_train, X_val, y_val, clf_vote_eval)

# Create submission 

Now that we have our final model, it's time to make the prediction and upload it to the competition

In [None]:
final_model = clf_vote
CreateSub(df_test, X_test, final_model, "submission")

In [None]:
final_model

# Save the model

It is also possible to save the model and load it later, either to compare it or to keep it for later use

In [None]:
dump(clf_vote, '/kaggle/working/clf_vote_5.joblib')

## Load the previous model

In [None]:
clf_vote_1 = load('/kaggle/working/clf_vote_5.joblib')

In [None]:
clf_vote_1