In [None]:
# Installing the feature-engine library that is not installed by default in the Kaggle Noteboooks.
# !pip install feature-engine

In [1]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from feature_engine.encoding import RareLabelEncoder
from feature_engine.encoding import MeanEncoder
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)

In [2]:
def importing_data(train_path, validation_path):

    train_data = pd.read_csv(train_path)
    validation_data = pd.read_csv(validation_path).drop(columns = ['PassengerId'])
    validation_data_ids = pd.read_csv(validation_path)['PassengerId']

    return train_data, validation_data, validation_data_ids

train_titanic_path = 'datasets/train.csv'
validation_titanic_path = 'datasets/test.csv'

train_titanic, validation_titanic, validation_data_ids = importing_data(train_titanic_path, validation_titanic_path)


In [3]:
train_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
validation_titanic.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
validation_data_ids.head()

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

In [6]:
# Dropping the "PassengerId" column from the training data.
def dropping_train_id(train_data, id_column):
    
    train_data = train_data.drop(columns = [id_column])

    return train_data

train_titanic = dropping_train_id(train_titanic, 'PassengerId')

In [7]:
# Renaming the values of the "Pclass" column for the training and validation data.
def renaming_pclass_values(dataframe, pclass_column):

    dataframe[pclass_column] = dataframe[pclass_column].replace({1: 'first class', 2: 'second class', 3: 'third class'})

    return dataframe

train_titanic = renaming_pclass_values(train_titanic, 'Pclass')
validation_titanic = renaming_pclass_values(validation_titanic, 'Pclass')

In [8]:
# Creating the "SocialTitle" column using the social abbreviations from the "Name" column.

def extracting_social_title(dataframe, name_column):

    social_titles = list()

    for name in dataframe[name_column].to_list():
        
        start = name.find(', ') + len(', ')
        end = name.find('. ')
        title = name[start:end]
        social_titles.append(str(title))

    dataframe['SocialTitle'] = social_titles

    return dataframe

train_titanic = extracting_social_title(train_titanic, 'Name')
validation_titanic = extracting_social_title(validation_titanic, 'Name')

In [9]:
train_titanic

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SocialTitle
0,0,third class,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr
1,1,first class,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,1,third class,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,1,first class,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
4,0,third class,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,second class,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev
887,1,first class,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss
888,0,third class,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss
889,1,first class,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr


In [10]:
# Generating the "NameType" column using the name column, which distinguishes between individual and double names.
def extracting_name_type(dataframe, name_column):

    name_type = list()

    for name in dataframe[name_column].to_list():
        
        if '(' in name:
            name_type.append('double name')
        else:
            name_type.append('single name')

    dataframe['NameType'] = name_type

    return dataframe

train_titanic = extracting_name_type(train_titanic, 'Name')
validation_titanic = extracting_name_type(validation_titanic, 'Name')

In [11]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SocialTitle,NameType
0,0,third class,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,single name
1,1,first class,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,double name
2,1,third class,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,single name
3,1,first class,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,double name
4,0,third class,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,single name


In [12]:
# Extracting the surnames of the passengers using the "Name" column. Finally dropping the "Name" column.
def extracting_surname(dataframe, name_column):

    surnames = list()

    for name in dataframe[name_column].to_list():
        
        start = name.find('') + len('')
        end = name.find(', ')
        surname = name[start:end]
        surnames.append(str(surname))

    dataframe['Surname'] = surnames
    dataframe = dataframe.drop(columns = [name_column])

    return dataframe

train_titanic = extracting_surname(train_titanic, 'Name')
validation_titanic = extracting_surname(validation_titanic, 'Name')

In [13]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SocialTitle,NameType,Surname
0,0,third class,male,22.0,1,0,A/5 21171,7.25,,S,Mr,single name,Braund
1,1,first class,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,double name,Cumings
2,1,third class,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,single name,Heikkinen
3,1,first class,female,35.0,1,0,113803,53.1,C123,S,Mrs,double name,Futrelle
4,0,third class,male,35.0,0,0,373450,8.05,,S,Mr,single name,Allen


In [14]:
# Summing up the "Parch" and the "SibSp" columns to get the "TotalCompanions" column, taking into account 
# the value of the passenger.

def extracting_total_companions(dataframe, parch_column, sibsp_column):

    dataframe['TotalCompanions'] = dataframe[parch_column] + dataframe[sibsp_column] + 1

    return dataframe

train_titanic = extracting_total_companions(train_titanic, 'Parch', 'SibSp')
validation_titanic = extracting_total_companions(validation_titanic, 'Parch', 'SibSp')

In [15]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SocialTitle,NameType,Surname,TotalCompanions
0,0,third class,male,22.0,1,0,A/5 21171,7.25,,S,Mr,single name,Braund,2
1,1,first class,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,double name,Cumings,2
2,1,third class,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,single name,Heikkinen,1
3,1,first class,female,35.0,1,0,113803,53.1,C123,S,Mrs,double name,Futrelle,2
4,0,third class,male,35.0,0,0,373450,8.05,,S,Mr,single name,Allen,1


In [16]:
# Extracting all the posible information from the "Ticket" column. The letters, the numbers, the symbols and 
# the total characters.
def dissecting_ticket(dataframe, ticket_column):

    letters = list()
    numbers = list()
    symbols = list()

    characters = list()

    for ticket in dataframe[ticket_column].to_list():

        letter_count = 0
        number_count = 0
        symbol_count = 0
        
        for character in ticket:

            if character.isalpha():
                letter_count += 1

            elif character.isnumeric():
                number_count += 1

            elif character == " ":
                pass

            else:
                symbol_count += 1
        
        letters.append(letter_count)
        numbers.append(number_count)
        symbols.append(symbol_count)

        total_characters = letter_count + number_count + symbol_count
        characters.append(total_characters)

    dataframe['TicketLetters'] = letters
    dataframe['TicketNumbers'] = numbers
    dataframe['TicketSymbols'] = symbols
    dataframe['TicketCharacters'] = characters

    dataframe = dataframe.drop(columns = [ticket_column])

    return dataframe

train_titanic = dissecting_ticket(train_titanic, 'Ticket')
validation_titanic = dissecting_ticket(validation_titanic, 'Ticket')

In [17]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters
0,0,third class,male,22.0,1,0,7.25,,S,Mr,single name,Braund,2,1,6,1,8
1,1,first class,female,38.0,1,0,71.2833,C85,C,Mrs,double name,Cumings,2,2,5,0,7
2,1,third class,female,26.0,0,0,7.925,,S,Miss,single name,Heikkinen,1,5,8,2,15
3,1,first class,female,35.0,1,0,53.1,C123,S,Mrs,double name,Futrelle,2,0,6,0,6
4,0,third class,male,35.0,0,0,8.05,,S,Mr,single name,Allen,1,0,6,0,6


In [18]:
# Getting the letter of the cabin using the "Cabin" column. Generating a new column named "CabinLetter".
def extracting_cabin_letter(dataframe, cabin_column):

    letters = list()

    dataframe[cabin_column] = dataframe[cabin_column].fillna('unknown')

    for cabin in dataframe[cabin_column].to_list():

        if cabin == 'unknown':
            letters.append('unknown')

        else:
            letters.append(str(cabin[0]))


    dataframe['CabinLetter'] = letters
    dataframe = dataframe.drop(columns = [cabin_column])

    return dataframe

train_titanic = extracting_cabin_letter(train_titanic, 'Cabin')
validation_titanic = extracting_cabin_letter(validation_titanic, 'Cabin')

In [19]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter
0,0,third class,male,22.0,1,0,7.25,S,Mr,single name,Braund,2,1,6,1,8,unknown
1,1,first class,female,38.0,1,0,71.2833,C,Mrs,double name,Cumings,2,2,5,0,7,C
2,1,third class,female,26.0,0,0,7.925,S,Miss,single name,Heikkinen,1,5,8,2,15,unknown
3,1,first class,female,35.0,1,0,53.1,S,Mrs,double name,Futrelle,2,0,6,0,6,C
4,0,third class,male,35.0,0,0,8.05,S,Mr,single name,Allen,1,0,6,0,6,unknown


In [20]:
# Renaming the values of the "Embarked" column for the training and validation data.
def renaming__embarked_values(dataframe, embarked_column):

    dataframe[embarked_column] = dataframe[embarked_column].replace({'S': 'southampton', 'C': 'cherbourg', 'Q': 'queenstown'})

    return dataframe

train_titanic = renaming__embarked_values(train_titanic, 'Embarked')
validation_titanic = renaming__embarked_values(validation_titanic, 'Embarked')

In [21]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter
0,0,third class,male,22.0,1,0,7.25,southampton,Mr,single name,Braund,2,1,6,1,8,unknown
1,1,first class,female,38.0,1,0,71.2833,cherbourg,Mrs,double name,Cumings,2,2,5,0,7,C
2,1,third class,female,26.0,0,0,7.925,southampton,Miss,single name,Heikkinen,1,5,8,2,15,unknown
3,1,first class,female,35.0,1,0,53.1,southampton,Mrs,double name,Futrelle,2,0,6,0,6,C
4,0,third class,male,35.0,0,0,8.05,southampton,Mr,single name,Allen,1,0,6,0,6,unknown


In [22]:
# Imputing the "Age" column in the training and validation data using the Iterative Imputer function.
def imputing_age(dataframe_train, dataframe_test, age_column):

    iterative_imputer = IterativeImputer()

    dataframe_train[age_column] = iterative_imputer.fit_transform(dataframe_train[age_column].values.reshape(-1,1))
    dataframe_train[age_column] = dataframe_train[age_column].astype('int64')

    dataframe_test[age_column] = iterative_imputer.transform(dataframe_test[age_column].values.reshape(-1,1))
    dataframe_test[age_column] = dataframe_test[age_column].astype('int64')

    return dataframe_train, dataframe_test

train_titanic, validation_titanic = imputing_age(train_titanic, validation_titanic, 'Age')

In [23]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter
0,0,third class,male,22,1,0,7.25,southampton,Mr,single name,Braund,2,1,6,1,8,unknown
1,1,first class,female,38,1,0,71.2833,cherbourg,Mrs,double name,Cumings,2,2,5,0,7,C
2,1,third class,female,26,0,0,7.925,southampton,Miss,single name,Heikkinen,1,5,8,2,15,unknown
3,1,first class,female,35,1,0,53.1,southampton,Mrs,double name,Futrelle,2,0,6,0,6,C
4,0,third class,male,35,0,0,8.05,southampton,Mr,single name,Allen,1,0,6,0,6,unknown


In [24]:
# Imputing the "Embarked" column in the training data using the values from similar passengers to 
# the missing ones.
def imputing_embarked(dataframe, embarked_column):

    dataframe[embarked_column] = dataframe[embarked_column].fillna('cherbourg')

    return dataframe

train_titanic = imputing_embarked(train_titanic, 'Embarked')

In [25]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter
0,0,third class,male,22,1,0,7.25,southampton,Mr,single name,Braund,2,1,6,1,8,unknown
1,1,first class,female,38,1,0,71.2833,cherbourg,Mrs,double name,Cumings,2,2,5,0,7,C
2,1,third class,female,26,0,0,7.925,southampton,Miss,single name,Heikkinen,1,5,8,2,15,unknown
3,1,first class,female,35,1,0,53.1,southampton,Mrs,double name,Futrelle,2,0,6,0,6,C
4,0,third class,male,35,0,0,8.05,southampton,Mr,single name,Allen,1,0,6,0,6,unknown


In [26]:
# Imputing the "Fare" column in the validation data using the Iterative Imputer function.
def imputing_fare(dataframe_train, dataframe_validation, fare_column):

    iterative_imputer = IterativeImputer()

    iterative_imputer.fit(dataframe_train[fare_column].values.reshape(-1,1))

    dataframe_validation[fare_column] = iterative_imputer.transform(dataframe_validation[fare_column].values.reshape(-1,1))

    return dataframe_validation

validation_titanic = imputing_fare(train_titanic, validation_titanic, 'Fare')

In [27]:
train_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter
0,0,third class,male,22,1,0,7.25,southampton,Mr,single name,Braund,2,1,6,1,8,unknown
1,1,first class,female,38,1,0,71.2833,cherbourg,Mrs,double name,Cumings,2,2,5,0,7,C
2,1,third class,female,26,0,0,7.925,southampton,Miss,single name,Heikkinen,1,5,8,2,15,unknown
3,1,first class,female,35,1,0,53.1,southampton,Mrs,double name,Futrelle,2,0,6,0,6,C
4,0,third class,male,35,0,0,8.05,southampton,Mr,single name,Allen,1,0,6,0,6,unknown


In [28]:
# Encoding by frequency the variable "SocialTitle", the variable "Surname" and the variable "CabinLetter".
def frequency_social_surnames_cabinletter(dataframe_train, dataframe_validation, socialtitle_column, surname_column, cabinletter_column, target):

    y_variable = dataframe_train[target]
    dataframe_train = dataframe_train.drop(columns = [target])

    rare_label_encoder = RareLabelEncoder(tol = 0.002, n_categories = 2, variables = [socialtitle_column, surname_column, cabinletter_column], replace_with = 'rare')
    
    dataframe_train = rare_label_encoder.fit_transform(dataframe_train)
    dataframe_validation = rare_label_encoder.transform(dataframe_validation)

    frequency_encoder = MeanEncoder(variables = [socialtitle_column, surname_column, cabinletter_column])

    dataframe_train = frequency_encoder.fit_transform(dataframe_train, y_variable)
    dataframe_validation = frequency_encoder.transform(dataframe_validation)

    dataframe_train[target] = y_variable

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = frequency_social_surnames_cabinletter(train_titanic, validation_titanic, 'SocialTitle', 'Surname', 'CabinLetter', 'Survived')

In [31]:
train_titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter,Survived
0,third class,male,22,1,0,7.25,southampton,0.156673,single name,0.0,2,1,6,1,8,0.299854,0
1,first class,female,38,1,0,71.2833,cherbourg,0.792,double name,0.359551,2,2,5,0,7,0.59322,1
2,third class,female,26,0,0,7.925,southampton,0.697802,single name,0.359551,1,5,8,2,15,0.299854,1
3,first class,female,35,1,0,53.1,southampton,0.792,double name,0.5,2,0,6,0,6,0.59322,1
4,third class,male,35,0,0,8.05,southampton,0.156673,single name,0.5,1,0,6,0,6,0.299854,0


In [32]:
# Transforming the "Age" column in categorical Bins.
def binning_age_column(dataframe_train, dataframe_validation, age_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 6, encode = 'ordinal', strategy = 'quantile')

    dataframe_train[age_column] = k_bins_discretizer.fit_transform(dataframe_train[age_column].values.reshape(-1,1))
    dataframe_validation[age_column] = k_bins_discretizer.transform(dataframe_validation[age_column].values.reshape(-1,1))

    dataframe_train[age_column] = dataframe_train[age_column].astype('int64')
    dataframe_validation[age_column] = dataframe_validation[age_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_age_column(train_titanic, validation_titanic, 'Age')

In [33]:
train_titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter,Survived
0,third class,male,1,1,0,7.25,southampton,0.156673,single name,0.0,2,1,6,1,8,0.299854,0
1,first class,female,4,1,0,71.2833,cherbourg,0.792,double name,0.359551,2,2,5,0,7,0.59322,1
2,third class,female,2,0,0,7.925,southampton,0.697802,single name,0.359551,1,5,8,2,15,0.299854,1
3,first class,female,4,1,0,53.1,southampton,0.792,double name,0.5,2,0,6,0,6,0.59322,1
4,third class,male,4,0,0,8.05,southampton,0.156673,single name,0.5,1,0,6,0,6,0.299854,0


In [34]:
# Transforming the "Fare" column in categorical Bins.
def binning_fare_column(dataframe_train, dataframe_validation, fare_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 6, encode = 'ordinal', strategy = 'quantile')

    dataframe_train[fare_column] = k_bins_discretizer.fit_transform(dataframe_train[fare_column].values.reshape(-1,1))
    dataframe_validation[fare_column] = k_bins_discretizer.transform(dataframe_validation[fare_column].values.reshape(-1,1))

    dataframe_train[fare_column] = dataframe_train[fare_column].astype('int64')
    dataframe_validation[fare_column] = dataframe_validation[fare_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_fare_column(train_titanic, validation_titanic, 'Fare')

In [35]:
train_titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter,Survived
0,third class,male,1,1,0,0,southampton,0.156673,single name,0.0,2,1,6,1,8,0.299854,0
1,first class,female,4,1,0,5,cherbourg,0.792,double name,0.359551,2,2,5,0,7,0.59322,1
2,third class,female,2,0,0,1,southampton,0.697802,single name,0.359551,1,5,8,2,15,0.299854,1
3,first class,female,4,1,0,5,southampton,0.792,double name,0.5,2,0,6,0,6,0.59322,1
4,third class,male,4,0,0,1,southampton,0.156673,single name,0.5,1,0,6,0,6,0.299854,0


In [36]:
# Transforming the "SibSp" column in categorical Bins.
def binning_sibsp_column(dataframe_train, dataframe_validation, sibsp_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 2, encode = 'ordinal', strategy = 'uniform')

    dataframe_train[sibsp_column] = k_bins_discretizer.fit_transform(dataframe_train[sibsp_column].values.reshape(-1,1))
    dataframe_validation[sibsp_column] = k_bins_discretizer.transform(dataframe_validation[sibsp_column].values.reshape(-1,1))

    dataframe_train[sibsp_column] = dataframe_train[sibsp_column].astype('int64')
    dataframe_validation[sibsp_column] = dataframe_validation[sibsp_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_sibsp_column(train_titanic, validation_titanic, 'SibSp')

In [37]:
# Transforming the "Parch" column in categorical Bins.
def binning_parch_column(dataframe_train, dataframe_validation, parch_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 2, encode = 'ordinal', strategy = 'uniform')

    dataframe_train[parch_column] = k_bins_discretizer.fit_transform(dataframe_train[parch_column].values.reshape(-1,1))
    dataframe_validation[parch_column] = k_bins_discretizer.transform(dataframe_validation[parch_column].values.reshape(-1,1))

    dataframe_train[parch_column] = dataframe_train[parch_column].astype('int64')
    dataframe_validation[parch_column] = dataframe_validation[parch_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_parch_column(train_titanic, validation_titanic, 'Parch')

In [38]:
# Transforming the "TotalCompanions" column in categorical Bins.
def binning_totaltompanions_column(dataframe_train, dataframe_validation, totalcompanions_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 2, encode = 'ordinal', strategy = 'uniform')

    dataframe_train[totalcompanions_column] = k_bins_discretizer.fit_transform(dataframe_train[totalcompanions_column].values.reshape(-1,1))
    dataframe_validation[totalcompanions_column] = k_bins_discretizer.transform(dataframe_validation[totalcompanions_column].values.reshape(-1,1))

    dataframe_train[totalcompanions_column] = dataframe_train[totalcompanions_column].astype('int64')
    dataframe_validation[totalcompanions_column] = dataframe_validation[totalcompanions_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_totaltompanions_column(train_titanic, validation_titanic, 'TotalCompanions')

In [39]:
# Transforming the "TicketLetters" column in categorical Bins.
def binning_ticketletters_column(dataframe_train, dataframe_validation, ticketletters_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 2, encode = 'ordinal', strategy = 'uniform')

    dataframe_train[ticketletters_column] = k_bins_discretizer.fit_transform(dataframe_train[ticketletters_column].values.reshape(-1,1))
    dataframe_validation[ticketletters_column] = k_bins_discretizer.transform(dataframe_validation[ticketletters_column].values.reshape(-1,1))

    dataframe_train[ticketletters_column] = dataframe_train[ticketletters_column].astype('int64')
    dataframe_validation[ticketletters_column] = dataframe_validation[ticketletters_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_ticketletters_column(train_titanic, validation_titanic, 'TicketLetters')

In [40]:
# Transforming the "TicketNumbers" column in categorical Bins.
def binning_ticketnumbers_column(dataframe_train, dataframe_validation, ticketnumbers_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 3, encode = 'ordinal', strategy = 'quantile')

    dataframe_train[ticketnumbers_column] = k_bins_discretizer.fit_transform(dataframe_train[ticketnumbers_column].values.reshape(-1,1))
    dataframe_validation[ticketnumbers_column] = k_bins_discretizer.transform(dataframe_validation[ticketnumbers_column].values.reshape(-1,1))

    dataframe_train[ticketnumbers_column] = dataframe_train[ticketnumbers_column].astype('int64')
    dataframe_validation[ticketnumbers_column] = dataframe_validation[ticketnumbers_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_ticketnumbers_column(train_titanic, validation_titanic, 'TicketNumbers')

In [41]:
# Transforming the "TicketSymbols" column in categorical Bins.
def binning_ticketsymbols_column(dataframe_train, dataframe_validation, ticketsymbols_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 2, encode = 'ordinal', strategy = 'uniform')

    dataframe_train[ticketsymbols_column] = k_bins_discretizer.fit_transform(dataframe_train[ticketsymbols_column].values.reshape(-1,1))
    dataframe_validation[ticketsymbols_column] = k_bins_discretizer.transform(dataframe_validation[ticketsymbols_column].values.reshape(-1,1))

    dataframe_train[ticketsymbols_column] = dataframe_train[ticketsymbols_column].astype('int64')
    dataframe_validation[ticketsymbols_column] = dataframe_validation[ticketsymbols_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_ticketsymbols_column(train_titanic, validation_titanic, 'TicketSymbols')

In [42]:
# Transforming the "TicketCharacters" column in categorical Bins.
def binning_ticketcharacters_column(dataframe_train, dataframe_validation, ticketcharacters_column):

    k_bins_discretizer = KBinsDiscretizer(n_bins = 2, encode = 'ordinal', strategy = 'quantile')

    dataframe_train[ticketcharacters_column] = k_bins_discretizer.fit_transform(dataframe_train[ticketcharacters_column].values.reshape(-1,1))
    dataframe_validation[ticketcharacters_column] = k_bins_discretizer.transform(dataframe_validation[ticketcharacters_column].values.reshape(-1,1))

    dataframe_train[ticketcharacters_column] = dataframe_train[ticketcharacters_column].astype('int64')
    dataframe_validation[ticketcharacters_column] = dataframe_validation[ticketcharacters_column].astype('int64')

    return dataframe_train, dataframe_validation

train_titanic, validation_titanic = binning_ticketcharacters_column(train_titanic, validation_titanic, 'TicketCharacters')

In [43]:
train_titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter,Survived
0,third class,male,1,0,0,0,southampton,0.156673,single name,0.0,0,0,2,0,1,0.299854,0
1,first class,female,4,0,0,5,cherbourg,0.792,double name,0.359551,0,0,1,0,1,0.59322,1
2,third class,female,2,0,0,1,southampton,0.697802,single name,0.359551,0,1,2,0,1,0.299854,1
3,first class,female,4,0,0,5,southampton,0.792,double name,0.5,0,0,2,0,1,0.59322,1
4,third class,male,4,0,0,1,southampton,0.156673,single name,0.5,0,0,2,0,1,0.299854,0


In [44]:
# Merging training and validation datasets to perform the encoding of all the categorical columns.
def merging_training_validation(dataframe_training, dataframe_validation, target):

    dataframe_validation['DataClass'] = 'validation'

    y_variable = dataframe_training[target]
    dataframe_training['DataClass'] = 'train'
    dataframe_training = dataframe_training.drop(columns = [target])

    dataframe = pd.concat([dataframe_training, dataframe_validation])

    return dataframe, y_variable

titanic_dataframe, y_variable = merging_training_validation(train_titanic, validation_titanic, 'Survived')

In [45]:
train_titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter,Survived,DataClass
0,third class,male,1,0,0,0,southampton,0.156673,single name,0.0,0,0,2,0,1,0.299854,0,train
1,first class,female,4,0,0,5,cherbourg,0.792,double name,0.359551,0,0,1,0,1,0.59322,1,train
2,third class,female,2,0,0,1,southampton,0.697802,single name,0.359551,0,1,2,0,1,0.299854,1,train
3,first class,female,4,0,0,5,southampton,0.792,double name,0.5,0,0,2,0,1,0.59322,1,train
4,third class,male,4,0,0,1,southampton,0.156673,single name,0.5,0,0,2,0,1,0.299854,0,train


In [46]:
# Binarising the variable "Sex" and the variable "NameType".


In [47]:
def binarising_sex_nametype(dataframe, sex_column, nametype_column):

    dataframe[sex_column] = dataframe[sex_column].replace({'male': 0, 'female': 1})
    dataframe[nametype_column] = dataframe[nametype_column].replace({'single name': 0, 'double name': 1})

    return dataframe

titanic_dataframe = binarising_sex_nametype(titanic_dataframe, 'Sex', 'NameType')

In [48]:
titanic_dataframe.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter,DataClass
0,third class,0,1,0,0,0,southampton,0.156673,0,0.0,0,0,2,0,1,0.299854,train
1,first class,1,4,0,0,5,cherbourg,0.792,1,0.359551,0,0,1,0,1,0.59322,train
2,third class,1,2,0,0,1,southampton,0.697802,0,0.359551,0,1,2,0,1,0.299854,train
3,first class,1,4,0,0,5,southampton,0.792,1,0.5,0,0,2,0,1,0.59322,train
4,third class,0,4,0,0,1,southampton,0.156673,0,0.5,0,0,2,0,1,0.299854,train


In [49]:
# One Hot Encoding the "Pclass" and the "Embarked" columns.
def one_hot_pclass_embarked(dataframe, pclass_column, embarked_column, datatype_column):

    data_type = dataframe[datatype_column]

    dataframe = dataframe.drop(columns = [datatype_column])
    dataframe = pd.get_dummies(dataframe)
    dataframe[datatype_column] = data_type

    return dataframe

titanic_dataframe = one_hot_pclass_embarked(titanic_dataframe, 'Pclass', 'Embarked', 'DataClass')

In [50]:
titanic_dataframe.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,...,TicketSymbols,TicketCharacters,CabinLetter,Pclass_first class,Pclass_second class,Pclass_third class,Embarked_cherbourg,Embarked_queenstown,Embarked_southampton,DataClass
0,0,1,0,0,0,0.156673,0,0.0,0,0,...,0,1,0.299854,0,0,1,0,0,1,train
1,1,4,0,0,5,0.792,1,0.359551,0,0,...,0,1,0.59322,1,0,0,1,0,0,train
2,1,2,0,0,1,0.697802,0,0.359551,0,1,...,0,1,0.299854,0,0,1,0,0,1,train
3,1,4,0,0,5,0.792,1,0.5,0,0,...,0,1,0.59322,1,0,0,0,0,1,train
4,0,4,0,0,1,0.156673,0,0.5,0,0,...,0,1,0.299854,0,0,1,0,0,1,train


In [51]:
# Splitting the merged titanic dataframe in the train and validation dataframes.
def train_validation_splitting(dataframe, identifier_column, target_values):

    train_dataframe = dataframe[dataframe[identifier_column] == 'train']
    train_dataframe = train_dataframe.drop(columns = [identifier_column])
    train_dataframe['Survived'] = target_values

    validation_dataframe = dataframe[dataframe[identifier_column] == 'validation']
    validation_dataframe = validation_dataframe.drop(columns = [identifier_column])

    return train_dataframe, validation_dataframe

train_titanic, validation_titanic = train_validation_splitting(titanic_dataframe, 'DataClass', y_variable)

In [52]:
# Splitting the training dataframe in the independent variables and the target variable.
def splitting_x_y(dataframe, target):

    y_variable = dataframe[target]
    x_variables = dataframe.drop(columns = [target])

    return x_variables, y_variable

x_variables, y_variable = splitting_x_y(train_titanic, 'Survived')

In [53]:
x_variables.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,SocialTitle,NameType,Surname,TotalCompanions,TicketLetters,TicketNumbers,TicketSymbols,TicketCharacters,CabinLetter,Pclass_first class,Pclass_second class,Pclass_third class,Embarked_cherbourg,Embarked_queenstown,Embarked_southampton
0,0,1,0,0,0,0.156673,0,0.0,0,0,2,0,1,0.299854,0,0,1,0,0,1
1,1,4,0,0,5,0.792,1,0.359551,0,0,1,0,1,0.59322,1,0,0,1,0,0
2,1,2,0,0,1,0.697802,0,0.359551,0,1,2,0,1,0.299854,0,0,1,0,0,1
3,1,4,0,0,5,0.792,1,0.5,0,0,2,0,1,0.59322,1,0,0,0,0,1
4,0,4,0,0,1,0.156673,0,0.5,0,0,2,0,1,0.299854,0,0,1,0,0,1


In [54]:
y_variable.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [57]:
# Training a KNeighborsClassifier applying a dimension reduction technique based on near centroids.
def training_knclassifier_nca(independent_variables, dependent_variable, validation_data):

    nca = NeighborhoodComponentsAnalysis(random_state = 42)
    model = KNeighborsClassifier(n_neighbors = 50)

    model_pipeline = Pipeline([('nca', nca), ('knn', model)])
    model_pipeline.fit(independent_variables, dependent_variable)
    scores = cross_val_score(model_pipeline, independent_variables, dependent_variable, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

    predictions = model_pipeline.predict(validation_data)
    submission = pd.DataFrame(data = zip(validation_data_ids, predictions), columns = ['PassengerId', 'Survived'])
    submission.to_csv('./submission.csv', index = False)

    return scores.mean(), scores.std()

model_mean_score, model_mean_std = training_knclassifier_nca(x_variables, y_variable, validation_titanic)

print('\nMODEL SUMMARY AND RESULTS')
print('-------------------------\n')

print(f'CV MODEL MEAN ACCURACY: {round(model_mean_score * 100, 2)}%')
print(f'CV MODEL MEAN ACCURACY DEVIATION: {round(model_mean_std * 100, 2)}')
print('ACCURACY OF THE SUBMISSION IN THE KAGGLE LEADERBOARD: 80.382%\n')


MODEL SUMMARY AND RESULTS
-------------------------

CV MODEL MEAN ACCURACY: 82.05%
CV MODEL MEAN ACCURACY DEVIATION: 3.49
ACCURACY OF THE SUBMISSION IN THE KAGGLE LEADERBOARD: 80.382%



In [83]:
# Training a KNeighborsClassifier applying a dimension reduction technique based on near centroids.
def training_knclassifier_nca(independent_variables, dependent_variable, validation_data):

    nca = NeighborhoodComponentsAnalysis(random_state = 42)
    model = KNeighborsClassifier(n_neighbors = 50)

    model_pipeline = Pipeline([('nca', nca), ('knn', model)])
    model_pipeline.fit(independent_variables, dependent_variable)
    scores = cross_val_score(model_pipeline, independent_variables, dependent_variable, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

    predictions = model_pipeline.predict(validation_data)
    submission = pd.DataFrame(data = zip(validation_data_ids, predictions), columns = ['PassengerId', 'Survived'])
    submission.to_csv('./submission18.csv', index = False)

    return scores.mean(), scores.std()

cols = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'SocialTitle',
       'Surname', 'TotalCompanions', 'TicketLetters', 'TicketNumbers',
       'TicketSymbols', 'TicketCharacters', 'CabinLetter',
       'Pclass_first class', 'Pclass_second class', 'Pclass_third class',
       'Embarked_cherbourg', 'Embarked_queenstown', 'Embarked_southampton']

model_mean_score, model_mean_std = training_knclassifier_nca(x_variables[cols], y_variable, validation_titanic[cols])

print('\nMODEL SUMMARY AND RESULTS')
print('-------------------------\n')

print(f'CV MODEL MEAN ACCURACY: {round(model_mean_score * 100, 2)}%')
print(f'CV MODEL MEAN ACCURACY DEVIATION: {round(model_mean_std * 100, 2)}')
print('ACCURACY OF THE SUBMISSION IN THE KAGGLE LEADERBOARD: 80.382%\n')


MODEL SUMMARY AND RESULTS
-------------------------

CV MODEL MEAN ACCURACY: 82.61%
CV MODEL MEAN ACCURACY DEVIATION: 2.89
ACCURACY OF THE SUBMISSION IN THE KAGGLE LEADERBOARD: 80.382%



In [81]:
cols = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'SocialTitle',
       'Surname', 'TotalCompanions', 'TicketLetters', 'TicketNumbers',
       'TicketSymbols', 'TicketCharacters', 'CabinLetter',
       'Pclass_first class', 'Pclass_second class', 'Pclass_third class',
       'Embarked_cherbourg', 'Embarked_queenstown', 'Embarked_southampton']

nca = NeighborhoodComponentsAnalysis(random_state = 42)
model = KNeighborsClassifier(n_neighbors = 50)
model_pipeline = Pipeline([('nca', nca), ('knn', model)])
model_pipeline.fit(x_variables[cols], y_variable)
scores = cross_val_score(model_pipeline, x_variables[cols], y_variable, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

# y_pred = model_pipeline.predict(x_variables[cols])
# y_pred_test = model_pipeline.predict(validation_titanic[cols])
scores.mean()

0.8260875023539012

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_variables, y_variable, test_size=0.25, random_state=0)

In [None]:
def Confusion_matrix_metrics(TP, FP, FN, TN):
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)
#     print('The True Positive Rate is: {:.2%}'.format(TPR))
    # Specificity, selectivity or true negative rate (TNR)
    TNR = TN / (TN + FP)
#     print('The True Negative Rate is: {:.2%}'.format(TNR))
#     print('='*10)

    # accuracy (ACC)
    ACC = (TP + TN) / (TP + TN + FP + FN)
#     print('The Accuracy is: {:.2%}'.format(ACC))
    # balanced accuracy (BA)
    BA = (TPR + TNR) / 2
#     print('The Balanced Accuracy is: {:.2%}'.format(BA))
#     print('='*10)

    # Precision or positive predictive value
    PPV = TP / (TP + FP)
#     print('The Precision is: {:.2%}'.format(PPV))
    # negative predictive value (NPV)
    NPV = TN / (TN + FN)
#     print('The Negative Predictive Value is: {:.2%}'.format(NPV))
    # false discovery rate (FDR)
    FDR = 1 - PPV
#     print('The False Discovery Rate is: {:.2%}'.format(FDR))
    # false omission rate (FOR)
    FOR = 1 - NPV
#     print('The False Omission Rate is: {:.2%}'.format(FOR))
#     print('='*10)

    # prevalence threshold (PT)
    PT = (math.sqrt(TPR*(1 - TNR)) + TNR - 1)/(TPR + TNR - 1)
#     print('The Prevalence Threshold is: {:.2}'.format(PT))
    # F1 score
    F1 = 2*TP / (2*TP + FP + FN)
#     print('The F1 Score is: {:.2}'.format(F1))
    # Matthews correlation coefficient (MCC) or phi coefficient
    MCC = ((TP*TN) - (FP*FN)) / math.sqrt((TP + FP)*(TP + FN)*(TN + FP)*(TN + FN))
#     print('The Matthews Correlation Coefficient is: {:.2}'.format(MCC))
#     print('='*10)

    # False positive rate or False alarm rate
    FPR = FP / (FP + TN)
#     print('The False positive rate is: {:.2}'.format(FPR))
    # False negative rate or Miss Rate
    FNR = FN / (FN + TP)
#     print('The False Negative Rate is: {:.2%}'.format(FNR))
    
    return TPR, TNR, ACC, BA, PPV, NPV, FDR, FOR, PT, F1, MCC, FPR, FNR

In [None]:
from sklearn.metrics import confusion_matrix

def confusion_matrix_func(model, X_train, X_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    cm = confusion_matrix(y_train, y_pred).T
    cm_test = confusion_matrix(y_test, y_pred_test).T
    
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))

    # Plot the heatmap
    im = axes[0].imshow(cm, interpolation='nearest', cmap='Reds', aspect='auto')
    # show all ticks
    axes[0].set_xticks(np.arange(len(cm.tolist())))
    axes[0].set_yticks(np.arange(len(cm.tolist())))
    thresh = cm.max() / 1.5
    # Loop over data dimensions and create text annotations.
    for i in range(len(cm.tolist())):
        for j in range(len(cm.tolist())):
            text = axes[0].text(j, i, cm.tolist()[i][j],
                           ha="center", va="center", size=16,
                           color="white" if cm[i, j] > thresh else "black")
    # Let the horizontal axes labeling appear on top.
    axes[0].xaxis.set_ticks_position('top')
    axes[0].xaxis.set_label_position('top')
    axes[0].set_xlabel('Actual value', size=16)
    axes[0].set_ylabel('Predicted value', size=16)
    axes[0].set_title("Train", fontsize=20)
    
    # Plot the heatmap
    im = axes[1].imshow(cm_test, interpolation='nearest', cmap='Reds', aspect='auto')
    # show all ticks
    axes[1].set_xticks(np.arange(len(cm_test.tolist())))
    axes[1].set_yticks(np.arange(len(cm_test.tolist())))
    thresh = cm_test.max() / 1.5
    # Loop over data dimensions and create text annotations.
    for i in range(len(cm_test.tolist())):
        for j in range(len(cm_test.tolist())):
            text = axes[1].text(j, i, cm_test.tolist()[i][j],
                           ha="center", va="center", size=16,
                           color="white" if cm[i, j] > thresh else "black")
    # Let the horizontal axes labeling appear on top.
    axes[1].xaxis.set_ticks_position('top')
    axes[1].xaxis.set_label_position('top')
    axes[1].set_xlabel('Actual value', size=16)
    axes[1].set_title("Test", fontsize=20)

    fig.tight_layout()
    fig.show()
    
    # Calculating False Positives (FP), False Negatives (FN), True Positives (TP) & True Negatives (TN)
    TP, FP, FN, TN = cm.ravel()
    TPR, TNR, ACC, BA, PPV, NPV, FDR, FOR, PT, F1, MCC, FPR, FNR = Confusion_matrix_metrics(TP, FP, FN, TN)
    TP_test, FP_test, FN_test, TN_test = cm_test.ravel()
    (TPR_test, TNR_test, ACC_test, BA_test, PPV_test, NPV_test, FDR_test, FOR_test, PT_test, F1_test, 
     MCC_test, FPR_test, FNR_test) = Confusion_matrix_metrics(TP_test, FP_test, FN_test, TN_test)
    
    df_cm_metrics = pd.DataFrame({
    'Model': ['True Positive Rate', 'True Negative Rate', 'Accuracy', 'Balanced Accuracy', 'Precision', 
              'Negative Predictive Value', 'False Discovery Rate', 'False Omission Rate', 
              'Prevalence Threshold', 'F1 Score', 'Matthews Correlation Coefficient', 
              'False positive rate', 'False Negative Rate'],
    'Train': [TPR, TNR, ACC, BA, PPV, NPV, FDR, FOR, PT, F1, MCC, FPR, FNR],
    'Test': [TPR_test, TNR_test, ACC_test, BA_test, PPV_test, NPV_test, FDR_test, FOR_test, PT_test, F1_test, 
     MCC_test, FPR_test, FNR_test],
    })
    
    return df_cm_metrics

In [None]:
# Training a KNeighborsClassifier applying a dimension reduction technique based on near centroids.
def training_knclassifier_nca(independent_variables, dependent_variable, validation_data):

    nca = NeighborhoodComponentsAnalysis(random_state = 42)
    model = KNeighborsClassifier(n_neighbors = 50)

    model_pipeline = Pipeline([('nca', nca), ('knn', model)])
    model_pipeline.fit(independent_variables, dependent_variable)
    scores = cross_val_score(model_pipeline, independent_variables, dependent_variable, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

    predictions = model_pipeline.predict(validation_data)
    submission = pd.DataFrame(data = zip(validation_data_ids, predictions), columns = ['PassengerId', 'Survived'])
    submission.to_csv('./submission.csv', index = False)

    return scores.mean(), scores.std()

model_mean_score, model_mean_std = training_knclassifier_nca(x_variables, y_variable, validation_titanic)

print('\nMODEL SUMMARY AND RESULTS')
print('-------------------------\n')

print(f'CV MODEL MEAN ACCURACY: {round(model_mean_score * 100, 2)}%')
print(f'CV MODEL MEAN ACCURACY DEVIATION: {round(model_mean_std * 100, 2)}')
print('ACCURACY OF THE SUBMISSION IN THE KAGGLE LEADERBOARD: 80.382%\n')

In [None]:
X_test.head()

In [None]:
X_test.columns

In [None]:
cols = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'SocialTitle', 'NameType',
       'Surname', 'TotalCompanions', 'TicketLetters', 'TicketNumbers',
       'TicketSymbols', 'TicketCharacters', 'CabinLetter',
       'Pclass_first class', 'Pclass_second class', 'Pclass_third class',
       'Embarked_cherbourg', 'Embarked_queenstown', 'Embarked_southampton']

nca = NeighborhoodComponentsAnalysis(random_state = 42)
model = KNeighborsClassifier(n_neighbors = 50)
model_pipeline = Pipeline([('nca', nca), ('knn', model)])
model_pipeline.fit(X_train[cols], y_train)
scores = cross_val_score(model_pipeline, X_test[cols], y_test, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

y_pred = model_pipeline.predict(X_train[cols])
y_pred_test = model_pipeline.predict(X_test[cols])
scores.mean()

In [None]:
import warnings
warnings.filterwarnings('ignore')

import math, time, datetime
import numpy as np 
import pandas as pd
# display all of the columns
pd.set_option('display.max_columns', None)
# from scipy import stats

from matplotlib import pyplot as plt
%matplotlib inline
df_cm_metrics = confusion_matrix_func(model_pipeline, X_train, X_test)

In [None]:
df_cm_metrics

In [None]:
df_cm_metrics = confusion_matrix_func(clf_xgb, X_train, X_test)

In [None]:
df_cm_metrics

In [None]:
nca = NeighborhoodComponentsAnalysis(random_state = 42)
model = KNeighborsClassifier(n_neighbors = 50)
model_pipeline = Pipeline([('nca', nca), ('knn', model)])
model_pipeline.fit(X_train, y_train)

params = {}

clf_knc = GridSearchCV(estimator=model_pipeline, cv=10, scoring='accuracy', param_grid=params).fit(X_train, y_train)
print('best prarams:', clf_knc.best_params_)

acc_log = clf_knc.best_score_
acc_log_test = clf_knc.score(X_test, y_test)
print('The Accuracy  on the training dataset is: {:.1%}'.format(acc_log))
print('The Accuracy  on the testing dataset is: {:.1%}'.format(acc_log_test))

In [None]:
# 0.80382
nca = NeighborhoodComponentsAnalysis(random_state = 42)
model = KNeighborsClassifier(n_neighbors = 50)

model_pipeline = Pipeline([('nca', nca), ('knn', model)])
model_pipeline.fit(x_variables, y_variable)
scores = cross_val_score(model_pipeline, x_variables, y_variable, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

predictions = model_pipeline.predict(validation_titanic)
submission = pd.DataFrame(data = zip(validation_data_ids, predictions), columns = ['PassengerId', 'Survived'])
submission.to_csv('./submission.csv', index = False)

print(scores.mean(), scores.std())

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

xgb = XGBClassifier(booster='gbtree', eval_metric='logloss',eta=0.05, gamma=0, max_depth=10, 
                    min_child_weight=2.7, max_delta_step=2, subsample=1, colsample_bytree=1, alpha=0, 
                    tree_method='approx', sketch_eps=0.01, scale_pos_weight=1)

params = {}

clf_xgb = GridSearchCV(estimator=xgb, cv=5, scoring='accuracy', param_grid=params).fit(X_train, y_train)
print('best prarams:', clf_xgb.best_params_)

acc_xgb = clf_xgb.best_score_
acc_xgb_test = clf_xgb.score(X_test, y_test)
print('The Accuracy  on the training dataset is: {:.1%}'.format(acc_xgb))
print('The Accuracy  on the testing dataset is: {:.1%}'.format(acc_xgb_test))

In [None]:
# 0.76794


from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

xgb = XGBClassifier(booster='gbtree', eval_metric='logloss',eta=0.05, gamma=0, max_depth=10, 
                    min_child_weight=2.7, max_delta_step=2, subsample=1, colsample_bytree=1, alpha=0, 
                    tree_method='approx', sketch_eps=0.01, scale_pos_weight=1)

model_pipeline = Pipeline([('xgb', xgb)])
model_pipeline.fit(X_train, y_train)
scores = cross_val_score(model_pipeline, X_train, y_train, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

predictions = model_pipeline.predict(validation_titanic)
submission = pd.DataFrame(data = zip(validation_data_ids, predictions), columns = ['PassengerId', 'Survived'])
submission.to_csv('./titanic_submission_kaggle_20210716.csv', index = False)

print(scores.mean(), scores.std())



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(solver='liblinear', C=1, penalty='l2', max_iter=50, multi_class='auto')

params = {}

clf_log = GridSearchCV(estimator=log, cv=10, scoring='accuracy', param_grid=params).fit(X_train, y_train)
print('best prarams:', clf_log.best_params_)

acc_log = clf_log.best_score_
acc_log_test = clf_log.score(X_test, y_test)
print('The Accuracy  on the training dataset is: {:.1%}'.format(acc_log))
print('The Accuracy  on the testing dataset is: {:.1%}'.format(acc_log_test))

In [None]:
# 0.78947

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(solver='liblinear', C=1, penalty='l2', max_iter=50, multi_class='auto')

model_pipeline = Pipeline([('log', log)])
model_pipeline.fit(X_train, y_train)
scores = cross_val_score(model_pipeline, X_train, y_train, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

predictions = model_pipeline.predict(validation_titanic)
submission = pd.DataFrame(data = zip(validation_data_ids, predictions), columns = ['PassengerId', 'Survived'])
submission.to_csv('./titanic_submission_kaggle_20210716.csv', index = False)

print(scores.mean(), scores.std())



In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    # Return the ccuracy classification score

    # We will run the model with 10 different sets and we will get 10 results
    cv = KFold(n_splits=10, random_state=0, shuffle=True)
    # Applying Cross Validation to solve possible overfitting problem
    scores_train = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv)
    acc_train = scores_train.mean()
    scores_test = cross_val_score(model, X_test, y_test, scoring='accuracy', cv=cv)
    acc_test = scores_test.mean()

#     print('The Accuracy  on the training dataset is: {:.1%}'.format(acc_train))
#     print('The Accuracy  on the testing dataset is: {:.1%}'.format(acc_test))
    
    return acc_train, acc_test

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

acc_list = []
acc_test_list = []
for k in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=k)
    acc_knn, acc_knn_test = evaluate_model(knn, X_train, X_test, y_train, y_test)
    acc_list.append(acc_knn)
    acc_test_list.append(acc_knn_test)
    
plt.figure(figsize=(8,4))
plt.plot(range(1,40),acc_list,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.plot(range(1,40),acc_test_list,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='green', markersize=10)
plt.title('Accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()

In [None]:
max(k_max_list)

In [None]:
k_max_list = [i for i in acc_list]
k_max = acc_list.index(max(k_max_list))+1
print(k_max)

k_test_max_list = [i for i in acc_test_list]
k_test_max = acc_test_list.index(max(k_test_max_list))+1
print(k_test_max)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=k_test_max, weights='distance', p=1)
# knn = KNeighborsClassifier(algorithm='ball_tree', weights='distance', p=1)
params = {
#     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#     'leaf_size': [5, 10, 15, 20, 25, 30, 35],
#     'weights': ['uniform', 'distance'],
#     'p': [1, 2, 3, 4, 5, 6],
         }

clf_knn = GridSearchCV(estimator=knn, cv=10, scoring='accuracy', param_grid=params).fit(X_train, y_train)
print('best prarams:', clf_knn.best_params_)

acc_knn = clf_knn.best_score_
acc_knn_test = clf_knn.score(X_test, y_test)
print('The Accuracy  on the training dataset is: {:.1%}'.format(acc_knn))
print('The Accuracy  on the testing dataset is: {:.1%}'.format(acc_knn_test))

In [None]:
# 0.75837
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=k_test_max, weights='distance', p=1)

model_pipeline = Pipeline([('knn', knn)])
model_pipeline.fit(X_train, y_train)
scores = cross_val_score(model_pipeline, X_train, y_train, cv = StratifiedKFold(n_splits = 5), n_jobs = -1)

predictions = model_pipeline.predict(validation_titanic)
submission = pd.DataFrame(data = zip(validation_data_ids, predictions), columns = ['PassengerId', 'Survived'])
submission.to_csv('./titanic_submission_kaggle_20210716.csv', index = False)

print(scores.mean(), scores.std())

