<a id='packages'></a>
## Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

<a id='wrangling'></a>
## Data Wrangling
> First, we need to take a look at our dataset, its data types and check whether or not it has missing values.

In [146]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [147]:
train_df.shape, test_df.shape

((891, 12), (418, 11))

In [148]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [149]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


There are too many null values in Cabin; therefore, it doesn't add value, so we'll just drop it.

In [150]:
train_df.drop(columns=['Cabin'], inplace=True)
test_df.drop(columns=['Cabin'], inplace=True)

### Feature Engineering

Adding Title feature to the dataset.

In [151]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.')

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr


In [152]:
title_gb = train_df.groupby(['Title']).Title.count().sort_values(ascending=False)
title_gb

Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Col           2
Mlle          2
Mme           1
Ms            1
Capt          1
Lady          1
Jonkheer      1
Don           1
Countess      1
Sir           1
Name: Title, dtype: int64

We can replace the titles that has the least frequency with "Rare".

In [153]:
for data in combine:
    for i in range(4, len(title_gb.index)):
        data['Title'].replace(title_gb.index[i], 'Rare', inplace=True)

train_df.groupby(['Title']).Title.count().sort_values(ascending=False)

Title
Mr        517
Miss      182
Mrs       125
Master     40
Rare       27
Name: Title, dtype: int64

In [154]:
train_df.groupby(['Title'])['Survived'].mean().sort_values()

Title
Mr        0.156673
Rare      0.444444
Master    0.575000
Miss      0.697802
Mrs       0.792000
Name: Survived, dtype: float64

Now we can convert the categorical titles to ordinal.

In [155]:
def label_encoding(column):
    list_ = train_df.groupby([column])['Survived'].mean().sort_values().index

    for data in combine:
        for i in range(len(list_)):
            data[column].replace(list_[i], i, inplace=True)

In [156]:
train_df.Title.isnull().sum(), test_df.Title.isnull().sum()

(0, 0)

In [157]:
label_encoding('Title')


train_df.groupby(['Title'])['Survived'].mean().sort_values()

Title
0    0.156673
1    0.444444
2    0.575000
3    0.697802
4    0.792000
Name: Survived, dtype: float64

In [158]:

train_df.Title.unique()

array([0, 4, 3, 2, 1])

No missing ticket

In [159]:
for data in combine:
    data['Ticket'] = data.Ticket.str.replace('/', '')
    data['Ticket'] = data.Ticket.str.replace('.', '')
    data['Ticket'] = data.Ticket.str.extract('([A-Za-z|0-9]+)\ ')
    data.Ticket.replace('STONO', 'STONO2', inplace=True)
    data.Ticket.fillna('NoLetter', inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [160]:
train_df.Ticket.sort_values().unique()

array(['A4', 'A5', 'AS', 'C', 'CA', 'CASOTON', 'FC', 'FCC', 'Fa',
       'NoLetter', 'PC', 'PP', 'PPP', 'SC', 'SCA4', 'SCAH', 'SCOW',
       'SCPARIS', 'SCParis', 'SOC', 'SOP', 'SOPP', 'SOTONO2', 'SOTONOQ',
       'SP', 'STONO2', 'SWPP', 'WC', 'WEP'], dtype=object)

In [161]:
train_df.groupby(['Ticket'])['Survived'].mean().sort_values()

Ticket
A4          0.000000
SP          0.000000
SOTONO2     0.000000
SOPP        0.000000
SOP         0.000000
SCOW        0.000000
Fa          0.000000
SCA4        0.000000
FC          0.000000
AS          0.000000
CASOTON     0.000000
A5          0.095238
WC          0.100000
SOTONOQ     0.133333
SOC         0.166667
WEP         0.333333
CA          0.341463
NoLetter    0.383459
C           0.400000
SCPARIS     0.428571
STONO2      0.444444
PPP         0.500000
SCParis     0.500000
PC          0.650000
PP          0.666667
SCAH        0.666667
FCC         0.800000
SC          1.000000
SWPP        1.000000
Name: Survived, dtype: float64

Putting the tickets that have the same avg survival in the same category. This is a preparation to convert the Ticket feature to ordinal.

In [162]:
for data in combine:
    data.Ticket.replace(['A4','SCA4','SOP','SOPP','SOTONO2','SCOW','SP','Fa','CASOTON','AS','FC'], 'T0', inplace=True) # T0 stands for Ticket 0
    data.Ticket.replace(['PPP','SCParis'], 'T11', inplace=True)
    data.Ticket.replace(['PP','SCAH'], 'T13', inplace=True)
    data.Ticket.replace(['SC','SWPP'], 'T15', inplace=True)

train_df.groupby(['Ticket'])['Survived'].mean().sort_values()

Ticket
T0          0.000000
A5          0.095238
WC          0.100000
SOTONOQ     0.133333
SOC         0.166667
WEP         0.333333
CA          0.341463
NoLetter    0.383459
C           0.400000
SCPARIS     0.428571
STONO2      0.444444
T11         0.500000
PC          0.650000
T13         0.666667
FCC         0.800000
T15         1.000000
Name: Survived, dtype: float64

Now encoding the Ticket feature.

In [163]:
label_encoding('Ticket')

train_df.groupby(['Ticket'])['Survived'].mean().sort_values()

Ticket
0     0.000000
1     0.095238
2     0.100000
3     0.133333
4     0.166667
5     0.333333
6     0.341463
7     0.383459
8     0.400000
9     0.428571
10    0.444444
11    0.500000
12    0.650000
13    0.666667
14    0.800000
15    1.000000
Name: Survived, dtype: float64

### Handling outliers

In [164]:
def get_limits(data, column):
    quartile1 = data[column].quantile(.25)
    quartile3 = data[column].quantile(.75)
    iqr = quartile3 - quartile1
    upper_limit = quartile3 + 1.5 * iqr
    lower_limit = quartile1 - 1.5 * iqr

    outliers = data[(data[column] >= upper_limit) | (data[column] <= lower_limit)]

    outliers_proportion = len(outliers)/len(data)

    print(f'{column} outlier limits: {lower_limit}, {upper_limit}')
    print(f'Outliers proportion: {outliers_proportion}')

    if(outliers_proportion <= 0.05): # In case we have many outliers, we won't drop them.
        print(f'You can drop {column} outliers.')
        return lower_limit, upper_limit
    else:
        print(f'Do not drop {column} outliers!')
        return 0, 0

In [165]:
def drop_outliers(data, column, lower_limit, upper_limit):
    data = data[(data[column] < upper_limit) & (data[column] > lower_limit)]
    return data

Checking whether it's better to drop the outliers or not.

In [166]:
age_lower_limit, age_upper_limit = get_limits(train_df, 'Age')
print()
fare_lower_limit, fare_upper_limit = get_limits(train_df, 'Fare')

Age outlier limits: -6.6875, 64.8125
Outliers proportion: 0.012345679012345678
You can drop Age outliers.

Fare outlier limits: -26.724, 65.6344
Outliers proportion: 0.13019079685746351
Do not drop Fare outliers!


Dropping outliers.

In [167]:
train_df = drop_outliers(train_df, 'Age', age_lower_limit, age_upper_limit)
combine = [train_df, test_df] # beacause train_df is re-initialized
train_df.shape

(703, 12)

Before cleaning, we can drop the columns we don't need.

In [168]:
for data in combine: 
    data.drop(columns=['Name'], inplace=True)
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,1,0,3,male,22.0,1,0,1,7.25,S,0
1,2,1,1,female,38.0,1,0,12,71.2833,C,4
2,3,1,3,female,26.0,0,0,10,7.925,S,3
3,4,1,1,female,35.0,1,0,7,53.1,S,4
4,5,0,3,male,35.0,0,0,7,8.05,S,0


<a id='cleaning'></a>
### Data cleaning 

Check if there are any duplicated rows.

In [169]:
train_df.duplicated().sum(), test_df.duplicated().sum() 

(0, 0)

The train set has missing values in Age and Embarked columns.

In [170]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 703 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  703 non-null    int64  
 1   Survived     703 non-null    int64  
 2   Pclass       703 non-null    int64  
 3   Sex          703 non-null    object 
 4   Age          703 non-null    float64
 5   SibSp        703 non-null    int64  
 6   Parch        703 non-null    int64  
 7   Ticket       703 non-null    int64  
 8   Fare         703 non-null    float64
 9   Embarked     701 non-null    object 
 10  Title        703 non-null    int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 65.9+ KB


The train set has missing values in Age, Fare columns.

In [171]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Ticket       418 non-null    object 
 7   Fare         417 non-null    float64
 8   Embarked     418 non-null    object 
 9   Title        418 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 32.8+ KB


Something wrong with Title in test set. Its Dtype is object.

In [172]:
test_df.Title.unique()

array([0, 4, 3, 2, 1, 'Dona'], dtype=object)

Function to replace string values in features that are supposed to be numeric.

In [173]:
def replace_str(data, column, numeric_value):
    list_ = data[column].str.extract('([A-Za-z]+[\d]*)').dropna().iloc
    for i in range(len(list_[:])):
        data[column].replace(list_[i][0], numeric_value, inplace=True)

In [174]:
replace_str(test_df, 'Title', train_df.Title.max()) 
test_df.Title.unique()

array([0, 4, 3, 2, 1])

Same with Ticket.

In [175]:
test_df.Ticket.unique()

array([7, 0, 5, 9, 10, 12, 8, 1, 13, 6, 2, 3, 'SCA3', 14, 'STONOQ', 4,
       'AQ4', 'A', 15, 11, 'LP', 'AQ3'], dtype=object)

In [176]:
replace_str(test_df, 'Ticket', train_df.Ticket.max())
test_df.Ticket.unique()

array([ 7,  0,  5,  9, 10, 12,  8,  1, 13,  6,  2,  3, 15, 14,  4, 11])

In [177]:
test_df.info() # Solved

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Ticket       418 non-null    int64  
 7   Fare         417 non-null    float64
 8   Embarked     418 non-null    object 
 9   Title        418 non-null    int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 32.8+ KB


In [178]:
full_df = pd.concat([train_df, test_df])

for i in range(full_df.Title.nunique()):  #Iterating on each title number
    full_cond = full_df['Title']==i  # The condition in full data
    train_cond = train_df['Title']==i
    test_cond = test_df['Title']==i

    # Filling null Age values with the mean Age in every Title category (from the full dataset)
    train_df.loc[train_cond, 'Age'] = train_df.loc[train_cond, 'Age'].fillna(full_df.loc[full_cond, 'Age'].mean())
    test_df.loc[test_cond, 'Age'] = test_df.loc[test_cond, 'Age'].fillna(full_df.loc[full_cond, 'Age'].mean())

for i in range(1, full_df.Pclass.nunique()+1):
    full_cond = full_df['Pclass']==i
    test_cond = test_df['Pclass']==i
    train_cond = train_df['Pclass']==i

    # Filling null Fare values with the mean Fare in every Pclass category (from the full dataset)
    test_df.loc[test_cond, 'Fare'] = test_df.loc[test_cond, 'Fare'].fillna(full_df.loc[full_cond, 'Fare'].mean())


for i in range(full_df.Ticket.nunique()):
    full_cond = full_df['Ticket']==i  # The condition in full data
    train_cond = train_df['Ticket']==i

    # Filling null Embarked values with the mode Embarked in every Ticket category (from the full dataset)
    train_df.loc[train_cond, 'Embarked'] = train_df.loc[train_cond, 'Embarked'].fillna(full_df.loc[full_cond, 'Embarked'].mode()[0])

In [179]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 703 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  703 non-null    int64  
 1   Survived     703 non-null    int64  
 2   Pclass       703 non-null    int64  
 3   Sex          703 non-null    object 
 4   Age          703 non-null    float64
 5   SibSp        703 non-null    int64  
 6   Parch        703 non-null    int64  
 7   Ticket       703 non-null    int64  
 8   Fare         703 non-null    float64
 9   Embarked     703 non-null    object 
 10  Title        703 non-null    int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 65.9+ KB


In [180]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Ticket       418 non-null    int64  
 7   Fare         418 non-null    float64
 8   Embarked     418 non-null    object 
 9   Title        418 non-null    int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 32.8+ KB


No null values now.

We have to encode the Sex feature to be used in the model.

In [181]:
# label encoding because it's binary feature.
for data in combine:
    data.Sex = data.Sex.map({'male': 1, 'female': 0})

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,1,0,3,1,22.0,1,0,1,7.25,S,0
1,2,1,1,0,38.0,1,0,12,71.2833,C,4
2,3,1,3,0,26.0,0,0,10,7.925,S,3
3,4,1,1,0,35.0,1,0,7,53.1,S,4
4,5,0,3,1,35.0,0,0,7,8.05,S,0


In [182]:

train_df
train_df.groupby(['Embarked'])['Survived'].mean()

Embarked
C    0.622047
Q    0.307692
S    0.367273
Name: Survived, dtype: float64

Encoding the Embarked feature.

In [183]:
# one-hot encoding because it's not ordinal and only 3 categories.
train_df[['C', 'Q', 'S']] = pd.get_dummies(train_df['Embarked'])
test_df[['C', 'Q', 'S']] = pd.get_dummies(test_df['Embarked'])
train_df.drop(columns=['Embarked'], inplace=True)
test_df.drop(columns=['Embarked'], inplace=True)
train_df.drop(columns=['S'], inplace=True)
test_df.drop(columns=['S'], inplace=True)

<a id='ms'></a>
## Model Selection

In [184]:
train_df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Title,C,Q
0,1,0,3,1,22.0,1,0,1,7.25,0,0,0


In [185]:
test_df.head(1)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Title,C,Q
0,892,3,1,34.5,0,0,7,7.8292,0,0,1


In [186]:
X_train = train_df.iloc[:, 2:10] 
X_test = test_df.iloc[:, 1:9]

y_train = train_df['Survived']

Scaling for the KNeighborsClassifier.

In [187]:
std_scaler = StandardScaler()
Xs_train = std_scaler.fit_transform(X_train)
Xs_test = std_scaler.transform(X_test)

In [188]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Title
0,3,1,22.0,1,0,1,7.25,0
1,1,0,38.0,1,0,12,71.2833,4
2,3,0,26.0,0,0,10,7.925,3
3,1,0,35.0,1,0,7,53.1,4
4,3,1,35.0,0,0,7,8.05,0


In [189]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Title
0,3,1,34.5,0,0,7,7.8292,0
1,3,0,47.0,1,0,7,7.0,4
2,2,1,62.0,0,0,7,9.6875,0
3,3,1,27.0,0,0,7,8.6625,0
4,3,0,22.0,1,1,7,12.2875,4


Function that trains the model with every parameter value of your choice and picks the best parameters with the best degree of polynomial based on ROC AUC score. In otherwords this function tune the parameters and degree of polynomial to give you the best score.

In [190]:
def tune_model_grid(model, params, min_degree, mx_degree, kfolds, X):
    max_degree = mx_degree

    best_avg_scores = []  

    min_degree -= 1

    for i in range(min_degree, max_degree):
        degree = i +1
        poly_train = PolynomialFeatures(degree, include_bias=False).fit_transform(X)
        # We are using the ROC AUC score because it's a binary classification problem.
        grid_search = GridSearchCV(model, params, cv=kfolds, return_train_score=False, verbose=True, n_jobs=-1, scoring = 'roc_auc') 
        grid_search.fit(poly_train, y_train)
        
        best_avg_scores.append({
            'model': grid_search,
            'best_score': grid_search.best_score_,
            'degree': degree
        })

    scores_df = pd.DataFrame(best_avg_scores)

    scores_df.sort_values('best_score', ascending=False, inplace=True)
    
    print()
    print('Best params:')
    print(scores_df.model.iloc[0].best_estimator_)
    print()
    print(scores_df.iloc[0, 1:])

    

    return scores_df.iloc[0, :]

In [199]:
best_scores_df = pd.DataFrame(columns=['model', 'best_score', 'degree'])
best_scores_df

Unnamed: 0,model,best_score,degree


In [200]:
best_model = tune_model_grid(LogisticRegression(), # the solver is liblinear because the data is small
                                 {'random_state': [0],
                                  'max_iter' : [1000],
                                  'penalty' : ['l1', 'l2'],
                                  'C' : [1e-4, 1e-3, 1e-2],
                                  'solver' : ['liblinear']},
                                  1, 3, 10, X_train)


best_scores_df.loc[len(best_scores_df)] = best_model

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Fitting 10 folds for each of 6 candidates, totalling 60 fits
Fitting 10 folds for each of 6 candidates, totalling 60 fits

Best params:
LogisticRegression(C=0.001, max_iter=1000, random_state=0, solver='liblinear')

best_score    0.876357
degree               2
Name: 1, dtype: object


In [201]:
best_model = tune_model_grid(XGBClassifier(),
                                 {'random_state': [0],
                                  'n_estimators': [70, 100, 130],
                                  'max_depth': [2, 3, 4],
                                  'learning_rate':[0.16, 0.19, 0.22],
                                  'reg_lambda':[0.4, 0.7, 1],
                                  'reg_alpha':[0.3, 0.6, 0.9],
                                  'gamma': [0, 0.5, 1]},
                                  1, 1, 10, X_train)

best_scores_df.loc[len(best_scores_df)] = best_model

Fitting 10 folds for each of 729 candidates, totalling 7290 fits

Best params:
XGBClassifier(learning_rate=0.19, reg_alpha=0.6, reg_lambda=0.7)

best_score    0.896539
degree               1
Name: 0, dtype: object


In [202]:
best_model = tune_model_grid(RandomForestClassifier(),
                                 {'random_state': [0],
                                  'n_estimators': [70, 100, 130],
                                  'criterion':['gini','entropy'],
                                  'max_depth': [5, 6, 7],
                                  'min_samples_leaf': [1, 2, 3],
                                  'min_samples_split': [2, 3, 4]},
                                  1, 1, 10, X_train)

best_scores_df.loc[len(best_scores_df)] = best_model

Fitting 10 folds for each of 162 candidates, totalling 1620 fits

Best params:
RandomForestClassifier(criterion='entropy', max_depth=6, random_state=0)

best_score    0.891058
degree               1
Name: 0, dtype: object


In [203]:
best_model = tune_model_grid(KNeighborsClassifier(),
                                 {'n_neighbors': range(4, 25, 1),
                                  'weights': ['uniform', 'distance'],
                                  'leaf_size': range(1, 20, 1)},
                                  1, 1, 10, Xs_train)

best_scores_df.loc[len(best_scores_df)] = best_model

Fitting 10 folds for each of 798 candidates, totalling 7980 fits

Best params:
KNeighborsClassifier(leaf_size=5, n_neighbors=22)

best_score    0.888036
degree               1
Name: 0, dtype: object


In [204]:
best_scores_df.sort_values('best_score', inplace=True, ascending=False)
best_scores_df

Unnamed: 0,model,best_score,degree
1,"GridSearchCV(cv=10, estimator=XGBClassifier(),...",0.896539,1
2,"GridSearchCV(cv=10, estimator=RandomForestClas...",0.891058,1
3,"GridSearchCV(cv=10, estimator=KNeighborsClassi...",0.888036,1
0,"GridSearchCV(cv=10, estimator=LogisticRegressi...",0.876357,2


In [205]:
for i in range(best_scores_df.shape[0]):

    degree = best_scores_df.degree.iloc[i]

    if((str(best_scores_df.model.iloc[i].estimator) == str(KNeighborsClassifier()))):
        X_test_ = PolynomialFeatures(degree, include_bias=False).fit_transform(Xs_test)
    else:
        X_test_ = PolynomialFeatures(degree, include_bias=False).fit_transform(X_test)

    y_test = best_scores_df.model.iloc[i].best_estimator_.predict(X_test_)

    submission_data = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': y_test})

    submission_data.to_csv(f'submission{i+1}.csv', index=False)

# XGBClassifier score: 0.76315
# RandomForestClassifier score: 0.78468
# KNeighborsClassifier score: 0.79186
# LogisticRegression score: 0.75598