# Titanic Survivor

## Importing Libraries

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.neighbors import KNeighborsRegressor

import scipy.stats

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

from sklearn import metrics
from sklearn.metrics import r2_score

## Load Data

In [3]:
raw_train = pd.read_csv('train.csv')
raw_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
raw_test = pd.read_csv('test.csv')
raw_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Combining Data

In [8]:
target = raw_train['Survived']
test_id = raw_test['PassengerId']
train1 = raw_train.drop(['PassengerId','Survived'],axis=1)
test1 = raw_test.drop(['PassengerId'],axis=1)

data1 = pd.concat([train1, test1], axis=0).reset_index(drop=True)
data1

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


## Cleaning Data

In [11]:
data2 = data1.copy()

In [13]:
data2.isnull().sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [14]:
data2.select_dtypes(np.number)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.2500
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.9250
3,1,35.0,1,0,53.1000
4,3,35.0,0,0,8.0500
...,...,...,...,...,...
1304,3,,0,0,8.0500
1305,1,39.0,0,0,108.9000
1306,3,38.5,0,0,7.2500
1307,3,,0,0,8.0500


In [21]:
data2.select_dtypes('object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S
...,...,...,...,...,...
1304,"Spector, Mr. Woolf",male,A.5. 3236,,S
1305,"Oliva y Ocana, Dona. Fermina",female,PC 17758,C105,C
1306,"Saether, Mr. Simon Sivertsen",male,SOTON/O.Q. 3101262,,S
1307,"Ware, Mr. Frederick",male,359309,,S


In [23]:
data2['ticket_1st_letter'] = data2['Ticket'].str[0]
data2['ticket_1st_letter'].value_counts()

3    429
2    278
1    210
P     98
S     98
C     77
A     42
W     19
7     13
F     13
4     11
6      9
L      5
5      3
9      2
8      2
Name: ticket_1st_letter, dtype: int64

In [24]:
data2['cabin_1st_letter'] = data2['Cabin'].str[0]
data2['cabin_1st_letter'].value_counts()

C    94
B    65
D    46
E    41
A    22
F    21
G     5
T     1
Name: cabin_1st_letter, dtype: int64

In [25]:
data2

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_1st_letter,cabin_1st_letter
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,A,
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,P,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,S,
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1,C
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,3,
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,A,
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,P,C
1306,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,S,
1307,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,3,


In [26]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Pclass             1309 non-null   int64  
 1   Name               1309 non-null   object 
 2   Sex                1309 non-null   object 
 3   Age                1046 non-null   float64
 4   SibSp              1309 non-null   int64  
 5   Parch              1309 non-null   int64  
 6   Ticket             1309 non-null   object 
 7   Fare               1308 non-null   float64
 8   Cabin              295 non-null    object 
 9   Embarked           1307 non-null   object 
 10  ticket_1st_letter  1309 non-null   object 
 11  cabin_1st_letter   295 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 122.8+ KB


In [33]:
for column in [
    'Embarked',
    'Fare'
]:
    data2[column] = data2[column].fillna(data2[column].mode()[0])

In [34]:
data2.isnull().sum()

Pclass                  0
Name                    0
Sex                     0
Age                   263
SibSp                   0
Parch                   0
Ticket                  0
Fare                    0
Cabin                1014
Embarked                0
ticket_1st_letter       0
cabin_1st_letter     1014
dtype: int64

In [35]:
def knn_impute(df,na_target):
    df = df.copy()
    
    numeric_df = df.select_dtypes(np.number)    
    non_na_columns = numeric_df.loc[:,numeric_df.isna().sum() == 0].columns
    
    y_train = numeric_df.loc[numeric_df[na_target].isna() == False, na_target]
    X_train = numeric_df.loc[numeric_df[na_target].isna() == False, non_na_columns]
    X_test = numeric_df.loc[numeric_df[na_target].isna() == True, non_na_columns]
    
    knn = KNeighborsRegressor()
    knn.fit(X_train,y_train)
    
    y_pred = knn.predict(X_test)
    
    df.loc[numeric_df[na_target].isna() == True, na_target] = y_pred
    
    return df

In [36]:
data3 = data2.copy()

In [41]:
data3 = knn_impute(data3,'Age')

In [42]:
data3.isnull().sum()

Pclass                  0
Name                    0
Sex                     0
Age                     0
SibSp                   0
Parch                   0
Ticket                  0
Fare                    0
Cabin                1014
Embarked                0
ticket_1st_letter       0
cabin_1st_letter     1014
dtype: int64

In [43]:
data4 = data3.copy()

In [46]:
scipy.stats.skew(data4.select_dtypes(np.number))

array([-0.59796089,  0.46520164,  3.8398138 ,  3.66487242,  4.36420541])

In [47]:
skew_df = pd.DataFrame(data4.select_dtypes(np.number).columns, columns=['Feature'])
skew_df['Skew'] = skew_df['Feature'].apply(lambda feature: scipy.stats.skew(data4[feature]))
skew_df['Absolute Skew'] = skew_df['Skew'].apply(abs)
skew_df['Skewed'] = skew_df['Absolute Skew'].apply(lambda x: True if x >= 0.5 else False)
skew_df

Unnamed: 0,Feature,Skew,Absolute Skew,Skewed
0,Pclass,-0.597961,0.597961,True
1,Age,0.465202,0.465202,False
2,SibSp,3.839814,3.839814,True
3,Parch,3.664872,3.664872,True
4,Fare,4.364205,4.364205,True


In [48]:
skew_df.query('Skewed == True')['Feature']

0    Pclass
2     SibSp
3     Parch
4      Fare
Name: Feature, dtype: object

In [49]:
for column in skew_df.query('Skewed == True')['Feature'].values:
    data4[column] = np.log1p(data4[column])

In [50]:
skew_df = pd.DataFrame(data4.select_dtypes(np.number).columns, columns=['Feature'])
skew_df['Skew'] = skew_df['Feature'].apply(lambda feature: scipy.stats.skew(data4[feature]))
skew_df['Absolute Skew'] = skew_df['Skew'].apply(abs)
skew_df['Skewed'] = skew_df['Absolute Skew'].apply(lambda x: True if x >= 0.5 else False)
skew_df

Unnamed: 0,Feature,Skew,Absolute Skew,Skewed
0,Pclass,-0.741123,0.741123,True
1,Age,0.465202,0.465202,False
2,SibSp,1.634945,1.634945,True
3,Parch,1.787711,1.787711,True
4,Fare,0.542519,0.542519,True


In [56]:
data5 = data4.copy()
data5 = data5.drop(['Name','Ticket','Cabin'],axis=1)
data5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Pclass             1309 non-null   float64
 1   Sex                1309 non-null   object 
 2   Age                1309 non-null   float64
 3   SibSp              1309 non-null   float64
 4   Parch              1309 non-null   float64
 5   Fare               1309 non-null   float64
 6   Embarked           1309 non-null   object 
 7   ticket_1st_letter  1309 non-null   object 
 8   cabin_1st_letter   295 non-null    object 
dtypes: float64(5), object(4)
memory usage: 92.2+ KB


In [57]:
data5 = pd.get_dummies(data5)
data5

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,...,ticket_1st_letter_S,ticket_1st_letter_W,cabin_1st_letter_A,cabin_1st_letter_B,cabin_1st_letter_C,cabin_1st_letter_D,cabin_1st_letter_E,cabin_1st_letter_F,cabin_1st_letter_G,cabin_1st_letter_T
0,1.386294,22.0,0.693147,0.000000,2.110213,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0.693147,38.0,0.693147,0.000000,4.280593,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1.386294,26.0,0.000000,0.000000,2.188856,1,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0.693147,35.0,0.693147,0.000000,3.990834,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,1.386294,35.0,0.000000,0.000000,2.202765,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1.386294,25.4,0.000000,0.000000,2.202765,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1305,0.693147,39.0,0.000000,0.000000,4.699571,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1306,1.386294,38.5,0.000000,0.000000,2.110213,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1307,1.386294,25.4,0.000000,0.000000,2.202765,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [58]:
data6 = data5.copy()

In [59]:
scaler = StandardScaler()
scaler.fit(data6)

data6 = pd.DataFrame(scaler.transform(data6), index=data6.index, columns=data6.columns)

## Split Data

In [60]:
data7 = data6.copy()

In [62]:
train_final = data7.loc[:raw_train.index.max(),:].copy()
test_final = data7.loc[raw_train.index.max()+1:,:].reset_index(drop=True).copy()

In [104]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

kf = KFold(n_splits=5)

In [97]:
from sklearn.linear_model import LogisticRegression

from catboost import CatBoostClassifier

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

from sklearn.ensemble import VotingClassifier


In [106]:
models = {
#     'logreg': LogisticRegression(),
#     'catboost': CatBoostClassifier(verbose=0),
#     'randomForest': RandomForestClassifier(),
#     'lindis': LinearDiscriminantAnalysis(),
#     'neigh': KNeighborsClassifier(),
#     'Gauss': GaussianNB(),
#     'tree':DecisionTreeClassifier(),
#     'XGBR': XGBClassifier(),
#     'svc': SVC(),
    'eclf1': VotingClassifier(estimators=[('cat', CatBoostClassifier(verbose=0)), 
                                          ('rf', RandomForestClassifier()), 
                                          ('xgbt', XGBClassifier())], 
                              voting='hard'),
    
    'eclf2': VotingClassifier(estimators=[('cat', CatBoostClassifier(verbose=0)), 
                                          ('rf', RandomForestClassifier()), 
                                          ('xgbt', XGBClassifier())], 
                              voting='soft',
                              flatten_transform=True,
                             weights=[2,1,1]),
    'eclf3': VotingClassifier(estimators=[('cat', CatBoostClassifier(verbose=0)), 
                                          ('rf', RandomForestClassifier()), 
                                          ('xgbt', XGBClassifier())])
}

In [94]:
for name, model in models.items():
    model.fit(train_final, target)
    print(name + 'trained')

eclf1trained
eclf2trained


In [95]:
results = {}
for name, model in models.items():
    result = cross_val_score(model, train_final, target, scoring='accuracy', cv=kf)
    results[name] = result
results

{'eclf1': array([0.81005587, 0.79213483, 0.85393258, 0.80337079, 0.87078652]),
 'eclf2': array([0.82122905, 0.80337079, 0.87640449, 0.8258427 , 0.87640449])}

In [96]:
for name, result in results.items():
    print('\n-----\n'+name)
    print(np.mean(result))
    print(np.std(result))


-----
eclf1
0.8260561170045821
0.03065678380021344

-----
eclf2
0.8406503044378884
0.03014278359957501


## Predictions

In [98]:
final_predictions = (
    models['eclf2'].predict(test_final)
)
final_predictions

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

## Exporting Solution

In [101]:
# add the file name as string with .csv
def exportSolution(predict,filename):
    new_data_predictions = pd.DataFrame(predict, columns=['Survived'])
    solution = pd.concat([raw_test['PassengerId'],new_data_predictions],axis=1)
    solution.to_csv(filename,index=False)
    return

In [102]:
exportSolution(final_predictions,'results_titanic_000.csv')

## Tuning

In [107]:
params = {'voting':['hard', 'soft'],
          'weights':[(1,1,1), (2,1,1), 
                     (1,2,1), (1,1,2),
                     (1,2,2), (2,1,2), 
                     (2,2,1)]}

#fit gridsearch & print best params
grid = GridSearchCV(models['eclf3'], params)
grid.fit(train_final, target)
print('\n')
print(f'The best params is : {grid.best_params_}')



The best params is : {'voting': 'hard', 'weights': (2, 1, 1)}


In [108]:
eclf3 = VotingClassifier(estimators=[('cat', CatBoostClassifier(verbose=0)), 
                                          ('rf', RandomForestClassifier()), 
                                          ('xgbt', XGBClassifier())],
                         voting= 'hard', weights= (2, 1, 1))

In [114]:
eclf3.fit(train_final, target)
final_predictions2 = (
    eclf3.predict(test_final)
)
final_predictions2

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [115]:
exportSolution(final_predictions2,'results_titanic_001.csv')

baseline: 0.75837 pos:11578

voting tuning: 0.76555 pos:10690

adjust values - voting tuning: 0.76315

In [118]:
data8 = data4.copy()

In [119]:
data8

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_1st_letter,cabin_1st_letter
0,1.386294,"Braund, Mr. Owen Harris",male,22.0,0.693147,0.000000,A/5 21171,2.110213,,S,A,
1,0.693147,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,0.693147,0.000000,PC 17599,4.280593,C85,C,P,C
2,1.386294,"Heikkinen, Miss. Laina",female,26.0,0.000000,0.000000,STON/O2. 3101282,2.188856,,S,S,
3,0.693147,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,0.693147,0.000000,113803,3.990834,C123,S,1,C
4,1.386294,"Allen, Mr. William Henry",male,35.0,0.000000,0.000000,373450,2.202765,,S,3,
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1.386294,"Spector, Mr. Woolf",male,25.4,0.000000,0.000000,A.5. 3236,2.202765,,S,A,
1305,0.693147,"Oliva y Ocana, Dona. Fermina",female,39.0,0.000000,0.000000,PC 17758,4.699571,C105,C,P,C
1306,1.386294,"Saether, Mr. Simon Sivertsen",male,38.5,0.000000,0.000000,SOTON/O.Q. 3101262,2.110213,,S,S,
1307,1.386294,"Ware, Mr. Frederick",male,25.4,0.000000,0.000000,359309,2.202765,,S,3,


In [120]:
data8 = data8.drop(['Name','Ticket','Cabin','ticket_1st_letter'],axis=1)

In [121]:
data8

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_1st_letter
0,1.386294,male,22.0,0.693147,0.000000,2.110213,S,
1,0.693147,female,38.0,0.693147,0.000000,4.280593,C,C
2,1.386294,female,26.0,0.000000,0.000000,2.188856,S,
3,0.693147,female,35.0,0.693147,0.000000,3.990834,S,C
4,1.386294,male,35.0,0.000000,0.000000,2.202765,S,
...,...,...,...,...,...,...,...,...
1304,1.386294,male,25.4,0.000000,0.000000,2.202765,S,
1305,0.693147,female,39.0,0.000000,0.000000,4.699571,C,C
1306,1.386294,male,38.5,0.000000,0.000000,2.110213,S,
1307,1.386294,male,25.4,0.000000,0.000000,2.202765,S,


In [122]:
data9 = data8.copy()
data9 = pd.get_dummies(data8)
data9

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,cabin_1st_letter_A,cabin_1st_letter_B,cabin_1st_letter_C,cabin_1st_letter_D,cabin_1st_letter_E,cabin_1st_letter_F,cabin_1st_letter_G,cabin_1st_letter_T
0,1.386294,22.0,0.693147,0.000000,2.110213,0,1,0,0,1,0,0,0,0,0,0,0,0
1,0.693147,38.0,0.693147,0.000000,4.280593,1,0,1,0,0,0,0,1,0,0,0,0,0
2,1.386294,26.0,0.000000,0.000000,2.188856,1,0,0,0,1,0,0,0,0,0,0,0,0
3,0.693147,35.0,0.693147,0.000000,3.990834,1,0,0,0,1,0,0,1,0,0,0,0,0
4,1.386294,35.0,0.000000,0.000000,2.202765,0,1,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1.386294,25.4,0.000000,0.000000,2.202765,0,1,0,0,1,0,0,0,0,0,0,0,0
1305,0.693147,39.0,0.000000,0.000000,4.699571,1,0,1,0,0,0,0,1,0,0,0,0,0
1306,1.386294,38.5,0.000000,0.000000,2.110213,0,1,0,0,1,0,0,0,0,0,0,0,0
1307,1.386294,25.4,0.000000,0.000000,2.202765,0,1,0,0,1,0,0,0,0,0,0,0,0


In [123]:
scaler.fit(data9)

data9 = pd.DataFrame(scaler.transform(data9), index=data9.index, columns=data9.columns)

In [124]:
data10 = data9.copy()

In [125]:
train_final2 = data10.loc[:raw_train.index.max(),:].copy()
test_final2 = data10.loc[raw_train.index.max()+1:,:].reset_index(drop=True).copy()

In [126]:
grid2 = GridSearchCV(models['eclf3'], params)
grid2.fit(train_final2, target)
print('\n')
print(f'The best params is : {grid.best_params_}')



The best params is : {'voting': 'hard', 'weights': (2, 1, 1)}


In [127]:
eclf3.fit(train_final2, target)
final_predictions3 = (
    eclf3.predict(test_final2)
)
final_predictions3

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [128]:
exportSolution(final_predictions3,'results_titanic_002.csv')