In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [53]:
train=pd.read_csv('titanic.csv')
test=pd.read_csv('test.csv')

In [54]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [55]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [56]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [57]:
train['Ticket'].value_counts().count()

681

In [58]:
train.tail(100)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
791,792,0,2,"Gaskell, Mr. Alfred",male,16.0,0,0,239865,26.0000,,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.5500,,S
793,794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C
794,795,0,3,"Dantcheff, Mr. Ristiu",male,25.0,0,0,349203,7.8958,,S
795,796,0,2,"Otter, Mr. Richard",male,39.0,0,0,28213,13.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [59]:
train.groupby(['Ticket'])['Cabin'].value_counts()

Ticket             Cabin
110152             B77      2
                   B79      1
110413             E67      2
                   E68      1
110465             A14      1
                           ..
SC/AH Basle 541    D        1
SC/Paris 2163      D        1
SOTON/O.Q. 392078  E10      1
W.E.P. 5734        E31      1
WE/P 5735          B22      2
Name: Cabin, Length: 161, dtype: int64

In [60]:
train.groupby(['Cabin'])['Ticket'].value_counts()

Cabin  Ticket  
A10    13049       1
A14    110465      1
A16    11755       1
A19    113056      1
A20    PC 17485    1
                  ..
F38    383121      1
F4     230136      2
G6     347054      2
       PP 9549     2
T      113784      1
Name: Ticket, Length: 161, dtype: int64

In [61]:
train.shape

(891, 12)

In [62]:
test.shape

(418, 11)

In [63]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [64]:
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [65]:
y=train['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [66]:
PassengerId=test['PassengerId']
PassengerId

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [67]:
train.drop(columns=['Survived'],inplace=True)

In [68]:
train.shape

(891, 11)

In [69]:
test.shape

(418, 11)

In [70]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [71]:
def pre_process(df):
    
    processed_df = df
        
    ########## Deal with missing values ##########
    
    # As we saw before, the two missing values for embarked columns can be replaced by 'C' (Cherbourg)
    processed_df['Embarked'].fillna('C', inplace=True)
    
    # We replace missing ages by the mean age of passengers who belong to the same group of class/sex/family
    processed_df['Age'] = processed_df.groupby(['Pclass','Sex','Parch','SibSp'])['Age'].transform(lambda x: x.fillna(x.mean()))
    processed_df['Age'] = processed_df.groupby(['Pclass','Sex','Parch'])['Age'].transform(lambda x: x.fillna(x.mean()))
    processed_df['Age'] = processed_df.groupby(['Pclass','Sex'])['Age'].transform(lambda x: x.fillna(x.mean()))
    
    # We replace the only missing fare value for test dataset and the missing values of the cabin column
    processed_df['Fare'] = processed_df['Fare'].interpolate()
    processed_df['Cabin'].fillna('U', inplace=True)
    
    ########## Feature engineering on columns ##########
    
    # Create a Title column from name column
    processed_df['Title'] = pd.Series((name.split('.')[0].split(',')[1].strip() for name in train_df_raw['Name']), index=processed_df.index)
    processed_df['Title'] = processed_df['Title'].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    processed_df['Title'] = processed_df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    processed_df['Title'] = processed_df['Title'].replace('Mme', 'Mrs')
    processed_df['Title'] = processed_df['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5})
    
    # Filling Age missing values with mean age of passengers who have the same title
    processed_df['Age'] = processed_df.groupby(['Title'])['Age'].transform(lambda x: x.fillna(x.mean()))

    # Transform categorical variables to numeric variables
    processed_df['Sex'] = processed_df['Sex'].map({'male': 0, 'female': 1})
    processed_df['Embarked'] = processed_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    
    # Create a Family Size, Is Alone, Child and Mother columns
    processed_df['FamillySize'] = processed_df['SibSp'] + processed_df['Parch'] + 1
    processed_df['FamillySize'][processed_df['FamillySize'].between(1, 5, inclusive=False)] = 2
    processed_df['FamillySize'][processed_df['FamillySize']>5] = 3
    processed_df['IsAlone'] = np.where(processed_df['FamillySize']!=1, 0, 1)
    processed_df['IsChild'] = processed_df['Age'] < 18
    processed_df['IsChild'] = processed_df['IsChild'].astype(int)
    
    
    
    # Modification of cabin column to keep only the letter contained corresponding to the deck of the boat
    processed_df['Cabin'] = processed_df['Cabin'].str[:1]
    processed_df['Cabin'] = processed_df['Cabin'].map({cabin: p for p, cabin in enumerate(set(cab for cab in processed_df['Cabin']))})
    
    
    # These two columns are not useful anymore
    processed_df = processed_df.drop(['Name', 'Ticket', 'PassengerId'], 1)    
    
    return processed_df


In [72]:
train_df_raw=train
train=pre_process(train)
train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,FamillySize,IsAlone,IsChild
0,3,0,22.000000,1,0,7.2500,2,0,1,2,0,0
1,1,1,38.000000,1,0,71.2833,7,1,3,2,0,0
2,3,1,26.000000,0,0,7.9250,2,0,2,1,1,0
3,1,1,35.000000,1,0,53.1000,7,0,3,2,0,0
4,3,0,35.000000,0,0,8.0500,2,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,27.000000,0,0,13.0000,2,0,5,1,1,0
887,1,1,19.000000,0,0,30.0000,4,0,2,1,1,0
888,3,1,14.346939,1,2,23.4500,2,0,2,2,0,1
889,1,0,26.000000,0,0,30.0000,7,1,1,1,1,0


In [73]:
train_df_raw=test
test=pre_process(test)
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,FamillySize,IsAlone,IsChild
0,3,0,34.500000,0,0,7.8292,2,2,1,1,1,0
1,3,1,47.000000,1,0,7.0000,2,0,3,2,0,0
2,2,0,62.000000,0,0,9.6875,2,2,1,1,1,0
3,3,0,27.000000,0,0,8.6625,2,0,1,1,1,0
4,3,1,22.000000,1,1,12.2875,2,0,3,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,26.594203,0,0,8.0500,2,0,1,1,1,0
414,1,1,39.000000,0,0,108.9000,6,1,5,1,1,0
415,3,0,38.500000,0,0,7.2500,2,0,1,1,1,0
416,3,0,26.594203,0,0,8.0500,2,0,1,1,1,0


In [74]:
final=train.append(test)

In [75]:
final.head(892)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,FamillySize,IsAlone,IsChild
0,3,0,22.000000,1,0,7.2500,2,0,1,2,0,0
1,1,1,38.000000,1,0,71.2833,7,1,3,2,0,0
2,3,1,26.000000,0,0,7.9250,2,0,2,1,1,0
3,1,1,35.000000,1,0,53.1000,7,0,3,2,0,0
4,3,0,35.000000,0,0,8.0500,2,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
887,1,1,19.000000,0,0,30.0000,4,0,2,1,1,0
888,3,1,14.346939,1,2,23.4500,2,0,2,2,0,1
889,1,0,26.000000,0,0,30.0000,7,1,1,1,1,0
890,3,0,32.000000,0,0,7.7500,2,2,1,1,1,0


In [76]:
final=pd.get_dummies(final, columns=['Pclass','Sex','Embarked','FamillySize','IsAlone','IsChild'],drop_first=True)

In [77]:
final

Unnamed: 0,Age,SibSp,Parch,Fare,Cabin,Title,Pclass_2,Pclass_3,Sex_1,Embarked_1,Embarked_2,FamillySize_2,FamillySize_3,FamillySize_5,IsAlone_1,IsChild_1
0,22.000000,1,0,7.2500,2,1,0,1,0,0,0,1,0,0,0,0
1,38.000000,1,0,71.2833,7,3,0,0,1,1,0,1,0,0,0,0
2,26.000000,0,0,7.9250,2,2,0,1,1,0,0,0,0,0,1,0
3,35.000000,1,0,53.1000,7,3,0,0,1,0,0,1,0,0,0,0
4,35.000000,0,0,8.0500,2,1,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,26.594203,0,0,8.0500,2,1,0,1,0,0,0,0,0,0,1,0
414,39.000000,0,0,108.9000,6,5,0,0,1,1,0,0,0,0,1,0
415,38.500000,0,0,7.2500,2,1,0,1,0,0,0,0,0,0,1,0
416,26.594203,0,0,8.0500,2,1,0,1,0,0,0,0,0,0,1,0


In [78]:
X=final.head(891).values

In [79]:
X.shape

(891, 16)

In [80]:
y.shape

(891,)

In [81]:
Xf=final.tail(418).values

In [82]:
Xf.shape

(418, 16)

In [83]:
891+418

1309

In [84]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [85]:
X_train.shape

(712, 16)

In [86]:
y_train.shape

(712,)

In [87]:
X_test.shape

(179, 16)

In [88]:
y_test.shape

(179,)

In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn. ensemble import VotingClassifier

In [90]:
lr = LogisticRegression(solver='lbfgs', random_state=42)
svm = SVC(kernel = 'poly', degree = 2, random_state=42 )
rf=RandomForestClassifier(n_estimators=300, max_depth=11, random_state=42)

In [91]:
evc = VotingClassifier( estimators= [('lr',lr),('rf',rf),('svm',svm)], voting = 'hard')

In [92]:
evc.fit(X_train,y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                            

In [93]:
evc.score(X_test, y_test)

0.8435754189944135

In [94]:
y_pred_evc=evc.predict(X_test)

In [95]:
accuracy_score(y_test,y_pred_evc)

0.8435754189944135

In [96]:
yf_evc=evc.predict(Xf)
yf_evc

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [97]:
submission=pd.DataFrame()

In [98]:
submission['PassengerId']=PassengerId
submission['Survived']=yf_evc

In [99]:
submission.to_csv('submission_ensemble.csv',index=False)

In [100]:
submission_evc=pd.read_csv('submission_ensemble.csv')

In [101]:
submission_evc

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
