In [139]:
import pandas as pd

data = pd.read_csv(r'''/home/sagar/GitHub-Repos/Titanic_Wreck/train.csv''')

In [140]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [141]:
data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [142]:
training_data = data.drop(columns=['Cabin', 'PassengerId'])

In [143]:
training_data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Embarked      2
dtype: int64

In [144]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')


In [145]:
# impute column 'Age'
training_data['Age'] = pd.DataFrame(imp_mean.fit_transform(training_data[['Age']]))

In [146]:
training_data.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    2
dtype: int64

In [147]:
# drop rows with na values; column - Embarked
training_data = training_data.dropna()

In [148]:
training_data.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [149]:
training_data.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [153]:
from sklearn.preprocessing import LabelEncoder
training_num = training_data.select_dtypes(include=np.number)
training_str = training_data.select_dtypes(exclude=np.number)
# training_str = pd.get_dummies(training_str)
training_str = training_str.apply(LabelEncoder().fit_transform)
training_data = pd.concat([training_num, training_str], axis=1)

In [154]:
training_data.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Name', 'Sex',
       'Ticket', 'Embarked'],
      dtype='object')

In [155]:
training_str

Unnamed: 0,Name,Sex,Ticket,Embarked
0,108,1,522,2
1,190,0,595,0
2,353,0,668,2
3,272,0,48,2
4,15,1,471,2
...,...,...,...,...
886,547,1,100,2
887,303,0,14,2
888,412,0,674,2
889,81,1,8,0


In [80]:
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import LabelEncoder
# # training_str.apply(LabelEncoder().fit_transform)
# enc = OneHotEncoder()
# enc.fit(training_str)
# pd.DataFrame(enc.transform(training_str), columns=training_str.columns)

In [156]:
training_data

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Embarked
0,0,3,22.000000,1,0,7.2500,108,1,522,2
1,1,1,38.000000,1,0,71.2833,190,0,595,0
2,1,3,26.000000,0,0,7.9250,353,0,668,2
3,1,1,35.000000,1,0,53.1000,272,0,48,2
4,0,3,35.000000,0,0,8.0500,15,1,471,2
...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,547,1,100,2
887,1,1,19.000000,0,0,30.0000,303,0,14,2
888,0,3,29.699118,1,2,23.4500,412,0,674,2
889,1,1,26.000000,0,0,30.0000,81,1,8,0


In [157]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(training_data.drop(columns='Survived'), training_data['Survived'],
                                                 test_size = 0.3, shuffle=True)

In [158]:
print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape)

(622, 9) (267, 9) (622,) (267,)


### Building Classifiers and performance evaluation

In [159]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

In [185]:
## Fit classifiers and evaluate performance

for name, clf in zip(names, classifiers):
    clf.fit(X_train, Y_train)
    score = clf.score(X_val, Y_val)
    print(name, ' : ', score)

Nearest Neighbors  :  0.602996254681648
Linear SVM  :  0.7677902621722846
RBF SVM  :  0.6404494382022472
Gaussian Process  :  0.6629213483146067
Decision Tree  :  0.7940074906367042
Random Forest  :  0.7827715355805244
Neural Net  :  0.6666666666666666
AdaBoost  :  0.7752808988764045
Naive Bayes  :  0.7565543071161048
QDA  :  0.7940074906367042


In [161]:
model_ada = AdaBoostClassifier().fit(X_train, Y_train)

In [186]:
model_DT = DecisionTreeClassifier(max_depth=5).fit(X_train, Y_train)

In [162]:
model_ada.score(X_val, Y_val)

0.7752808988764045

In [188]:
model_DT.score(X_val, Y_val)

0.7940074906367042

In [163]:
## Test data

testing_data = pd.read_csv(r'''/home/sagar/GitHub-Repos/Titanic_Wreck/test.csv''')

In [164]:
testing_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [165]:
passenger_id = testing_data[['PassengerId']]

In [166]:
testing_data.drop(columns=['Cabin', 'PassengerId'], inplace=True)

In [167]:
testing_data.isnull().sum()

Pclass       0
Name         0
Sex          0
Age         86
SibSp        0
Parch        0
Ticket       0
Fare         1
Embarked     0
dtype: int64

In [168]:
testing_data['Age'] = pd.DataFrame(imp_mean.fit_transform(testing_data[['Age']]))
testing_data['Fare'] = pd.DataFrame(imp_mean.fit_transform(testing_data[['Fare']]))

In [169]:
testing_data.isnull().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [172]:
testing_num = testing_data.select_dtypes(include=np.number)
testing_str = testing_data.select_dtypes(exclude=np.number)
# testing_str = pd.get_dummies(testing_str)
testing_str = testing_str.apply(LabelEncoder().fit_transform)
testing_data = pd.concat([testing_num, testing_str], axis=1)

In [173]:
testing_str

Unnamed: 0,Name,Sex,Ticket,Embarked
0,206,1,152,1
1,403,0,221,2
2,269,1,73,1
3,408,1,147,2
4,178,0,138,2
...,...,...,...,...
413,353,1,267,2
414,283,0,324,0
415,332,1,346,2
416,384,1,220,2


In [174]:
testing_data.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Name', 'Sex', 'Ticket',
       'Embarked'],
      dtype='object')

In [178]:
len(testing_data)

418

In [195]:
preds_ada = pd.DataFrame(model_ada.predict(testing_data), columns=['Survived'])

In [196]:
pd.concat([passenger_id, preds_ada], axis=1)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [199]:
# pd.concat([passenger_id, preds_ada], axis=1).to_csv(r'''/home/sagar/GitHub-Repos/Titanic_Wreck/submission_ada.csv''')

In [197]:
preds_DT = pd.DataFrame(model_DT.predict(testing_data), columns=['Survived'])
pd.concat([passenger_id, preds_DT], axis=1)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [200]:
# pd.concat([passenger_id, preds_DT], axis=1).to_csv(r'''/home/sagar/GitHub-Repos/Titanic_Wreck/submission_DT.csv''')