In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set the default seaborn colour palette to "YlGnBu" for the plots in this notebook.
sns.set_palette("YlGnBu")

In [None]:
# Load the training set, the test set and the sample submission file.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_gender_submission = pd.read_csv('gender_submission.csv')

# Flag the origin of every row so the combined frame can be split again later.
df_train['train_test'] = 1
df_test['train_test'] = 0

# Placeholder target for the test rows so both frames share the same columns.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_test['Survived'] = np.nan

# Stack both sets so preprocessing can be applied to train and test in one go.
df_all = pd.concat([df_train, df_test])

df_all.columns

In [None]:
# Show the training set loaded from train.csv; at this point it also carries
# the helper column `train_test` (set to 1 in the loading cell).

df_train

In [None]:
# Show the test set loaded from test.csv; the loading cell added `train_test` (0)
# and a `Survived` column filled with NaN.

df_test

In [None]:
# Show the sample submission loaded from gender_submission.csv.

df_gender_submission

# Exploratory Data Analysis

In [None]:
# Report shape and column names of the three loaded frames.

# Column names as plain lists so they print without the pandas Index wrapper.
train_columns = list(df_train.columns)
test_columns = list(df_test.columns)
gender_submission_columns = list(df_gender_submission.columns)

train_rows, train_cols = df_train.shape
print(f'Loaded train dataset with shape {df_train.shape} ({train_rows} rows and {train_cols} columns) and column names: \n{train_columns}')

test_rows, test_cols = df_test.shape
print(f'\nLoaded test dataset with shape {df_test.shape} ({test_rows} rows and {test_cols} columns) and column names: \n{test_columns}')

submission_rows, submission_cols = df_gender_submission.shape
print(f'\nLoaded sample submission dataset with shape {df_gender_submission.shape} ({submission_rows} rows and {submission_cols} columns) and column names: \n{gender_submission_columns}')

## Train dataset

In [None]:
# Preview the first rows of the training set.
df_train.head()

In [None]:
# Print the column dtypes and non-null counts of the training set.
df_train.info()

In [None]:
# Summary statistics of the training set.
df_train.describe()

## Survival ratio

In [None]:
# Count each outcome once; the counts are looked up by label (1 = survived, 0 = died).
df_survived = df_train['Survived']
outcome_counts = df_survived.value_counts()
n_survivors = outcome_counts[1]
n_mortalities = outcome_counts[0]
survival_pct = n_survivors / (n_survivors + n_mortalities) * 100

print(f'There were {n_survivors} survivors and {n_mortalities} mortalities in the train set.')
print(f'Making the chance of survival {survival_pct}%')

sns.countplot(x=df_survived)
plt.title('Distribution of survival or mortality')
plt.show()

## Name

In [None]:
# Extract the honorific from the passenger names: the text between the first
# comma and the following period.
# NOTE(review): assumes the usual "Surname, Title. Given names" layout, where a
# space follows the comma. Without `.str.strip()` the values would then be
# ' Mr' (leading space) and the `== title` comparison below would match no
# rows, yielding NaN as the survival ratio.
df_train['Title'] = (
    df_train['Name']
    .str.split(',', expand=True)[1]
    .str.split('.', expand=True)[0]
    .str.strip()
)

# List the six most frequent titles
titles = df_train['Title'].value_counts().head(6)

print(titles)

for title in ['Mr']:
    survival_ratio = df_train.loc[df_train['Title'] == title, 'Survived'].mean()
    print(f"Survival ratio for {title}: {survival_ratio}")

In [None]:
# Frequency table of the six most common titles.
# The columns are named explicitly via `rename_axis` / `reset_index(name=...)`
# instead of renaming whatever `value_counts()` produced: pandas 2.0 changed the
# default names ('index'/'Title' became 'Title'/'count'), which made the previous
# rename mapping label the columns incorrectly.
(df_train['Title'].value_counts()
                .rename_axis('Title')
                .reset_index(name='Frequency')
                .iloc[:6])

## Ticket Class

In [None]:
# Passengers per ticket class, looked up by class label (1, 2, 3).
df_pclass = df_train['Pclass']
class_counts = df_pclass.value_counts().sort_index()
first_class, second_class, third_class = class_counts[1], class_counts[2], class_counts[3]

print('Passengers were split into three Ticket Classes and hereby the placement on the ship deck:')
print(f'There were {first_class} people on the upper deck.')
print(f'There were {second_class} people on the middle deck.')
print(f'There were {third_class} people on the lower deck')

sns.countplot(x=df_pclass)
plt.title('Distribution of ticket classes')
plt.show()

## Passenger sex

In [None]:
# Passengers per sex.
# The counts are looked up by label: the index of `value_counts()` holds the
# strings 'male' / 'female', and integer keys on a non-integer index rely on a
# positional fallback that pandas has deprecated.
df_sex = df_train['Sex']
sex_counts = df_sex.value_counts()
print(f"There were {sex_counts['male']} males aboard.")
print(f"There were {sex_counts['female']} females aboard.")

sns.countplot(x=df_sex)
plt.title('Distribution of passenger sex')
plt.show()

## Passenger age

In [None]:
# Passengers per age bracket. Comparisons with NaN are False, so passengers
# without a recorded age fall into none of the brackets; they are reported
# separately so that the printed counts account for every row.
df_age = df_train['Age']
n_under_25 = (df_age < 25).sum()
n_25_to_65 = ((df_age >= 25) & (df_age <= 65)).sum()
n_over_65 = (df_age > 65).sum()
n_age_missing = df_age.isna().sum()

print(f'There were {n_under_25} passengers under the age of 25.')
print(f'There were {n_25_to_65} passengers between the age of 25 and 65.')
print(f'There were {n_over_65} passengers older than 65.')
print(f'The age of {n_age_missing} passengers is not recorded.')

sns.histplot(data=df_age)
plt.title('Distribution of passenger age')
plt.show()

## Number of siblings/spouses

In [None]:
# Number of passengers whose SibSp value is 0, looked up by label.
df_sibsp = df_train['SibSp']
sibsp_counts = df_sibsp.value_counts().sort_index()
n_without_sibsp = sibsp_counts.loc[0]
print(f'There were {n_without_sibsp} passengers with no siblings or spouses.')

sns.countplot(x=df_sibsp)
plt.title('Distribution of number of siblings/spouses aboard')
plt.show()

## Number of parents/children

In [None]:
# Number of passengers whose Parch value is 0, looked up by label.
df_parch = df_train['Parch']
parch_counts = df_parch.value_counts().sort_index()
n_without_parch = parch_counts.loc[0]
print(f'There were {n_without_parch} passengers with no parents or children.')

sns.countplot(x=df_parch)
plt.title('Distribution of number of parents/children aboard')
plt.show()

## Tickets

In [None]:
df_ticket = df_train['Ticket']

# For every distinct ticket value: the number of rows that carry it.
rows_per_ticket = df_ticket.value_counts()

sns.histplot(data=rows_per_ticket)
plt.title('Distribution of people per ticket')
plt.show()

## Fare

In [None]:
# Passengers per fare bracket (below 10, 10 to 50 inclusive, above 50).
# The messages were reworded: "passengers payed" was misspelled and ungrammatical.
df_fare = df_train['Fare']
n_below_10 = (df_fare < 10).sum()
n_10_to_50 = ((df_fare >= 10) & (df_fare <= 50)).sum()
n_above_50 = (df_fare > 50).sum()

print(f'There were {n_below_10} passengers who paid less than 10 dollars for their ticket.')
print(f'There were {n_10_to_50} passengers who paid between 10 and 50 dollars for their ticket.')
print(f'There were {n_above_50} passengers who paid more than 50 dollars for their ticket.')

sns.histplot(data=df_fare)
plt.title('Distribution of fares')
plt.show()

## Cabin

In [None]:
# Flag per passenger: 1 if the Cabin value is present, 0 if it is missing.
df_cabin = df_train['Cabin'].notna().astype('int64')
cabin_flag_counts = df_cabin.value_counts().sort_index()
n_with_cabin = cabin_flag_counts.loc[1]
n_without_cabin = cabin_flag_counts.loc[0]

print(f'There were {n_with_cabin} passengers who had a cabin.')
print(f'There were {n_without_cabin} passengers who did not have a cabin.')

sns.countplot(x=df_cabin)
plt.title('Distribution of number of passengers with a cabin')
plt.show()

## Port of Embarkation

In [None]:
# Passengers per port of embarkation.
# The counts are looked up by port code ('S', 'C', 'Q') rather than by position:
# integer keys on a string index rely on a deprecated positional fallback and
# silently depend on the sort order of the codes.
df_port = df_train['Embarked']
port_counts = df_port.value_counts()
print(f"There were {port_counts['S']} passengers boarding the ship at Southampton.")
print(f"There were {port_counts['C']} passengers boarding the ship at Cherbourg.")
print(f"There were {port_counts['Q']} passengers boarding the ship at Queenstown.")

sns.countplot(x=df_port)
# The previous title was copied from the cabin plot and did not describe this figure.
plt.title('Distribution of port of embarkation')
plt.show()

## Survival rate factors

In [None]:
# Mean of `Survived` per sex, split by ticket class (bar height = survival rate).
sns.catplot(data=df_train, x="Sex", y="Survived", hue="Pclass", kind="bar")
# Title spelling corrected ("passanger" -> "passenger").
plt.title('Survival rate based on sex and passenger class')
plt.show()

# Feature selection

In [None]:
# NOTE: this cell overwrites columns of df_train in place; re-run the loading
# cell before executing it a second time.

# Replace the cabin identifier by a flag: 1 if a cabin is recorded, 0 otherwise.
df_train['Cabin'] = df_train['Cabin'].notna().astype(int)

# Encode sex numerically: male -> 0, female -> 1.
df_train['Sex'] = df_train['Sex'].map({'male': 0, 'female': 1})

# One-hot encode the port of embarkation by hand (the final processing uses
# pd.get_dummies() instead). Rows with a missing port compare as False and
# therefore get 0 in all three columns.
# The column name 'embarked_Cherbough' (sic) is kept because the correlation
# cells below select it by that name.
df_train['embarked_Southampton'] = (df_train['Embarked'] == 'S').astype(int)
df_train['embarked_Cherbough'] = (df_train['Embarked'] == 'C').astype(int)
df_train['embarked_Queenstown'] = (df_train['Embarked'] == 'Q').astype(int)

df_train = df_train.drop('Embarked', axis=1)

# Deliberately no blanket `replace(np.nan, 0)` any more: it turned every
# missing Age into an age of 0 and thereby distorted the Age correlations.
# `DataFrame.corr()` skips missing values pairwise, so Age can stay NaN here.

In [None]:
# Correlation matrix of the numerical columns
correlation_columns = [
    'PassengerId',
    'Survived',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Cabin',
    'Pclass',
    'embarked_Southampton',
    'embarked_Cherbough',
    'embarked_Queenstown',
]

df_train[correlation_columns].corr()

In [None]:
# Heatmap of the correlation matrix of the training data columns

heatmap_columns = [
    'PassengerId',
    'Survived',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Cabin',
    'Pclass',
    'embarked_Southampton',
    'embarked_Cherbough',
    'embarked_Queenstown',
]
correlations = df_train[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(12, 8))

# annot=True writes the coefficient into every cell.
sns.heatmap(correlations, linewidths=1, cmap=plt.cm.Blues, annot=True, ax=ax)

plt.title('Heatmap for correlation between columns of training data')

# Final Processing

In [None]:
# Preview the combined train + test frame before the final processing.
df_all.head()

In [None]:
# NOTE: this cell overwrites df_all; re-run the loading cell before executing
# it a second time.

# Drop the rows without a port of embarkation.
df_all = df_all.dropna(subset=['Embarked'])

# Replace the cabin identifier by a flag: 1 if a cabin is recorded, 0 otherwise.
df_all['Cabin'] = df_all['Cabin'].notna().astype(int)

# Encode sex as one numeric column: male -> 0, female -> 1. Because the column
# is numeric, get_dummies() below leaves it as a single 0/1 feature.
df_all['Sex'] = df_all['Sex'].map({'male': 0, 'female': 1})

# Fill missing Age / Fare values: several of the estimators used below
# (e.g. GaussianNB, LogisticRegression, KNN, SVC) reject NaN input.
# The medians come from the training rows only, so no test-set information
# leaks into the features. This is a no-op for a column without missing values.
is_train_row = df_all['train_test'] == 1
for column in ['Age', 'Fare']:
    df_all[column] = df_all[column].fillna(df_all.loc[is_train_row, column].median())

# Remove the identifier-like columns that are not used as features.
# (The former title extraction was removed: the column was dropped again
# immediately, and 'Name' was listed twice in the drop list.)
df_all = df_all.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

# Treat the ticket class as categorical so that get_dummies() one-hot encodes it.
df_all['Pclass'] = df_all['Pclass'].astype(str)

feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'train_test']
df_all_dummies = pd.get_dummies(df_all[feature_columns])

# Split back into train and test via the helper flag, then drop the flag.
X_train = df_all_dummies[df_all_dummies.train_test == 1].drop(['train_test'], axis=1)
X_test = df_all_dummies[df_all_dummies.train_test == 0].drop(['train_test'], axis=1)

y_train = df_all.loc[df_all['train_test'] == 1, 'Survived']

# No missing value may reach the estimators.
assert not X_train.isna().any().any(), 'X_train still contains missing values'
assert not X_test.isna().any().any(), 'X_test still contains missing values'

print(f'{X_train.shape}, {X_test.shape}, {y_train.shape}')

# Model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

## Naive Bayes

In [None]:
# Gaussian Naive Bayes, scored with 5-fold cross-validation on the training data.
gnb = GaussianNB()
cv = cross_val_score(estimator=gnb, X=X_train, y=y_train, cv=5)

# Per-fold scores, followed by their mean.
print(cv, cv.mean(), sep='\n')

In [None]:
# Logistic regression (iteration limit raised to 2000), scored with 5-fold
# cross-validation on the training data.
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5)

# Per-fold scores, followed by their mean.
print(cv, cv.mean(), sep='\n')

In [None]:
# Decision tree with a fixed seed, scored with 5-fold cross-validation on the
# training data.
dt = tree.DecisionTreeClassifier(random_state=42)
cv = cross_val_score(estimator=dt, X=X_train, y=y_train, cv=5)

# Per-fold scores, followed by their mean.
print(cv, cv.mean(), sep='\n')

In [None]:
# k-nearest-neighbours classifier with default settings, scored with 5-fold
# cross-validation on the training data.
knn = KNeighborsClassifier()
cv = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=5)

# Per-fold scores, followed by their mean.
print(cv, cv.mean(), sep='\n')

In [None]:
# Random forest, scored with 5-fold cross-validation on the training data.
# The seed is fixed (as for the decision tree and XGBoost models) so that the
# scores are reproducible across runs; an unseeded forest gives different
# results every time the cell is executed.
rf = RandomForestClassifier(random_state=42)

cv = cross_val_score(rf,
                     X_train,
                     y_train,
                     cv=5)

# Per-fold scores, followed by their mean.
print(cv)
print(cv.mean())

In [None]:
# Support vector classifier, scored with 5-fold cross-validation on the
# training data. `probability=True` is needed by the soft-voting ensemble; the
# probability estimates are fitted with an internal, randomised
# cross-validation, so the seed is fixed to keep them reproducible.
svc = SVC(probability=True, random_state=42)

cv = cross_val_score(svc,
                     X_train,
                     y_train,
                     cv=5)

# Per-fold scores, followed by their mean.
print(cv)
print(cv.mean())

In [None]:
# XGBoost classifier with a fixed seed, scored with 5-fold cross-validation on
# the training data.
xgb = XGBClassifier(random_state=42)
cv = cross_val_score(estimator=xgb, X=X_train, y=y_train, cv=5)

# Per-fold scores, followed by their mean.
print(cv, cv.mean(), sep='\n')

In [None]:
# Soft-voting ensemble over the seven models defined in the cells above.
ensemble_members = [
    ('lr', lr),
    ('knn', knn),
    ('rf', rf),
    ('gnb', gnb),
    ('dt', dt),
    ('svc', svc),
    ('xgb', xgb),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

cv = cross_val_score(estimator=voting_clf, X=X_train, y=y_train, cv=5)

# Per-fold scores, followed by their mean.
print(cv, cv.mean(), sep='\n')

## Baseline submission of best performing model

In [None]:
# Fit the ensemble on the full training data and predict the test set.
voting_clf.fit(X_train, y_train)

y_hat_baseline = voting_clf.predict(X_test).astype(int)

# Predictions are paired with the test-set ids by position, so both must have
# the same length (this would fail if preprocessing had dropped test rows).
assert len(y_hat_baseline) == len(df_test), 'number of predictions does not match the number of test passengers'

# Build the submission as a DataFrame (previously left as a plain dict because
# the conversion was commented out).
baseline_submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'].to_numpy(),
    'Survived': y_hat_baseline,
})


In [None]:
# Number of predictions produced for the test set.
y_hat_baseline.size

In [None]:
# Show the test feature matrix that the predictions were made on.
X_test

# Model performance

|Model|Baseline|Scaled Performance|Scaled and Tuned Performance|
|--|--|--|--|
|Naive Bayes|77.1%|NA| |
|Logistic Regression|79.3%|82.6%| |
|Decision Tree Classifier|75.4%|NA| |
|KNN Classifier|71.3%|83.0%| |
|Random Forest Classifier|78.8%|83.6%| |
|Support Vector Classifier|68.9%|83.2%| |
|Extreme Gradient Boosting (XGBoost)|79.2%|85.3%| |
|Voting Classifier|**80.6%**|61%| |