# Titanic dataset
## Introduction

Dataset yang digunakan merupakan data yang terkumpul dari penumpang kapal Titanic yang tenggelam pada tahun 1912 karena bertabrakan dengan gunung es. Ada beberapa variabel yang perlu dijelaskan dalam dataset tersebut:

##### Survived
Kondisi penumpang tersebut pasca-tabrakan (0=mati, 1=hidup)
##### Pclass
Kelas dari penumpang tersebut
##### SibSp
Jumlah saudara atau pasangan di kapal
##### Parch
Jumlah orang tua atau anak di kapal
##### Embarked
Pelabuhan penumpang tersebut naik (Cherbourg, Queenstown, Southampton)

* Name, Sex, Age, Ticket, dan Fare sudah intuitif

Tujuan dari notebook ini adalah menggunakan machine learning untuk mencari pola dari penumpang yang berhasil bertahan hidup setelah kapal karam. Pendekatannya adalah klasifikasi dengan Logistic Regression, yang kemudian dibandingkan dengan LDA, QDA, dan Gaussian Naive Bayes.

In [None]:
# imports
# essentials
import pandas as pd
import numpy as np
from matplotlib.pyplot import subplots
import seaborn as sns
# models
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# dataset split
from sklearn.model_selection import train_test_split

## Data cleaning and preprocessing

In [None]:
# Load the passenger data and inspect it (column dtypes and non-null counts).
data = pd.read_csv('titanic.csv')
data.info()

In [None]:
# Summary statistics of the loaded data.
data.describe()

In [None]:
# banyak data yang kosong di Age dan Cabin
# ada 2 penumpang tanpa pelabuhan yang tercatat
# sekitar 38.4% penumpang bertahan hidup
# umur penumpang berada di antara 0.4 dan 80
# Sex, Ticket, Cabin, dan Embarked sebaiknya diubah menjadi value numerik supaya dapat dipakai

In [None]:
# Preview the first 10 rows.
data.head(10)

### Age cleaning

In [None]:
# Summary statistics of the Age column.
data['Age'].describe()

In [None]:
# Look for outliers in Age.
# The series is passed by keyword: a bare positional argument is bound to
# different parameters across seaborn releases (positional use was deprecated
# in 0.11), which makes the plot orientation version-dependent. x= always
# draws the box horizontally along the Age axis.
sns.boxplot(x=data['Age'])

In [None]:
# Ages of the passengers older than 65.
# A boolean mask selects rows by label, so the result stays correct even when
# the index is not the default 0..n-1 range (np.where yields positions, not
# labels), and row + column are selected in one .loc call instead of chained
# indexing.
data.loc[data['Age'] > 65, 'Age']

In [None]:
# Inspect the distribution of Age.
sns.histplot(data=data, x='Age')

In [None]:
# Age has a few outliers and a somewhat skewed distribution, so the median
# is used to fill the missing values.
median = data['Age'].median()
age_filled = data['Age'].fillna(median)
data['Age'] = age_filled.astype(int)
# Number of missing ages left after filling.
data['Age'].isna().sum()

### Embarked cleaning

In [None]:
# Only 2 values are missing, so they can be filled with the most frequent value.
data['Embarked'].describe()

In [None]:
# Fill missing ports with the most frequent one. It is computed from the
# column instead of being hard-coded, so the cell stays correct if the data
# changes (for the data described above this is 'S').
most_common_port = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(most_common_port)
# Number of missing ports left after filling.
data['Embarked'].isna().sum()

### Cabin cleaning

In [None]:
# Summary of the Cabin column.
data['Cabin'].describe()

In [None]:
# Only 204 of 891 rows have a Cabin value.
# That is too few to be useful, so the column is dropped.
data = data.drop('Cabin', axis='columns')
data.info()

### Preprocessing lainnya
- Beberapa variabel seperti Sex dan Embarked sebaiknya diubah menjadi nilai numerik
- PassengerId, Name, dan Ticket tidak relevan untuk prediksi sehingga dapat di-drop
- Fare dan Age dapat dikategorikan
- SibSp dan Parch dapat digabungkan menjadi Fam (jumlah anggota keluarga)

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
sex = dict(male=0, female=1)
sex_codes = data['Sex'].map(sex)
data['Sex'] = sex_codes
sex_codes.describe()

In [None]:
# Encode Embarked numerically: S -> 0, C -> 1, Q -> 2.
emb = dict(S=0, C=1, Q=2)
port_codes = data['Embarked'].map(emb)
data['Embarked'] = port_codes
port_codes.describe()

In [None]:
# Remove the columns that are not used as features.
unused_columns = ['PassengerId', 'Name', 'Ticket']
data = data.drop(unused_columns, axis='columns')
data.info()

In [None]:
# Bucket Age into six ordinal groups in a single pass.
# The previous chain of in-place masked writes only worked because every
# category code happened to be below the first threshold; pd.cut classifies
# each original value exactly once instead.
# Intervals are right-closed, matching the original thresholds:
#   0: (-inf, 10]  1: (10, 18]  2: (18, 27]  3: (27, 35]  4: (35, 50]  5: (50, inf)
age_edges = [-np.inf, 10, 18, 27, 35, 50, np.inf]
data['Age'] = pd.cut(data['Age'], bins=age_edges, labels=False).astype(int)
data['Age'].value_counts()

In [None]:
# Bucket Fare into six ordinal groups in a single pass.
# pd.cut classifies each original value exactly once, rather than relying on
# the order of successive in-place masked writes.
# Intervals are right-closed, matching the original thresholds:
#   0: (-inf, 8]  1: (8, 12]  2: (12, 25]  3: (25, 50]  4: (50, 100]  5: (100, inf)
fare_edges = [-np.inf, 8, 12, 25, 50, 100, np.inf]
data['Fare'] = pd.cut(data['Fare'], bins=fare_edges, labels=False).astype(int)
data['Fare'].value_counts()

In [None]:
# Combine SibSp and Parch into Fam (number of family members aboard),
# then remove the two source columns.
fam_parts = ['SibSp', 'Parch']
data['Fam'] = data['SibSp'].add(data['Parch'])
data = data.drop(fam_parts, axis='columns')
data['Fam'].describe()

In [None]:
# hasil setelah preprocessing
# Result after preprocessing.
data.head()

## Algoritma Machine Learning

In [None]:
# Split the dataset into training and test sets.
# SibSp and Parch were merged into Fam and dropped during preprocessing, so
# selecting them here raised a KeyError; only the remaining columns are used.
feature_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Fam']
X = data.loc[:, feature_cols]
# Kept as a one-column DataFrame: the model cells call y_train.values.ravel().
y = data.loc[:, ['Survived']]
# Fixed seed so the split, and every accuracy computed from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

### Statsmodels Logistic Regression

In [None]:
# statsmodels' Logit does not add an intercept on its own, so a constant
# column is prepended explicitly; without it the fitted log-odds are forced
# through the origin. X_train itself is left unchanged.
logreg = sm.Logit(y_train, sm.add_constant(X_train)).fit()

In [None]:
# Summary table of the fitted Logit model.
logreg.summary()

In [None]:
# Fitted coefficients.
logreg.params

In [None]:
# p-values of the fitted coefficients.
logreg.pvalues

In [None]:
# Square roots of the diagonal of the parameter covariance matrix,
# i.e. the standard errors of the coefficients.
np.sqrt(np.diag(logreg.cov_params()))

### SciKit Models

In [None]:
# Fit scikit-learn's logistic regression on the training split.
logReg = LogisticRegression().fit(X_train, y_train.values.ravel())
y_pred = logReg.predict(X_test)

# acc_sco: accuracy on the test split; acc_log: accuracy on the training
# split. Both are percentages rounded to two decimals.
acc_sco = round(100 * accuracy_score(y_test, y_pred), 2)
acc_log = round(100 * logReg.score(X_train, y_train), 2)
for pct in (acc_sco, acc_log):
    print(pct, '%')

# Display the fitted coefficients.
logReg.coef_

In [None]:
# Fit linear discriminant analysis on the training split.
lda = LDA().fit(X_train, y_train.values.ravel())
y_pred = lda.predict(X_test)

# acc_sco: accuracy on the test split; acc_log: accuracy on the training
# split. Both are percentages rounded to two decimals.
acc_sco = round(100 * accuracy_score(y_test, y_pred), 2)
acc_log = round(100 * lda.score(X_train, y_train), 2)
for pct in (acc_sco, acc_log):
    print(pct, '%')

In [None]:
# Fit quadratic discriminant analysis on the training split.
qda = QDA().fit(X_train, y_train.values.ravel())
y_pred = qda.predict(X_test)

# acc_sco: accuracy on the test split; acc_log: accuracy on the training
# split. Both are percentages rounded to two decimals.
acc_sco = round(100 * accuracy_score(y_test, y_pred), 2)
acc_log = round(100 * qda.score(X_train, y_train), 2)
for pct in (acc_sco, acc_log):
    print(pct, '%')

In [None]:
# Fit Gaussian naive Bayes on the training split.
naBa = GaussianNB().fit(X_train, y_train.values.ravel())
y_pred = naBa.predict(X_test)

# acc_sco: accuracy on the test split; acc_log: accuracy on the training
# split. Both are percentages rounded to two decimals.
acc_sco = round(100 * accuracy_score(y_test, y_pred), 2)
acc_log = round(100 * naBa.score(X_train, y_train), 2)
for pct in (acc_sco, acc_log):
    print(pct, '%')