In [153]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [155]:
# Load the passenger table; the path is resolved relative to the working directory.
DATA_PATH = 'titanic.csv'
df = pd.read_csv(DATA_PATH)

In [157]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [159]:
# Count missing values per column to see which features need cleaning.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [161]:
# Count rows that are exact duplicates of another row.
df.duplicated().sum()

0

In [163]:
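# Data Preprocessing
# 1. Combine SibSp and Parch into a single Family column
# 2. Drop the columns that are not used as features
# 3. One-hot encode the categorical columns (Embarked, Sex)
# 4. Scale Age
# 5. Impute the missing Age values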

In [165]:
# Family is the row-wise sum of the SibSp and Parch columns.
family_size = df['SibSp'] + df['Parch']
df['Family'] = family_size

In [167]:
# Check the (rows, columns) count after adding the Family column.
df.shape

(418, 13)

In [169]:
# Columns that are not carried forward into the model.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare']
df = df.drop(columns=unused_columns)

In [171]:
# Check the (rows, columns) count after dropping the unused columns.
df.shape

(418, 8)

In [173]:
# Spot-check five random rows. The seed keeps the preview identical across
# re-runs; it matches the random_state used elsewhere in this notebook.
df.sample(5, random_state=1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,Family
35,0,3,male,18.5,0,0,C,0
164,0,2,male,41.0,0,0,S,0
128,0,2,male,42.0,0,0,S,0
68,0,1,male,31.0,0,0,C,0
305,1,1,female,64.0,1,1,S,2


In [175]:
# Encoding

In [177]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Embarked. drop='first' omits the indicator for the first
# category, which is then represented by all remaining indicators being 0.
encoder = OneHotEncoder(drop='first')
encoded_embarked = encoder.fit_transform(df[['Embarked']])

# Build the indicator frame on df's own index: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign (and introduce NaNs) if
# df's index were ever not the default range.
encoded_df = pd.DataFrame(
    encoded_embarked.toarray(),
    columns=encoder.get_feature_names_out(['Embarked']),
    index=df.index,
).astype(int)

# The original Embarked column is still present in the combined frame.
df_encoded_embarked = pd.concat([df, encoded_df], axis=1)

In [179]:
# Preview the frame with the Embarked indicator columns appended.
df_encoded_embarked.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,Family,Embarked_Q,Embarked_S
0,0,3,male,34.5,0,0,Q,0,1,0
1,1,3,female,47.0,1,0,S,1,0,1
2,0,2,male,62.0,0,0,Q,0,1,0
3,0,3,male,27.0,0,0,S,0,0,1
4,1,3,female,22.0,1,1,S,2,0,1


In [181]:
# Use a dedicated encoder for Sex. Re-fitting the shared `encoder` here would
# overwrite its Embarked categories, leaving nothing that can encode Embarked
# for new rows later.
sex_encoder = OneHotEncoder(drop='first')
sex_matrix = sex_encoder.fit_transform(df_encoded_embarked[['Sex']])

# Keep the source frame's index so that pd.concat(axis=1) lines the rows up by
# label even if the index is not the default range.
encoded_sex = pd.DataFrame(
    sex_matrix.toarray(),
    columns=sex_encoder.get_feature_names_out(['Sex']),
    index=df_encoded_embarked.index,
).astype(int)

df_encoded_sex = pd.concat([df_encoded_embarked, encoded_sex], axis=1)

In [183]:
# Remove the raw categorical columns (now one-hot encoded) together with
# SibSp and Parch (already summed into Family).
redundant_columns = ['Sex', 'Embarked', 'SibSp', 'Parch']
df_encoded_sex = df_encoded_sex.drop(columns=redundant_columns)

In [185]:
# Preview the fully encoded frame.
df_encoded_sex.head()

Unnamed: 0,Survived,Pclass,Age,Family,Embarked_Q,Embarked_S,Sex_male
0,0,3,34.5,0,1,0,1
1,1,3,47.0,1,0,1,0
2,0,2,62.0,0,1,0,1
3,0,3,27.0,0,0,1,1
4,1,3,22.0,2,0,1,0


In [187]:
# Scaling

In [189]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Standardise Age, overwriting the raw values in df_encoded_sex.
# NOTE(review): the scaler is fitted on every row, and the train/test split
# only happens further down, so held-out rows contribute to the scaling
# statistics. Consider fitting on the training rows only.
# NOTE(review): Age still contains missing values here (the null check that
# follows this step still reports them); they are imputed later, on the
# already-scaled column.
df_encoded_sex['Age'] = sc.fit_transform(df_encoded_sex[['Age']])

In [191]:
# Preview the frame after Age has been standardised.
df_encoded_sex.head()

Unnamed: 0,Survived,Pclass,Age,Family,Embarked_Q,Embarked_S,Sex_male
0,0,3,0.298549,0,1,0,1
1,1,3,1.181328,1,0,1,0
2,0,2,2.240662,0,1,0,1
3,0,3,-0.231118,0,0,1,1
4,1,3,-0.584229,2,0,1,0


In [193]:
# Take an independent copy: a plain assignment would only alias the frame, so
# the in-place Age imputation applied to df_new later would also rewrite
# df_encoded_sex.
df_new = df_encoded_sex.copy()

In [194]:
# Preview the modelling frame.
df_new.head()

Unnamed: 0,Survived,Pclass,Age,Family,Embarked_Q,Embarked_S,Sex_male
0,0,3,0.298549,0,1,0,1
1,1,3,1.181328,1,0,1,0
2,0,2,2.240662,0,1,0,1
3,0,3,-0.231118,0,0,1,1
4,1,3,-0.584229,2,0,1,0


In [197]:
# Check which columns still contain missing values before modelling.
df_new.isnull().sum()

Survived       0
Pclass         0
Age           86
Family         0
Embarked_Q     0
Embarked_S     0
Sex_male       0
dtype: int64

In [199]:
from sklearn.impute import SimpleImputer

# Replace missing values with the column mean.
si = SimpleImputer(strategy='mean')

# NOTE(review): Age was standardised in an earlier cell, so the fill value is
# the mean of the scaled column rather than a mean age in years.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so held-out rows influence the fill value. Consider fitting on training rows only.
df_new['Age'] = si.fit_transform(df_new[['Age']])

In [201]:
# Confirm that no missing values remain after imputation.
df_new.isnull().sum()

Survived      0
Pclass        0
Age           0
Family        0
Embarked_Q    0
Embarked_S    0
Sex_male      0
dtype: int64

In [203]:
from sklearn.model_selection import train_test_split

# Select the target by name rather than by position, so the split stays
# correct even if the column order of df_new changes.
X = df_new.drop(columns='Survived')
y = df_new['Survived']

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Model Training

### 1. Logistic Regression

In [207]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training rows
# (fit returns the estimator itself), then predict the held-out rows.
lrg = LogisticRegression().fit(X_train, y_train)
y_pred_lrg = lrg.predict(X_test)

In [208]:
# Show the logistic regression predictions for the held-out rows.
y_pred_lrg

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

### 2. k-Nearest Neighbor

In [212]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbour classifier on the training rows
# (fit returns the estimator itself), then predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [213]:
# Show the k-nearest-neighbour predictions for the held-out rows.
y_pred_knn

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

### 3. Support Vector Machine

In [217]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier on the training rows
# (fit returns the estimator itself), then predict the held-out rows.
sv = SVC(kernel='linear').fit(X_train, y_train)
y_pred_sv = sv.predict(X_test)

In [219]:
# Show the support vector machine predictions for the held-out rows.
y_pred_sv

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

### 4. Random Forest

In [222]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training rows (fit returns the estimator itself),
# then predict the held-out rows. random_state is passed so repeated runs use
# the same seed.
rfc = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

In [223]:
# Show the random forest predictions for the held-out rows.
y_pred_rfc

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

## Model Evaluation

In [225]:
from sklearn.metrics import accuracy_score

# Score every model's predictions against the same held-out labels.
lrg_accuracy, knn_accuracy, sv_accuracy, rfc_accuracy = (
    accuracy_score(y_test, predictions)
    for predictions in (y_pred_lrg, y_pred_knn, y_pred_sv, y_pred_rfc)
)

In [226]:
# Print each model's test accuracy, one model per line.
# NOTE(review): scores of exactly 1.0 are unusual. In every previewed row,
# Survived=1 coincides with Sex_male=0, so it is worth confirming whether the
# label is fully determined by a single feature in this file.
accuracy_report = [
    ("Logistic Regression: ", lrg_accuracy),
    ("kNN: ", knn_accuracy),
    ("SVM: ", sv_accuracy),
    ("Random Forest: ", rfc_accuracy),
]
for model_label, model_accuracy in accuracy_report:
    print(model_label, model_accuracy)

Logistic Regression:  1.0
kNN:  0.9404761904761905
SVM:  1.0
Random Forest:  1.0


In [228]:
from sklearn.metrics import confusion_matrix

In [229]:
# Confusion matrix of held-out labels vs. logistic regression predictions.
cm_lrg = confusion_matrix(y_test, y_pred_lrg) 
cm_lrg

array([[58,  0],
       [ 0, 26]], dtype=int64)

In [235]:
# Confusion matrix of held-out labels vs. k-nearest-neighbour predictions.
cm_knn = confusion_matrix(y_test, y_pred_knn) 
cm_knn

array([[56,  2],
       [ 3, 23]], dtype=int64)

In [236]:
# Confusion matrix of held-out labels vs. support vector machine predictions.
cm_sv = confusion_matrix(y_test, y_pred_sv) 
cm_sv

array([[58,  0],
       [ 0, 26]], dtype=int64)

In [237]:
# Confusion matrix of held-out labels vs. random forest predictions.
cm_rfc = confusion_matrix(y_test, y_pred_rfc) 
cm_rfc

array([[58,  0],
       [ 0, 26]], dtype=int64)

In [248]:
import pickle

# Persist each fitted model to its own pickle file. The `with` block closes
# every file as soon as its pickle has been written; passing a bare open(...)
# to pickle.dump leaves the handle open until it happens to be garbage-collected.
# NOTE(review): only the estimators are saved. The preprocessing objects fitted
# earlier (encoder, scaler, imputer) are not, so confirm whether they also need
# to be persisted for the models to be usable on new data.
models_to_save = {
    "lrg_model.pkl": lrg,
    "knn_model.pkl": knn,
    "svm_model.pkl": sv,
    "rfc_model.pkl": rfc,
}
for model_path, fitted_model in models_to_save.items():
    with open(model_path, "wb") as model_file:
        pickle.dump(fitted_model, model_file)