In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
#Load data
df = pd.read_csv('classification.csv')

In [3]:
#Encode

df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df['Embarked'] =  LabelEncoder().fit_transform(df['Embarked'])

In [4]:
#Useless columns

df.drop(['Name'], axis=1, inplace=True)
df.drop(['Ticket'], axis=1, inplace=True)
df.drop(['PassengerId'], axis=1, inplace=True)

In [5]:
for col in df.columns:
  median = df[col].median()
  df[col] = df[col].fillna(median)

In [6]:
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survival
0,3.0,1,22.0,1,0,7.2500,2,0
1,1.0,0,38.0,1,0,71.2833,0,1
2,3.0,0,26.0,0,0,7.9250,2,1
3,1.0,0,35.0,1,0,53.1000,2,1
4,3.0,1,35.0,0,0,8.0500,2,0
...,...,...,...,...,...,...,...,...
885,2.0,1,27.0,0,0,13.0000,2,0
886,1.0,0,19.0,0,0,30.0000,2,1
887,3.0,0,28.0,1,2,23.4500,2,0
888,1.0,1,26.0,0,0,30.0000,0,1


In [7]:
Y = df['Survival']
X = df.drop('Survival', axis=1)

In [8]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=240, shuffle=True)

In [10]:
lregression = LogisticRegression()
rforest = RandomForestClassifier()
svm = SVC()

In [17]:
lregression.fit(X_train, y_train)
y_pred = lregression.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83       148
           1       0.72      0.76      0.74        92

    accuracy                           0.80       240
   macro avg       0.78      0.79      0.79       240
weighted avg       0.80      0.80      0.80       240



In [18]:
rforest.fit(X_train, y_train)
y_pred = rforest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.82      0.85       148
           1       0.74      0.80      0.77        92

    accuracy                           0.82       240
   macro avg       0.81      0.81      0.81       240
weighted avg       0.82      0.82      0.82       240



In [19]:
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.92      0.86       148
           1       0.84      0.66      0.74        92

    accuracy                           0.82       240
   macro avg       0.82      0.79      0.80       240
weighted avg       0.82      0.82      0.82       240



In [14]:
p = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

search = GridSearchCV(rforest, param_grid = p, verbose=0, cv=5)
search.fit(X_train, y_train)

In [15]:
y_pred = rforest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.80      0.83       148
           1       0.71      0.82      0.76        92

    accuracy                           0.80       240
   macro avg       0.79      0.81      0.80       240
weighted avg       0.81      0.80      0.81       240

