#### Unsupervised Machine Learning
* Classification

##### Objective
* Predict the survived persons on the Titanic Ship Wreck

In [139]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [140]:
# read data
df = pd.read_csv("../DATASET/titanic.csv")
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


#### Exploratory Data Analysis

In [141]:
# dimension
df.shape

(891, 12)

In [4]:
# columns
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [142]:
# check for null values
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [143]:
# select the Age column with NaN
df[df.Age.isna()].shape

(177, 12)

In [144]:
# Fill the missing Age column
df.fillna({
    'Age': df.Age.mean()
}, inplace=True)
df.Age.isna().all()

False

In [145]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [146]:
# Check for duplicated rows
df.duplicated(subset=['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp','Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'])

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

In [147]:
# General Statistics
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [148]:
# Select needed columns
new_df = df[['Survived', 'Pclass','Sex', 'Age', 'Fare']]
new_df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.000000,7.2500
1,1,1,female,38.000000,71.2833
2,1,3,female,26.000000,7.9250
3,1,1,female,35.000000,53.1000
4,0,3,male,35.000000,8.0500
...,...,...,...,...,...
886,0,2,male,27.000000,13.0000
887,1,1,female,19.000000,30.0000
888,0,3,female,29.699118,23.4500
889,1,1,male,26.000000,30.0000


In [149]:
# Encode the Sex column
new_df.iloc[:,2] = LabelEncoder().fit_transform(new_df.iloc[:, 2])
new_df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.000000,7.2500
1,1,1,0,38.000000,71.2833
2,1,3,0,26.000000,7.9250
3,1,1,0,35.000000,53.1000
4,0,3,1,35.000000,8.0500
...,...,...,...,...,...
886,0,2,1,27.000000,13.0000
887,1,1,0,19.000000,30.0000
888,0,3,0,29.699118,23.4500
889,1,1,1,26.000000,30.0000


In [150]:
# separate columns to X and Y
x = new_df.iloc[:,1:5]
y = new_df.iloc[:,0]

#### Model Training

In [151]:
# Split Dataset for Training and Testing
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=10)

In [152]:
# Applying scaling to split dataset
x_train = StandardScaler().fit_transform(x_train)
x_test = StandardScaler().fit_transform(x_test)

* LogisticRegression

In [153]:
# Train Model
lr = LogisticRegression()
lr.fit(x_train,y_train)

In [154]:
# Predict with model
y_pred = lr.predict(x_test)

In [156]:
# Compare predicted value to actual value
pred_df = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred
})
pred_df.head()

Unnamed: 0,Actual,Predicted
590,0,0
131,0,0
628,0,0
195,1,1
230,1,1


#### Model Evaluation Metrics

In [157]:
# Print Accuracy Score
print("Accuracy Score \n",accuracy_score(y_test,y_pred))
print("Confusion Matrix \n", confusion_matrix(y_test,y_pred))
print("Classification Report \n", classification_report(y_test,y_pred))

Accuracy Score 
 0.8268156424581006
Confusion Matrix 
 [[102  15]
 [ 16  46]]
Classification Report 
               precision    recall  f1-score   support

           0       0.86      0.87      0.87       117
           1       0.75      0.74      0.75        62

    accuracy                           0.83       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.83      0.83      0.83       179



* Model Enhancement using KFold

In [158]:
# Applying KFold validation to LogisticRegression Model
kf = KFold(n_splits=5)
result = cross_val_score(lr,x,y, cv=kf)
print("Result",result.mean())

Result 0.7889900194589166


* DecisionTreeClassifier

In [160]:
# Apply DecisionTreeClassifier to 

# Instantiate the model
DC = DecisionTreeClassifier()

# train the model
DC.fit(x_train,y_train)

# Predict with the Model
decision_y_pred = DC.predict(x_test)

In [161]:
# Compare Actual value vs predicted value
decision_tree_df = pd.DataFrame({
    "Actual": y_test,
    "Predicted": decision_y_pred
})
decision_tree_df.head()

Unnamed: 0,Actual,Predicted
590,0,0
131,0,0
628,0,0
195,1,1
230,1,1


##### Model Evaluation

In [162]:
# Model Evaluation metrics
print("Accuracy Score \n",accuracy_score(y_test,decision_y_pred))
print("Confusion Matrix \n", confusion_matrix(y_test,decision_y_pred))
print("Classification Report \n", classification_report(y_test,decision_y_pred))

Accuracy Score 
 0.8100558659217877
Confusion Matrix 
 [[101  16]
 [ 18  44]]
Classification Report 
               precision    recall  f1-score   support

           0       0.85      0.86      0.86       117
           1       0.73      0.71      0.72        62

    accuracy                           0.81       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.81      0.81      0.81       179



* Model Enhancement using KFold

In [163]:
# Model Enhancement using KFold
dc_kf = KFold(n_splits=5)
dc_results = cross_val_score(DC,x,y,cv=dc_kf)
print("Results \n {}".format(dc_results.mean()))

Results 
 0.7856568953612454


* KNearestNeigbour Classifier

In [165]:
# Apply KNearestNeigbour Algo

# Instantiate the model
KN = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)

# Train model
KN.fit(x_train,y_train)

# Predict with model
KN_y_predict = KN.predict(x_test)

In [166]:
# Compare Actual Vs Predicted 
kn_df = pd.DataFrame({
    "Actual": y_test,
    "Predicted": KN_y_predict
})
kn_df.head()

Unnamed: 0,Actual,Predicted
590,0,0
131,0,0
628,0,0
195,1,1
230,1,1


##### Evaluation Metrics

In [167]:
# Model Evaluation metrics
print("Accuracy Score \n",accuracy_score(y_test,KN_y_predict))
print("Confusion Matrix \n", confusion_matrix(y_test,KN_y_predict))
print("Classification Report \n", classification_report(y_test,KN_y_predict))

Accuracy Score 
 0.8268156424581006
Confusion Matrix 
 [[103  14]
 [ 17  45]]
Classification Report 
               precision    recall  f1-score   support

           0       0.86      0.88      0.87       117
           1       0.76      0.73      0.74        62

    accuracy                           0.83       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.83      0.83      0.83       179



* Model Enhancement using KFold

In [168]:
kn_cv = KFold(n_splits=5)
kn_results = cross_val_score(KN,x,y,cv=kn_cv)
print("Cross Validation Score {}".format(kn_results.mean()))

Cross Validation Score 0.6914192454962024
