In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import style
style.use('dark_background')
import pandas as pd

In [None]:
# Mount Google Drive into the Colab filesystem so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the course folder on the mounted Drive so that relative paths used below resolve there.
%cd /content/drive/MyDrive/Course Material/Supervised Machine Learning/Bayes Theorem and Naive Bayes Classifier

/content/drive/MyDrive/Course Material/Supervised Machine Learning/Bayes Theorem and Naive Bayes Classifier


# Importing Data

In [None]:
# Load the Titanic training set; the relative path resolves against the current working directory.
train = pd.read_csv('titanic_train.csv')
# Display (rows, columns) as a quick sanity check on the load.
train.shape

(891, 12)

In [None]:
# Preview the first five rows to see the raw columns and how their values are formatted.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Embarked indicates the port where the passenger boarded. There are three possible values:
### Southampton (S), Cherbourg (C), and Queenstown (Q).

### SibSp is the number of siblings or spouses the passenger had aboard.

### Parch is the number of parents or children each passenger was travelling with.

# Check the number of passengers who died.

In [None]:
train['Survived'].value_counts()  # 0 = died, 1 = survived; roughly 60% of passengers (549 of 891) died

0    549
1    342
Name: Survived, dtype: int64

# Dropping Irrelevant Features

In [None]:
# These identifier / free-text columns are treated as unimportant for the model, so discard them.
train = train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Checking For Null Values

In [None]:
# Per-column flag: True when the column contains at least one missing value.
train.isna().any()

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Embarked     True
dtype: bool

# Replacing Null Values in "Age" with mean value of "Age"

In [None]:
# Fill missing ages with the mean of the observed ages; only the "Age" column is imputed.
train = train.fillna({"Age": train["Age"].mean()})

In [None]:
# Re-check missing values after imputing "Age".
train.isna().any()

Survived    False
Pclass      False
Sex         False
Age         False
SibSp       False
Parch       False
Fare        False
Embarked     True
dtype: bool

# Drop all the rows containing Null Values

In [None]:
# Remove every row that still has a missing value in any column.
train = train.dropna(axis=0, how='any')

In [None]:
# Confirm that no column contains missing values any more.
train.isna().any()

Survived    False
Pclass      False
Sex         False
Age         False
SibSp       False
Parch       False
Fare        False
Embarked    False
dtype: bool

In [None]:
# Preview the cleaned frame before encoding the categorical columns.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Creating Dummy Variables and dropping first column to avoid multicollinearity

In [None]:
# One-hot encode the two categorical columns.
# drop_first=True removes the first level of each variable ("female" and "C"),
# leaving the indicator columns "male", "Q" and "S".
# dtype=int pins the indicators to 0/1 integers: newer pandas versions default
# to bool, which would otherwise turn the mixed-dtype feature matrix into an
# object array once it is converted to NumPy.
Sex = pd.get_dummies(train['Sex'], drop_first=True, dtype=int)
Embark = pd.get_dummies(train['Embarked'], drop_first=True, dtype=int)

# Adding dummy variables to original data

In [None]:
# Append the indicator columns side by side with the existing features (rows are aligned on the index).
train = pd.concat((train, Sex, Embark), axis="columns")
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,male,Q,S
0,0,3,male,22.0,1,0,7.25,S,1,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0
2,1,3,female,26.0,0,0,7.925,S,0,0,1
3,1,1,female,35.0,1,0,53.1,S,0,0,1
4,0,3,male,35.0,0,0,8.05,S,1,0,1


# Removing Sex and Embarked because their dummy variables are created

In [None]:
# The original categorical columns are now redundant with their indicator columns.
train = train.drop(columns=['Sex', 'Embarked'])
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


# Extracting features and labels

In [None]:
# Feature matrix: every column except the target.
# An explicit float dtype guarantees a numeric ndarray even when the frame mixes
# int, float and bool columns (a plain .values call can produce an object array then).
X = train.drop(columns=['Survived']).to_numpy(dtype=float)

In [None]:
# Target vector: 0 = died, 1 = survived.
y = train['Survived'].to_numpy()

# Performing a train test split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Display (rows, features) of the held-out set.
X_test.shape

(178, 8)

# Standardizing the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test set does not influence the fit.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Applying Logistic Regression

In [None]:
# Fit a logistic regression with default settings on the standardized training data.
model = LogisticRegression()
model.fit(X_train, y_train)
# fit() returns the estimator itself, so LR and model refer to the same fitted object.
LR = model

In [None]:
# Predicted class labels from logistic regression for the held-out set.
y_pred_LR = LR.predict(X_test)

# Calculating Accuracy

In [None]:
# Share of test rows whose predicted label matches the true label, as a percentage.
Accuracy = 100 * (y_pred_LR == y_test).mean()
print("Accuracy : %2.2f" % Accuracy)

Accuracy : 77.53


# Applying the Naive Bayes Classifier

In [None]:
# Gaussian Naive Bayes trained on the same standardized features as the logistic regression.
NB = GaussianNB()
NB.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels from Naive Bayes for the held-out set.
y_pred_NB = NB.predict(X_test)

In [None]:
# Percentage of test rows that Naive Bayes classifies correctly.
Accuracy = 100 * (y_pred_NB == y_test).mean()
print("Accuracy : %2.2f" % Accuracy)

Accuracy : 77.53


# Predict Class probabilities

In [None]:
# Per-class probabilities from logistic regression, rounded to two decimals for display.
y_pred_prob_LR = np.round(LR.predict_proba(X_test), 2)

In [None]:
# Per-class probabilities from Naive Bayes, rounded to two decimals for display.
y_pred_prob_NB = np.round(NB.predict_proba(X_test), 2)

In [None]:
# Side-by-side comparison of both models on the test rows:
# class probabilities (column 0 = class 0, column 1 = class 1) and the predicted label.
df = pd.DataFrame(
    {
        "NB_class_0": y_pred_prob_NB[:, 0],
        "NB_class_1": y_pred_prob_NB[:, 1],
        "NB Prediction": y_pred_NB,
        "LR_class_0": y_pred_prob_LR[:, 0],
        "LR_class_1": y_pred_prob_LR[:, 1],
        "LR_Prediction": y_pred_LR,
    }
)
df.head(40)

Unnamed: 0,NB_class_0,NB_class_1,NB Prediction,LR_class_0,LR_class_1,LR_Prediction
0,0.95,0.05,0,0.9,0.1,0
1,0.0,1.0,1,0.04,0.96,1
2,0.12,0.88,1,0.22,0.78,1
3,0.9,0.1,0,0.76,0.24,0
4,0.01,0.99,1,0.04,0.96,1
5,0.95,0.05,0,0.9,0.1,0
6,0.89,0.11,0,0.72,0.28,0
7,0.94,0.06,0,0.87,0.13,0
8,0.0,1.0,1,0.07,0.93,1
9,0.0,1.0,1,0.03,0.97,1


# Applying PCA to create uncorrelated features

In [None]:
from sklearn.decomposition import PCA
# Project the standardized features onto two principal components.
pca = PCA(n_components = 2)
# The projection is fitted on the training split only and then reused for the test split.
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

In [None]:
# Display (rows, components) of the projected test set.
X_pca_test.shape

(178, 2)

# Check the correlation between the features

In [None]:
# Correlation matrix of the first two principal components on the training split.
# np.corrcoef treats each row as a variable, hence the transpose of the two selected columns.
cor = np.round(np.corrcoef(X_pca_train[:, [0, 1]].T), 2)
cor

array([[ 1., -0.],
       [-0.,  1.]])

# Applying Logistic Regression on uncorrelated data

In [None]:
# Logistic regression trained on the two PCA components instead of the original features.
LRPCA = LogisticRegression()
LRPCA.fit(X=X_pca_train, y=y_train)

In [None]:
# Predicted class labels from the PCA-based logistic regression for the projected test set.
y_pred_pca = LRPCA.predict(X_pca_test)

In [None]:
# Percentage of test rows that the PCA-based logistic regression classifies correctly.
Accuracy = 100 * (y_pred_pca == y_test).mean()
print("Accuracy : %2.2f" % Accuracy)

Accuracy : 70.22


# Applying Naive Bayes on uncorrelated data

In [None]:
# Gaussian Naive Bayes trained on the two PCA components.
NB_model_PCA = GaussianNB()
NB_model_PCA.fit(X=X_pca_train, y=y_train)

In [None]:
# Predicted class labels from the PCA-based Naive Bayes model for the projected test set.
y_pred_NBPCA = NB_model_PCA.predict(X_pca_test)

In [None]:
# Percentage of test rows that the PCA-based Naive Bayes model classifies correctly.
Accuracy = 100 * (y_pred_NBPCA == y_test).mean()
print("Accuracy : %2.2f" % Accuracy)

Accuracy : 72.47
