# **Titanic Survival Prediction using Naive Bayes**

## **Importing Necessary Libraries**

In [1]:
import pandas as pd #to load dataset
import numpy as np #to perform array operations

## **Choose Dataset from Local Directory**

In [2]:
# Colab-only step: `files.upload()` prompts for a file from the local machine
# and saves it into the runtime's working directory (see the "Saving ..." line
# in the cell output), where the next cell reads it by name.
from google.colab import files
uploaded = files.upload()

Saving titanicsurvival.csv to titanicsurvival.csv


## **Load Dataset**

In [3]:
# Read the uploaded CSV from the working directory into a DataFrame.
dataset = pd.read_csv('titanicsurvival.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
dataset

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.2500,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.9250,1
3,1,female,35.0,53.1000,1
4,3,male,35.0,8.0500,0
...,...,...,...,...,...
886,2,male,27.0,13.0000,0
887,1,female,19.0,30.0000,1
888,3,female,,23.4500,0
889,1,male,26.0,30.0000,1


## **Dataset Summarization**

In [4]:
# Report the (rows, columns) shape, then preview the first five rows.
print(dataset.shape, dataset.head(), sep='\n')

(891, 5)
   Pclass     Sex   Age     Fare  Survived
0       3    male  22.0   7.2500         0
1       1  female  38.0  71.2833         1
2       3  female  26.0   7.9250         1
3       1  female  35.0  53.1000         1
4       3    male  35.0   8.0500         0


## **Mapping Text Data to Binary Values**

In [5]:
# Encode the categorical 'Sex' column as integers so the classifiers can use it.
sex_codes = {'female': 0, 'male': 1}

# Encode only when the column does not already hold the integer codes, so that
# re-running this cell does not map the existing 0/1 values to NaN and then
# fail inside astype(int).
if not dataset['Sex'].isin(list(sex_codes.values())).all():
  sex_encoded = dataset['Sex'].map(sex_codes)
  # map() yields NaN for any label missing from sex_codes; name the offending
  # labels explicitly instead of failing with an opaque integer-cast error.
  if sex_encoded.isna().any():
    unknown = sorted(set(dataset.loc[sex_encoded.isna(), 'Sex'].astype(str)))
    raise ValueError("Unexpected values in 'Sex' column: {0}".format(unknown))
  dataset['Sex'] = sex_encoded.astype(int)

print(dataset.head(5))

   Pclass  Sex   Age     Fare  Survived
0       3    1  22.0   7.2500         0
1       1    0  38.0  71.2833         1
2       3    0  26.0   7.9250         1
3       1    0  35.0  53.1000         1
4       3    1  35.0   8.0500         0


## **Segregate dataset into input and output**

In [6]:
# Inputs: every column except the label.
x = dataset.drop(columns=['Survived'])
print(x)
# Output: the survival label column.
y = dataset['Survived']
print(y)

     Pclass  Sex   Age     Fare
0         3    1  22.0   7.2500
1         1    0  38.0  71.2833
2         3    0  26.0   7.9250
3         1    0  35.0  53.1000
4         3    1  35.0   8.0500
..      ...  ...   ...      ...
886       2    1  27.0  13.0000
887       1    0  19.0  30.0000
888       3    0   NaN  23.4500
889       1    1  26.0  30.0000
890       3    1  32.0   7.7500

[891 rows x 4 columns]
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


## **Finding and Replacing Empty Values**

In [7]:
# Names of the feature columns that contain at least one missing value.
x.loc[:, x.isna().any()].columns

Index(['Age'], dtype='object')

In [8]:
# Replace missing ages with the column mean.
# NOTE(review): the mean is taken over all rows, before the train/test split
# further down, so held-out rows contribute to the fill value — confirm this
# is acceptable for the reported accuracy.
x['Age'] = x['Age'].fillna(x['Age'].mean())

## **Again Check for Empty Values**

In [9]:
# Sanity check after imputation: this should now list no columns.
x.loc[:, x.isna().any()].columns

Index([], dtype='object')

## **Splitting Dataset for Training and Testing**

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

## **Model Training**

In [11]:
# Fit a Gaussian Naive Bayes classifier on the training split.
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
# fit() is left as the last expression, so its return value is this cell's output.
model.fit(x_train, y_train)

## **Prediction for All Test Data**

In [12]:
# Predict a survival label for every row of the held-out test features.
y_pred = model.predict(x_test)

## **Model Evaluation**

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true labels (y_test) against predictions (y_pred).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy on the test split, shown as a percentage.
print(f"Accuracy of the Model : {accuracy_score(y_test, y_pred) * 100}")

[[110  29]
 [ 21  63]]
Accuracy of the Model : 77.57847533632287


## **Manual Testing of Model**

In [14]:
# Collect one passenger's features interactively. Age and fare are parsed as
# floats because both columns are floating point in the dataset (e.g. a fare
# of 7.25), which int() would reject with a ValueError.
pclassNo = int(input("Enter Person's Pclass no.: "))
sex = int(input("Enter Person's Sex (female-0 , male-1) : "))
age = float(input("Enter Person's Age : "))
fare = float(input("Enter Person's Fare : "))

# Build a one-row frame that reuses the feature column names and order from
# `x`, so the model receives its input in the same layout it was fitted on
# rather than an unlabeled nested list.
person = pd.DataFrame([[pclassNo, sex, age, fare]], columns=x.columns)
result = model.predict(person)
print(result)

# predict() returns one label per input row; check the single element instead
# of relying on the truthiness of an array comparison.
if result[0] == 1:
  print("Person Survived")
else:
  print("Person Died")



Enter Person's Pclass no.: 2
Enter Person's Sex (female-0 , male-1) : 1
Enter Person's Age : 45
Enter Person's Fare : 30000
[1]
Person Survived




## **Evaluating Several ML Algorithms by Their Model Accuracy Scores**

In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


In [16]:
# Candidate classifiers as (short label, unfitted estimator) pairs, in the
# order they are evaluated and plotted by the next cell.
# NOTE(review): `multi_class` may be deprecated in newer scikit-learn
# releases — confirm against the installed version.
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('KNN', KNeighborsClassifier()),
    ('SVM', svm.SVC(gamma='auto')),
    ('NB', GaussianNB()),
    ('CART', DecisionTreeClassifier()),
]

print(models)

[('LDA', LinearDiscriminantAnalysis()), ('LR', LogisticRegression(multi_class='ovr', solver='liblinear')), ('KNN', KNeighborsClassifier()), ('SVM', SVC(gamma='auto')), ('NB', GaussianNB()), ('CART', DecisionTreeClassifier())]


In [None]:
import matplotlib.pyplot as plt

results = []  # per-fold accuracy arrays, one entry per algorithm
names = []    # algorithm labels, in the same order as `res`
res = []      # mean cross-validated accuracy per algorithm

# The splitter takes no per-model arguments, so build it once and reuse it.
skfold = StratifiedKFold(n_splits=10)

# The loop variable is named `estimator` so it does not overwrite the fitted
# `model` from the training cell, which the manual-testing cell relies on.
for name, estimator in models:
  cv_results = cross_val_score(estimator, x, y, cv=skfold, scoring='accuracy')
  results.append(cv_results)
  names.append(name)  # the label itself, not the `names` list
  res.append(cv_results.mean())
  print('%s : %f' % (name, cv_results.mean()))

# Accuracy lies in [0, 1]. The scores printed above are roughly 0.69-0.79, so
# a 0.900-0.999 window would leave every bar outside the visible area.
plt.ylim(0.0, 1.0)
plt.bar(names, res, color='pink', width=0.6)

plt.title('Algorithm Comparison')
plt.ylabel('Mean 10-fold CV accuracy')
plt.show()

LDA : 0.786742
LR : 0.792347
KNN : 0.708277
SVM : 0.688065
NB : 0.776667
CART : 0.778926
