# **Naïve Bayes with Titanic dataset.**

# Importing Required Libraries and Loading dataset

In [1]:
import pandas as pd

In [2]:
df= pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


# Dropping Unnecessary Columns

In [3]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


# Separating Target and Input Features

In [4]:
target = df.Survived
# print(target)
inputs = df.drop('Survived', axis='columns')
# print(inputs)

# Creating Dummy Variables for Categorical Data

In [5]:
# dummies =pd.get_dummies(inputs.Sex)
dummies = pd.get_dummies(inputs.Sex, dtype=int)
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


# Concatenating Dummy Variables with Input Features

In [6]:
inputs = pd.concat( [inputs,dummies], axis ='columns' )
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


# Dropping Redundant Columns

In [7]:
inputs.drop( ['Sex','male'], axis='columns',inplace=True)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0


# Checking for Missing Values

In [8]:
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [16]:
# Check for Missing Values Column-Wise
print(inputs.isnull().sum())
# print(df.isnull().sum())


Pclass      0
Age       177
Fare        0
female      0
dtype: int64


In [None]:
inputs.Age[:10]

Unnamed: 0,Age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
5,
6,54.0
7,2.0
8,27.0
9,14.0


# Handling Missing Values in Age Column

In [None]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head(10)

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0
5,3,29.699118,8.4583,0
6,1,54.0,51.8625,0
7,3,2.0,21.075,0
8,3,27.0,11.1333,1
9,2,14.0,30.0708,1


# Splitting Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.2)

In [None]:
X_train.head()

Unnamed: 0,Pclass,Age,Fare,female
839,1,29.699118,29.7,0
226,2,19.0,10.5,0
465,3,38.0,7.05,0
121,3,29.699118,8.05,0
336,1,29.0,66.6,0


# Initializing and Training the Gaussian Naive Bayes Model

In [None]:
from sklearn.naive_bayes import GaussianNB
model= GaussianNB()

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

0.770949720670391

In [None]:
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female
13,3,39.0,31.275,0
378,3,20.0,4.0125,0
590,3,35.0,7.125,0
716,1,38.0,227.525,1
328,3,31.0,20.525,1
141,3,22.0,7.75,1
124,1,54.0,77.2875,0
761,3,41.0,7.125,0
835,1,39.0,83.1583,1
348,3,3.0,15.9,0


In [None]:
y_test[:10]

Unnamed: 0,Survived
13,0
378,0
590,0
716,1
328,1
141,1
124,0
761,0
835,1
348,1


# Making Predictions on Test Data (First 10 Rows)

In [None]:
model.predict(X_test[:10])

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0])

# Displaying Prediction Probabilities

In [None]:
model.predict_proba(X_test[:10])

array([[9.65134204e-01, 3.48657962e-02],
       [9.65633037e-01, 3.43669635e-02],
       [9.65562045e-01, 3.44379554e-02],
       [9.60216061e-01, 3.97839385e-02],
       [4.03306919e-01, 5.96693081e-01],
       [9.60730192e-01, 3.92698079e-02],
       [7.83406503e-01, 2.16593497e-01],
       [9.65183270e-01, 3.48167304e-02],
       [5.38764654e-12, 1.00000000e+00],
       [9.64795632e-01, 3.52043675e-02]])