<a href="https://colab.research.google.com/github/RifatMuhtasim/Machine_Learning/blob/main/Naive_Bayes/Titanic_dataset_with_GaussianNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

## Data Import and Load

In [18]:
df = pd.read_csv("https://raw.githubusercontent.com/codebasics/py/master/ML/14_naive_bayes/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [19]:
# Select the Target Variable

target = df['Survived']
display(target.head())

print("NAN:", target[target.isna() == True])

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

NAN: Series([], Name: Survived, dtype: int64)


In [20]:
# Correlation
df.corr()

  df.corr()


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived
PassengerId,1.0,-0.035144,0.036847,-0.057527,-0.001652,0.012658,-0.005007
Pclass,-0.035144,1.0,-0.369226,0.083081,0.018443,-0.5495,-0.338481
Age,0.036847,-0.369226,1.0,-0.308247,-0.189119,0.096067,-0.077221
SibSp,-0.057527,0.083081,-0.308247,1.0,0.414838,0.159651,-0.035322
Parch,-0.001652,0.018443,-0.189119,0.414838,1.0,0.216225,0.081629
Fare,0.012658,-0.5495,0.096067,0.159651,0.216225,1.0,0.257307
Survived,-0.005007,-0.338481,-0.077221,-0.035322,0.081629,0.257307,1.0


In [21]:
inputs = df[['Pclass', 'Sex', 'Age', 'Parch', 'Fare']]
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Parch,Fare
0,3,male,22.0,0,7.25
1,1,female,38.0,0,71.2833
2,3,female,26.0,0,7.925
3,1,female,35.0,0,53.1
4,3,male,35.0,0,8.05


## Handle the Missing Value

In [22]:
# Check the NAN Value
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [23]:
inputs[inputs['Age'].isna() == True]

Unnamed: 0,Pclass,Sex,Age,Parch,Fare
5,3,male,,0,8.4583
17,2,male,,0,13.0000
19,3,female,,0,7.2250
26,3,male,,0,7.2250
28,3,female,,0,7.8792
...,...,...,...,...,...
859,3,male,,0,7.2292
863,3,female,,2,69.5500
868,3,male,,0,9.5000
878,3,male,,0,7.8958


In [24]:
inputs['Age'] = inputs['Age'].fillna(inputs['Age'].mean())
print("NAN Value:", inputs[inputs['Age'].isna() == True])

inputs.head(6)

NAN Value: Empty DataFrame
Columns: [Pclass, Sex, Age, Parch, Fare]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputs['Age'] = inputs['Age'].fillna(inputs['Age'].mean())


Unnamed: 0,Pclass,Sex,Age,Parch,Fare
0,3,male,22.0,0,7.25
1,1,female,38.0,0,71.2833
2,3,female,26.0,0,7.925
3,1,female,35.0,0,53.1
4,3,male,35.0,0,8.05
5,3,male,29.699118,0,8.4583


## Create Sex Dummay Variable

In [25]:
dummies = pd.get_dummies(inputs['Sex'])
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [26]:
labels = pd.concat([inputs, dummies], axis="columns")
labels.head()

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,female,male
0,3,male,22.0,0,7.25,0,1
1,1,female,38.0,0,71.2833,1,0
2,3,female,26.0,0,7.925,1,0
3,1,female,35.0,0,53.1,1,0
4,3,male,35.0,0,8.05,0,1


In [27]:
labels.drop(['Sex'], axis="columns", inplace=True)
labels.columns

Index(['Pclass', 'Age', 'Parch', 'Fare', 'female', 'male'], dtype='object')

## GaussianNB Model

In [28]:
X_train, X_test, y_train, y_test = train_test_split(labels, target, test_size = 0.25)

print("Train Dataset Length: ", len(X_train))
print("Test Dataset Length: ", len(X_test))

Train Dataset Length:  668
Test Dataset Length:  223


In [29]:
# Naive Bayes Model
model = GaussianNB()
model.fit(X_train, y_train)

In [30]:
# Accuracy
model.score(X_test, y_test)

0.7892376681614349

In [31]:
# Predict
model.predict(X_test[: 10])

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0])

In [32]:
# Predict Probability
model.predict_proba(X_test[: 10])

array([[9.88532647e-01, 1.14673529e-02],
       [9.84979034e-01, 1.50209663e-02],
       [9.87560127e-01, 1.24398726e-02],
       [9.89791622e-01, 1.02083782e-02],
       [6.65178697e-01, 3.34821303e-01],
       [9.89407046e-01, 1.05929539e-02],
       [9.77772868e-01, 2.22271318e-02],
       [7.85996906e-04, 9.99214003e-01],
       [5.03870107e-03, 9.94961299e-01],
       [9.76642847e-01, 2.33571527e-02]])