## Reading the CSV file

In [14]:
import pandas as pd
# Load the Titanic passenger records from the CSV file (the path is relative to the working directory).
df = pd.read_csv('titanic.csv')
# Preview the first five rows.
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


# Data Selection
## Dropping unnecessary attributes
    > Removing attributes the model does not need helps model precision
    > axis => 0: row, 1: column
    > inplace => If False, return a copy. Otherwise, do operation inplace and return None.

In [15]:
# Drop the attributes that are not used as model inputs.
# `df` is re-bound to the copy returned by drop() instead of passing inplace=True,
# so the frame object displayed by the previous cell is not mutated behind the reader's back.
df = df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


# Data Classification
## Labels => (Values to be predicted)
## Features => (Values given as input)

In [16]:
# Target column the model should learn to predict.
labels = df['Survived']
# Every remaining column serves as a model input.
features = df.drop(columns=['Survived'])

# Data Correction
## Converting the 'Sex' Column into dummies and then appending it to the main df to have only int/float columns and no str columns

In [17]:
# One-hot encode 'Sex'. dtype=int keeps the indicator columns as 0/1 integers;
# newer pandas releases default to bool, which would defeat the goal of ending up
# with only int/float columns.
dummies = pd.get_dummies(features.Sex, dtype=int)
# Swap the string 'Sex' column for its indicator columns (appended on the right).
features = pd.concat([features.drop('Sex', axis=1), dummies], axis=1)
features.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


# Checking for NaN objects
`Index(['Age'], dtype='object')` shows that the `Age` column has some NaN values. Let's replace them with the column mean.

In [18]:
# List the columns that contain at least one NaN. The result is printed explicitly
# because only the last expression of a cell is displayed automatically.
nan_columns = features.columns[features.isna().any()]
print(nan_columns)
# Replace missing ages with the mean age.
# NOTE(review): the mean is taken over all rows, i.e. before the train/test split,
# so test rows influence the imputed value; consider imputing after the split.
features['Age'] = features['Age'].fillna(features['Age'].mean())
features.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


# Converting dataframe to array using Numpy
## Useful while cross-checking values after prediction

In [19]:
import numpy as np
# Convert both the feature frame and the label series to plain NumPy arrays.
features, labels = (np.array(part) for part in (features, labels))

# Splitting data into training and testing
## test_size=0.2: holds out 20% of the data as test data

In [20]:
from sklearn.model_selection import train_test_split
# Fix the shuffle seed so the split, and therefore the score computed from it,
# is the same on every Restart & Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=RANDOM_STATE
)

# Creating Model
## Class used -> sklearn.naive_bayes.GaussianNB

In [21]:
from sklearn.naive_bayes import GaussianNB
# Create a Gaussian Naive Bayes classifier using the constructor's default arguments.
model = GaussianNB()

In [22]:
# Fit the classifier on the training split.
model.fit(X_train, y_train)

GaussianNB()

In [23]:
model.score(X_test, y_test) # Accuracy of the model on the held-out test split

0.7541899441340782

# Predicting the labels using `.predict()` method
    > Prediction => value predicted by the model
    > Actual => actual values from the y_test array
    > Equal => Checks if prediction is correct. (True == correct prediction | False == wrong prediction)

In [24]:
predictions = model.predict(X_test)
# Walk predicted and true labels side by side, reporting whether each pair matches.
for predicted, actual in zip(predictions, y_test):
    print(f"Prediction: {predicted} Actual: {actual} Equal: {predicted == actual}")


Prediction: 1 Actual: 1 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 1 Equal: False
Prediction: 1 Actual: 0 Equal: False
Prediction: 1 Actual: 0 Equal: False
Prediction: 1 Actual: 1 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 1 Actual: 1 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 1 Actual: 1 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 1 Actual: 1 Equal: True
Prediction: 1 Actual: 1 Equal: True
Prediction: 1 Actual: 0 Equal: False
Prediction: 0 Actual: 0 Equal: True
Prediction: 1 Actual: 0 Equal: False
Prediction: 1 Actual: 1 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 0 Actual: 0 Equal: True
Prediction: 1 Actual: 0 Equal: False
Prediction: 1 Actual: 