In [228]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

In [229]:
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
dataset = pd.DataFrame()
dataset = pd.read_csv(url)

In [230]:
# Data Cleaning
dataset.drop(['Name', 'Ticket', 'Cabin'], axis = 1, inplace = True)
dataset.isnull().sum()

dataset["Age"].fillna(dataset["Age"].mean(), inplace=True)
dataset["Embarked"].fillna(dataset["Embarked"].fillna(method ='pad'), inplace=True)

In [231]:
# Data Analysis
nMales = len(dataset[dataset["Sex"] == 'male'])
print("\nNo. of males in Titanic: ", nMales)

nFemales = len(dataset[dataset["Sex"] == 'female'])
print("\nNo. of females in Titanic: ", nFemales)

alive = len(dataset[dataset["Survived"] == 1])
dead = len(dataset[dataset["Survived"] == 0])
print("\nNo. of people alive: ", alive)
print("\nNo. of people dead: ", dead)


No. of males in Titanic:  577

No. of females in Titanic:  314

No. of people alive:  342

No. of people dead:  549


In [232]:
# Gender-wise survival probabilites
surv = dataset.groupby("Sex")["Survived"].sum()
print("\nNo. of males and females survived: ", surv, sep='\n')
print("\nProbabilities of survival for both genders:")
print("Male: ", (surv['male']/alive))
print("Female: ", (surv['female']/alive))


No. of males and females survived: 
Sex
female    233
male      109
Name: Survived, dtype: int64

Probabilities of survival for both genders:
Male:  0.31871345029239767
Female:  0.6812865497076024


In [233]:
# Passenger Class-wise survival probabilities
classes = dataset.groupby("Pclass")["Survived"].sum()
print("Probabilities of aliveness according to different Passenger Classes:")
c1 = classes[1]/alive
c2 = classes[2]/alive
c3 = classes[3]/alive
print("Class 1: ", c1, "\nClass 2: ", c2, "\nClass 3: ", c3)

Probabilities of aliveness according to different Passenger Classes:
Class 1:  0.39766081871345027 
Class 2:  0.2543859649122807 
Class 3:  0.347953216374269


In [236]:
# Survival probabilities based on no. of siblings and spouse
nSibsp = dataset.groupby("SibSp")["Survived"].sum().sort_values( ascending=False)
print("Survival probabilities based on no. of siblins and spouse: ")
print("\nWith 0 Siblings and spouse: ", nSibsp[0]/alive)
print("\nWith 1 Siblings and spouse: ", nSibsp[1]/alive)
print("\nWith 2 Siblings and spouse: ", nSibsp[2]/alive)
print("\nWith 3 Siblings and spouse: ", nSibsp[3]/alive)
print("\nWith 4 Siblings and spouse: ", nSibsp[4]/alive)
print("\nWith more than 4 Siblings and spouse: ", nSibsp[5]/alive)

Survival probabilities based on no. of siblins and spouse: 

With 0 Siblings and spouse:  0.6140350877192983

With 1 Siblings and spouse:  0.32748538011695905

With 2 Siblings and spouse:  0.038011695906432746

With 3 Siblings and spouse:  0.011695906432748537

With 4 Siblings and spouse:  0.008771929824561403

With more than 4 Siblings and spouse:  0.0


In [238]:
# Survival probabilities based on embarking port
embPort = dataset.groupby("Embarked")["Survived"].sum()
print("Survival rate based on embarking port:")
print("\nSouthampton: ", embPort['S']/alive)
print("\nCherbourg: ", embPort['C']/alive)
print("\nQueenstown: ", embPort['Q']/alive)

Survival rate based on embarking port:

Southampton:  0.6345029239766082

Cherbourg:  0.27485380116959063

Queenstown:  0.09064327485380116


In [239]:
# Data Preperation
sex = {'male':1, 'female': 0}
dataset["Sex"] = dataset["Sex"].apply(lambda x:sex[x])

e = {'C':0, 'Q':1 ,'S':2}
dataset["Embarked"] = dataset["Embarked"].apply(lambda x:e[x])
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [240]:
# Features Selection
features = ['Age','Pclass','SibSp','Parch','Fare','Sex','Embarked']

In [241]:
# Scaling the data
from sklearn.preprocessing import MinMaxScaler
X = dataset[features]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
y = dataset['Survived']
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.3)

In [244]:
# Model Selection
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(xTrain, yTrain)
prediction = model.predict(xTest)

# Accuracy Calculation
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(prediction, yTest)
print("Accuracy for Gaussian Naive Bayes model: ", accuracy)

print("\nWith the given dataset and the analysis which I have done, one can say that survival rates are max for a person with following:")
print("Age-Group: 20 - 30")
print("Sex: Female")
print("Passenger Class: 1")
print("No. of siblings and spouses: 0")
print("Embarking Port: Southampton")

Accuracy for Gaussian Naive Bayes model:  0.8208955223880597

With the given dataset and the analysis which I have done, one can say that survival rates are max for a person with following:
Age-Group: 20 - 30
Sex: Female
Passenger Class: 1
No. of siblings and spouses: 0
Embarking Port: Southampton
