In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [28]:
df = pd.read_csv('titanic-1.csv')
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [29]:
# Checking Missing Values
print(df.isnull().sum())

PassengerId      0
Name             0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Survived         0
dtype: int64


In [30]:
# Fill missing values in 'Age' with the median
df['Age'].fillna(df['Age'].median(), inplace=True)

In [31]:
# Drop rows with missing values in 'Embarked'
df.dropna(subset=['Embarked'], inplace=True)

In [32]:
# Checking Missing Values
print(df.isnull().sum())

PassengerId      0
Name             0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Survived         0
dtype: int64


In [49]:
# One-hot encode categorical features
df = pd.get_dummies(df, columns=['Pclass', 'Sex', 'Embarked'], drop_first=True)

In [33]:
discrete_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
continuous_features = ['Age', 'Fare']

In [34]:
print("Discrete Features:", discrete_features)
print("Continuous Features:", continuous_features)

Discrete Features: ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
Continuous Features: ['Age', 'Fare']


In [35]:
# Calculate the prior probabilities for each class in the 'Survived' column
prior_probabilities = df['Survived'].value_counts(normalize=True)

print("Prior Probabilities for each class in 'Survived':")
print(prior_probabilities)

Prior Probabilities for each class in 'Survived':
Survived
0    0.617548
1    0.382452
Name: proportion, dtype: float64


In [36]:
# Calculate Conditional Probabilities for Discrete Features
conditional_probabilities = {}
for feature in discrete_features:
    contingency_table = pd.crosstab(df[feature], df['Survived'])
    conditional_probabilities[feature] = (contingency_table.div(contingency_table.sum(axis=1), axis=0))

print("\nConditional Probabilities for Discrete Features using crosstab:")
for feature in conditional_probabilities:
    print(f"\n{feature}:")
    print(conditional_probabilities[feature])


Conditional Probabilities for Discrete Features using crosstab:

Pclass:
Survived         0         1
Pclass                      
1         0.373832  0.626168
2         0.527174  0.472826
3         0.757637  0.242363

Sex:
Survived         0         1
Sex                         
female    0.259615  0.740385
male      0.811092  0.188908

SibSp:
Survived         0         1
SibSp                       
0         0.656766  0.343234
1         0.464115  0.535885
2         0.535714  0.464286
3         0.750000  0.250000
4         0.833333  0.166667
5         1.000000  0.000000
8         1.000000  0.000000

Parch:
Survived         0         1
Parch                       
0         0.658284  0.341716
1         0.449153  0.550847
2         0.500000  0.500000
3         0.400000  0.600000
4         1.000000  0.000000
5         0.800000  0.200000
6         1.000000  0.000000

Embarked:
Survived         0         1
Embarked                    
C         0.446429  0.553571
Q         0.610390  0.3

In [47]:
# Calculate mean and variance for continuous features
mean_variance = {}
for feature in continuous_features:
    mean_variance[feature] = df.groupby('Survived')[feature].agg(['mean', 'var'])

# Print mean and variance
print("Mean and Variance for Continuous Features:")
for feature, stats in mean_variance.items():
    mean_value = stats['mean'].iloc[0]  # Access the first value
    variance_value = stats['var'].iloc[0]  # Access the first value
    print(f"{feature} - Mean: {mean_value:.2f}, Variance: {variance_value:.2f}")

Mean and Variance for Continuous Features:
Age - Mean: 30.03, Variance: 156.25
Fare - Mean: 22.12, Variance: 985.22
