In [1]:
'''
We represent each sample used for generating the decision tree as a Training Instance

Each instance has the corresponding features: Pclass, Sex, Age, Siblings/Spouses Aboard, Parents/Children Aboard,
     Fare, and Survived
     
'''
class TrainingInstance:
    

    #Input: Pclass, Sex (gender), Age, Siblings/Spouses Aboard, Parents/Children Aboard, Fare, and Survived
    def __init__(self, p_class, sex, age, siblings_Spouses, parents_Children, fare, survived):
        self.p_class = p_class
        self.sex = sex
        self.age = age
        self.siblings_Spouses = siblings_Spouses
        self.parents_Children = parents_Children
        self.fare = fare
        self.survived = survived

    
    def __str__(self):
        return "p_class: {}, sex: {}, age: {}, sibling_spouses: {}, parents_children: {}, fare: {}, survived: {}".format(self.p_class, self.sex, self.age, self.siblings_Spouses, self.parents_Children, self.fare, self.survived)
    

In [2]:
'''
For a feature vector that we want a prediction for using the decision tree, we will represent it as a Test Instance
The difference from training instance is that we don't have the survived parameter in this case
'''
class TestInstance:
    
    #Input: Pclass, Sex (gender), Age, Siblings/Spouses Aboard, Parents/Children Aboard, Fare
     def __init__(self, p_class, sex, age, siblings_Spouses, parents_Children, fare):
        self.p_class = p_class
        self.sex = sex
        self.age = age
        self.siblings_Spouses = siblings_Spouses
        self.parents_Children = parents_Children
        self.fare = fare
    
     def __str__(self):
        return "p_class: {}, sex: {}, age: {}, sibling_spouses: {}, parents_children: {}, fare: {}".format(self.p_class, self.sex, self.age, self.siblings_Spouses, self.parents_Children, self.fare)
    

In [3]:
'''
compute_discrete_conditional is a function that will be used to compute conditional probability for
       discrete variables

Input: 1) instances - a list containing the training instances

       2) attribute - the attribute for which we want to compute the conditional probability
       
       3) attribute_value - the attribute value (1 or 0)

Output: a list containing:

1) the number of people that survived with the corresponding attribute value

2) the number of people died with the corresponding attribute value
'''
def compute_discrete_conditional(instances, attribute, attribute_value):
    
    num_survived_attribute = 0
    num_deceased_attribute = 0
    
    
    if(attribute == 'p_class'):
        
        for elem in instances:
            
            if(elem.p_class == attribute_value):
                if(elem.survived == 1):
                    num_survived_attribute += 1
                else:
                    num_deceased_attribute += 1
    
    elif(attribute == 'sex'):
        
         for elem in instances:
            
            if(elem.sex == attribute_value):
                if(elem.survived == 1):
                    num_survived_attribute += 1
                else:
                    num_deceased_attribute += 1
    
    elif(attribute == 'siblings_Spouses'):
        
        for elem in instances:
            
            if(elem.siblings_Spouses == attribute_value):
                if(elem.survived == 1):
                    num_survived_attribute += 1
                else:
                    num_deceased_attribute += 1
    
    elif(attribute == 'parents_Children'):
        
        for elem in instances:
            
            if(elem.parents_Children == attribute_value):
                if(elem.survived == 1):
                    num_survived_attribute += 1
                else:
                    num_deceased_attribute += 1
                

    
    ret_list = [num_survived_attribute, num_deceased_attribute]
    
    return ret_list

In [4]:
'''
compute_mean_estimate is a function that helps estimate the mean for features that
are modelled using gaussian

Input: 1) instances: a list containing the training instances

       2) attribute: the attribute for which we want to compute the mean

Output: a list containing:

1) the sum of attribute values for people that have survived

2) the sum of attribute values for people that have died
'''
def compute_mean_estimate(instances, attribute):
    
    attribute_total_survived = 0
    attribute_total_deceased = 0
    
    
    if(attribute == 'age'):
        
        for elem in instances:
            
            if(elem.survived == 1):
                attribute_total_survived += elem.age
            else:
                attribute_total_deceased += elem.age
    
  
    elif(attribute == 'fare'):
        
        for elem in instances:
            
            if(elem.survived == 1):
                attribute_total_survived += elem.fare
            else:
                attribute_total_deceased += elem.fare
    
    
    
    ret_list = [attribute_total_survived, attribute_total_deceased]
    
    return ret_list

In [5]:
'''
compute_variance_estimate is a function that helps estimate the variance for features that are 
modelled using gaussian

Input: 1) instances: a list of the training instances

       2) attribute: the attribute for which we want to compute the variance
       
       3) mean_estimate_survived: the attribute's mean_estimate for the people that have survived
       
       4) mean_estimate_deceased: the attribute's mean_estimate for the people that have died

Output: a list containing:

1) the sum of (xi - mean_estimate_survived)^2 for the people that have survived with respect to the attribute

2) the sum of (xi - mean_estimate_deceased)^2 for the people that have died with respect to the attribute
'''
def compute_variance_estimate(instances, attribute, mean_estimate_survived, mean_estimate_deceased):
    
    variance_total_survived = 0
    variance_total_deceased = 0
    
    if(attribute == 'age'):
        
        for elem in instances:
            
            if(elem.survived == 1):
                variance_total_survived += pow(abs(elem.age - mean_estimate_survived), 2)
            else:
                variance_total_deceased += pow(abs(elem.age - mean_estimate_deceased), 2)
    
    
    elif(attribute == 'fare'):
        
        for elem in instances:
            
            if(elem.survived == 1):
                variance_total_survived += pow(abs(elem.fare - mean_estimate_survived),2)
            else:
                variance_total_deceased += pow(abs(elem.fare - mean_estimate_deceased),2)
    
    
    
    ret_list = [variance_total_survived, variance_total_deceased]
    
    return ret_list
    

In [6]:
'''
prediction is a function to make a prediction for a test_instance

Input: 1) test_instance: the test instance for which we are computing the prediction

       2) y_params: the parameters for y, which includes P(y = survived) and P(y = deceased)
       
       3) p_class_params: the parameters for p_class, which includes P(p_class = 1 | survived), P(p_class = 1 | deceased),
              P(p_class = 2 | survived), P(p_class = 2 | deceased), P(p_class = 3 | survived) and P(p_class = 3 | deceased)
        
       4) sex_params: the paramaters for sex, which includes P(sex = male | survived), P(sex = male | deceased), 
                      P(sex = female, | survived), and P(sex = female | deceased)
    
       5) age_params: the parameters for age, which includes mean_age_survived, mean_age_deceased, variance_age_survived, and
                      variance_age_deceased
       
       6) siblings_spouses_params: the paramaters for siblings_spouses, which includes mean_sibsps_survived,
                                    variance_sibsps_survived, mean_sibsps_deceased, variance_sibsps_deceased
        
       7) parents_children_params: the paramters for parents_children, which includes mean_parchd_survived,
                                   variance_parchd_survived, mean_parchd_deceased, variance_parchd_deceased
        
       8) fare_params: the paramaters for fare, which includes mean_fare_survived, variance_fare_survived, 
                       mean_fare_deceased, variance_fare_deceased


Output: the prediction for the test instance
'''
def prediction(test_instance, y_params, p_class_params, sex_params, age_params, siblings_spouses_params, parents_children_params, fare_params):
    
    # Compute P(y = survived | x)
    
    # Computing P(x | y = survived)
    
    p_class_survived_params = p_class_params[0]
    p_class_prob_survived = p_class_survived_params[int(test_instance.p_class) - 1]
    
    sex_survived_params = sex_params[0]
    sex_prob_survived = sex_survived_params[int(test_instance.sex)]
    
    age_survived_params = age_params[0]
    age_prob_survived = scipy.stats.norm(age_survived_params[0], math.sqrt(age_survived_params[1])).pdf(test_instance.age)
        
    sibsps_survived_params = siblings_spouses_params[0]
    sibsps_prob_survived = sibsps_survived_params[int(test_instance.siblings_Spouses)]
    
    parchd_survived_params = parents_children_params[0]
    parchd_prob_survived = parchd_survived_params[int(test_instance.parents_Children)]
    
    fare_survived_params = fare_params[0]
    fare_prob_survived = scipy.stats.norm(fare_survived_params[0], math.sqrt(fare_survived_params[1])).pdf(test_instance.fare)
    
    
    
    # P(y) * P(x | y = Survived) 
    
    y_survived = y_params[0]
    prob_survived_x = (y_survived * p_class_prob_survived * sex_prob_survived * age_prob_survived * sibsps_prob_survived * parchd_prob_survived * fare_prob_survived)
    
    print('Computing probabilities associated with survival')
    print('P(y = survived): ' + str(y_survived))
    print('P(p_class | survived): ' + str(p_class_prob_survived))
    print('P(sex | survived): ' + str(sex_prob_survived))
    print('P(age | survived): ' + str(age_prob_survived))
    print('P(siblings_spouses_aboard | survived): ' + str(sibsps_prob_survived))
    print('P(parents_children_aboard | survived): ' + str(parchd_prob_survived))
    print('P(fare | survived): ' + str(fare_prob_survived))
    print('P(survived | x): ' + str(prob_survived_x))
    
    
    
    # Compute P(y = deceased / x)
    
    # Computing P(x | y = deceased)
    
    p_class_deceased_params = p_class_params[1]
    p_class_prob_deceased = p_class_deceased_params[int(test_instance.p_class) - 1]
    
    sex_deceased_params = sex_params[1]
    sex_prob_deceased = sex_deceased_params[int(test_instance.sex)]
    
    age_deceased_params = age_params[1]
    age_prob_deceased = scipy.stats.norm(age_deceased_params[0], math.sqrt(age_deceased_params[1])).pdf(test_instance.age)
        
    
    sibsps_deceased_params = siblings_spouses_params[1]
    sibsps_prob_deceased = sibsps_deceased_params[int(test_instance.siblings_Spouses)]
    
    parchd_deceased_params = parents_children_params[1]
    parchd_prob_deceased = parchd_deceased_params[int(test_instance.parents_Children)]
   
    fare_deceased_params = fare_params[1]
    fare_prob_deceased = scipy.stats.norm(fare_deceased_params[0], math.sqrt(fare_deceased_params[1])).pdf(test_instance.fare)
    
    
    # P(y) * P(x | y = Survived) 
    
    y_deceased = y_params[1]
    prob_deceased_x = (y_deceased * p_class_prob_deceased * sex_prob_deceased * age_prob_deceased * sibsps_prob_deceased *parchd_prob_deceased * fare_prob_deceased)
    
    print()
    print('computing probabilities associated with dying')
    print('P(y = deceased): ' + str(y_deceased))
    print('P(p_class | deceased): ' + str(p_class_prob_deceased))
    print('P(sex | deceased): ' + str(sex_prob_deceased))
    print('P(age | deceased): '+ str(age_prob_deceased))
    print('P(siblings_spouses_aboard | deceased): '+ str(sibsps_prob_deceased))
    print('P(parents_children_aboard | deceased): ' + str(parchd_prob_deceased))
    print('P(fare | deceased): '+ str(fare_prob_deceased))
    print('P(deceased | x): '+ str(prob_deceased_x))
    print()
    #Make prediction
  
    if(prob_survived_x > prob_deceased_x):
        return 1
    else:
        return 0

In [7]:
import pandas as pd
import numpy as np
import math
import scipy.stats

'''
Load Dataset
'''
df = pd.read_csv('titanic_data.csv')
df.rename(columns = {"Siblings/Spouses Aboard": "Siblings_Spouses_Aboard"}, inplace = True)
df.rename(columns = {"Parents/Children Aboard": "Parents_Children_Aboard"}, inplace = True)

'''
Feature Transformation for Siblings_Spouses_Aboard
'''
df.loc[df.Siblings_Spouses_Aboard == 0, "Siblings_Spouses_Aboard"] = 0
df.loc[df.Siblings_Spouses_Aboard == 1, "Siblings_Spouses_Aboard"] = 1
df.loc[df.Siblings_Spouses_Aboard >= 2, "Siblings_Spouses_Aboard"] = 2


'''
Feature Transformation for Parents_Children Aboard
'''
df.loc[df.Parents_Children_Aboard == 0, "Parents_Children_Aboard"] = 0
df.loc[df.Parents_Children_Aboard == 1, "Parents_Children_Aboard"] = 1
df.loc[df.Parents_Children_Aboard == 2, "Parents_Children_Aboard"] = 2
df.loc[df.Parents_Children_Aboard >= 3, "Parents_Children_Aboard"] = 3


'''
Code to generate training instances
'''

train_instances = []

for index, row in df.iterrows():
    
    temp = TrainingInstance(row['Pclass'], row['Sex'], row['Age'], row['Siblings_Spouses_Aboard'], row['Parents_Children_Aboard'], row['Fare'], row['Survived'])
    train_instances.append(temp)

In [8]:
'''
Code to compute p(y)
'''

num_survived = 0
num_deceased = 0
total = len(train_instances)

for elem in train_instances:
    
    if(elem.survived == 1):
        num_survived += 1
    else:
        num_deceased += 1


prob_survived = float(num_survived) / float(total)
prob_deceased = float(num_deceased) / float(total)

print('P(y) computation')
print('Survived Probability: ' + str(prob_survived))
print('Death Probability: ' + str(prob_deceased))



P(y) computation
Survived Probability: 0.3855693348365276
Death Probability: 0.6144306651634723


In [13]:
'''
Computations for p_class feature
'''

#p_class = 1
num_p_class_1_survived, num_p_class_1_deceased = compute_discrete_conditional(train_instances, 'p_class', 1)
prob_p_class_1_survived = float(num_p_class_1_survived + 1) / float(num_survived + 3)
prob_p_class_1_deceased = float(num_p_class_1_deceased + 1) / float(num_deceased + 3)

#p_class = 2
num_p_class_2_survived, num_p_class_2_deceased = compute_discrete_conditional(train_instances, 'p_class', 2)
prob_p_class_2_survived = float(num_p_class_2_survived + 1) / float(num_survived + 3)
prob_p_class_2_deceased = float(num_p_class_2_deceased + 1) / float(num_deceased + 3)

#p_class = 3
num_p_class_3_survived, num_p_class_3_deceased = compute_discrete_conditional(train_instances, 'p_class', 3)
prob_p_class_3_survived = float(num_p_class_3_survived + 1) / float(num_survived + 3)
prob_p_class_3_deceased = float(num_p_class_3_deceased + 1) / float(num_deceased + 3)


print('Computations for p_class')
print('P(p_class = 1 | survived): ' + str(prob_p_class_1_survived))
print('P(p_class = 1 | deceased): ' + str(prob_p_class_1_deceased))
print('P(p_class = 2 | survived): ' + str(prob_p_class_2_survived))
print('P(p_class = 2 | deceased): ' + str(prob_p_class_2_deceased))
print('P(p_class = 3 | survived): ' + str(prob_p_class_3_survived))
print('P(p_class = 3 | deceased): ' + str(prob_p_class_3_deceased))

Computations for p_class
P(p_class = 1 | survived): 0.39710144927536234
P(p_class = 1 | deceased): 0.1478102189781022
P(p_class = 2 | survived): 0.25507246376811593
P(p_class = 2 | deceased): 0.17883211678832117
P(p_class = 3 | survived): 0.34782608695652173
P(p_class = 3 | deceased): 0.6733576642335767


In [14]:
'''
Computations for sex_feature
'''

#Male
num_male_survived, num_male_deceased = compute_discrete_conditional(train_instances, 'sex', 0)
prob_male_given_survived = float(num_male_survived + 1) / float(num_survived + 2)
prob_male_given_deceased = float(num_male_deceased + 1) / float(num_deceased + 2)


#Female
num_female_survived, num_female_deceased = compute_discrete_conditional(train_instances, 'sex', 1)
prob_female_given_survived = float(num_female_survived + 1) / float(num_survived + 2)
prob_female_given_deceased = float(num_female_deceased + 1) / float(num_deceased + 2)

print('Computations for Sex')
print('P(sex = male | survived): ' + str(prob_male_given_survived))
print('P(sex = male | deceased): ' + str(prob_male_given_deceased))
print('P(sex = female | survived): ' + str(prob_female_given_survived))
print('P(sex = female | deceased): ' + str(prob_female_given_deceased))



Computations for Sex
P(sex = male | survived): 0.31976744186046513
P(sex = male | deceased): 0.850091407678245
P(sex = female | survived): 0.6802325581395349
P(sex = female | deceased): 0.14990859232175502


In [34]:
'''
Computations for age
'''

age_survived, age_deceased = compute_mean_estimate(train_instances, 'age')
mean_age_survived = age_survived / num_survived
mean_age_deceased = age_deceased / num_deceased


var_age_total_survived, var_age_total_deceased = compute_variance_estimate(train_instances, 'age', mean_age_survived, mean_age_deceased)

variance_age_survived = var_age_total_survived / (num_survived)
variance_age_deceased = var_age_total_deceased / (num_deceased)

print('Computations for age')
print('Mean age survived: ' + str(mean_age_survived))
print('Variance age survived: ' + str(variance_age_survived))
print('Mean age deceased: ' + str(mean_age_deceased))
print('Variance age deceased: '  + str(variance_age_deceased))




Computations for age
Mean age survived: 28.408391812865496
Variance age survived: 207.55457431431904
Mean age deceased: 30.13853211009174
Variance age deceased: 192.80879050584957


In [35]:
'''
Computations for siblings_Spouses
'''

num_sibsps_0_survived, num_sibsps_0_deceased = compute_discrete_conditional(train_instances, 'siblings_Spouses', 0)
prob_0_sibsps_survived = float(num_sibsps_0_survived + 1) / float(num_survived + 3)
prob_0_sibsps_deceased = float(num_sibsps_0_deceased + 1) / float(num_deceased + 3)

num_sibsps_1_survived, num_sibsps_1_deceased = compute_discrete_conditional(train_instances, 'siblings_Spouses', 1)
prob_1_sibsps_survived = float(num_sibsps_1_survived + 1) / float(num_survived + 3)
prob_1_sibsps_deceased = float(num_sibsps_1_deceased + 1) / float(num_deceased + 3)

num_sibsps_2_survived, num_sibsps_2_deceased = compute_discrete_conditional(train_instances, 'siblings_Spouses', 2)
prob_2_sibsps_survived = float(num_sibsps_2_survived + 1) / float(num_survived + 3)
prob_2_sibsps_deceased = float(num_sibsps_2_deceased + 1) / float(num_deceased + 3)

print('Computations for siblings_Spouses')
print('P(siblings_Spouses = 0 | survived): ' + str(prob_0_sibsps_survived))
print('P(siblings_Spouses = 0 | deceased): ' + str(prob_0_sibsps_deceased))
print('P(siblings_Spouses = 1 | survived): ' + str(prob_1_sibsps_survived))
print('P(siblings_Spouses = 1 | deceased): ' + str(prob_1_sibsps_deceased))
print('P(siblings_Spouses = 2 | survived): ' + str(prob_2_sibsps_survived))
print('P(siblings_Spouses = 2 | deceased): ' + str(prob_2_sibsps_deceased))

Computations for siblings_Spouses
P(siblings_Spouses = 0 | survived): 0.6115942028985507
P(siblings_Spouses = 0 | deceased): 0.7208029197080292
P(siblings_Spouses = 1 | survived): 0.32753623188405795
P(siblings_Spouses = 1 | deceased): 0.17883211678832117
P(siblings_Spouses = 2 | survived): 0.06086956521739131
P(siblings_Spouses = 2 | deceased): 0.10036496350364964


In [9]:
'''
Computations for parents_Children
'''

num_parchd_0_survived, num_parchd_0_deceased = compute_discrete_conditional(train_instances, 'parents_Children', 0)
prob_0_parchd_survived = float(num_parchd_0_survived + 1) / float(num_survived + 4)
prob_0_parchd_deceased = float(num_parchd_0_deceased + 1) / float(num_deceased + 4)

num_parchd_1_survived, num_parchd_1_deceased = compute_discrete_conditional(train_instances, 'parents_Children', 1)
prob_1_parchd_survived = float(num_parchd_1_survived + 1) / float(num_survived + 4)
prob_1_parchd_deceased = float(num_parchd_1_deceased + 1) / float(num_deceased + 4)

num_parchd_2_survived, num_parchd_2_deceased = compute_discrete_conditional(train_instances, 'parents_Children', 2)
prob_2_parchd_survived = float(num_parchd_2_survived + 1) / float(num_survived + 4)
prob_2_parchd_deceased = float(num_parchd_2_deceased + 1) / float(num_deceased + 4)

num_parchd_3_survived, num_parchd_3_deceased = compute_discrete_conditional(train_instances, 'parents_Children', 3)
prob_3_parchd_survived = float(num_parchd_3_survived + 1) / float(num_survived + 4)
prob_3_parchd_deceased = float(num_parchd_3_deceased + 1) / float(num_deceased + 4)


print('Computations for Parents_Children')
print('P(parents_Children = 0 | survived): ' + str(prob_0_parchd_survived))
print('P(parents_Children = 0 | deceased): ' + str(prob_0_parchd_deceased))
print('P(parents_Children = 1 | survived): ' + str(prob_1_parchd_survived))
print('P(parents_Children = 1 | deceased): ' + str(prob_1_parchd_deceased))
print('P(parents_Children = 2 | survived): ' + str(prob_2_parchd_survived))
print('P(parents_Children = 2 | deceased): ' + str(prob_2_parchd_deceased))
print('P(parents_Children = 3 | survived): ' + str(prob_3_parchd_survived))
print('P(parents_Children = 3 | deceased): ' + str(prob_3_parchd_deceased))

Computations for Parents_Children
P(parents_Children = 0 | survived): 0.6763005780346821
P(parents_Children = 0 | deceased): 0.8051001821493625
P(parents_Children = 1 | survived): 0.1907514450867052
P(parents_Children = 1 | deceased): 0.09836065573770492
P(parents_Children = 2 | survived): 0.11849710982658959
P(parents_Children = 2 | deceased): 0.07468123861566485
P(parents_Children = 3 | survived): 0.014450867052023121
P(parents_Children = 3 | deceased): 0.02185792349726776


In [41]:
'''
Computations for fare
'''

fare_survived, fare_deceased = compute_mean_estimate(train_instances, 'fare')
mean_fare_survived = fare_survived / num_survived
mean_fare_deceased = fare_deceased / num_deceased


var_fare_total_survived, var_fare_total_deceased = compute_variance_estimate(train_instances, 'fare', mean_fare_survived, mean_fare_deceased)

variance_fare_survived = var_fare_total_survived / (num_survived)
variance_fare_deceased = var_fare_total_deceased / (num_deceased)

print('Computations for fare')
print('Mean fare survived: ' + str(mean_fare_survived))
print('Variance fare survived: ' + str(variance_fare_survived))
print('Mean fare deceased: ' + str(mean_fare_deceased))
print('Variance fare deceased: '  + str(variance_fare_deceased))


Computations for fare
Mean fare survived: 48.39540760233917
Variance fare survived: 4422.191853811519
Mean fare deceased: 22.208584036697225
Variance fare deceased: 989.4331005100377


In [42]:
y_params = [prob_survived, prob_deceased]

p_class_params = [[prob_p_class_1_survived, prob_p_class_2_survived, prob_p_class_3_survived], 
                  [prob_p_class_1_deceased, prob_p_class_2_deceased, prob_p_class_3_deceased]]

sex_params = [[prob_male_given_survived, prob_female_given_survived ], [prob_male_given_deceased,prob_female_given_deceased]]

age_params = [[mean_age_survived, variance_age_survived], [mean_age_deceased,variance_age_deceased]]

sibling_spouse_params = [[prob_0_sibsps_survived, prob_1_sibsps_survived, prob_2_sibsps_survived ],
                        [prob_0_sibsps_deceased, prob_1_sibsps_deceased, prob_2_sibsps_deceased]]

parents_children_params = [[prob_0_parchd_survived, prob_1_parchd_survived, prob_2_parchd_survived, prob_0_parchd_survived],
                          [prob_0_parchd_deceased, prob_1_parchd_deceased, prob_2_parchd_deceased, prob_3_parchd_deceased]]

fare_params = [[mean_fare_survived, variance_fare_survived ], [mean_fare_deceased,variance_fare_deceased ]]




In [43]:
test_instance = TestInstance(1, 0, 22, 0, 0, 75.25)
print('Prediction: ' + str(prediction(test_instance, y_params, p_class_params, sex_params, age_params, sibling_spouse_params, 
                parents_children_params, fare_params)))

Computing probabilities associated with survival
P(y = survived): 0.3855693348365276
P(p_class | survived): 0.39710144927536234
P(sex | survived): 0.31976744186046513
P(age | survived): 0.025082936947809435
P(siblings_spouses_aboard | survived): 0.6115942028985507
P(parents_children_aboard | survived): 0.6763005780346821
P(fare | survived): 0.005529412598023891
P(survived | x): 2.8086569903656946e-06

computing probabilities associated with dying
P(y = deceased): 0.6144306651634723
P(p_class | deceased): 0.1478102189781022
P(sex | deceased): 0.850091407678245
P(age | deceased): 0.024196354856799572
P(siblings_spouses_aboard | deceased): 0.7208029197080292
P(parents_children_aboard | deceased): 0.8051001821493625
P(fare | deceased): 0.0030603570829236
P(deceased | x): 3.317656966341943e-06

Prediction: 0


In [None]:
'''
Code for Naive Bayes Cross Validation
'''

In [8]:
'''
balance_classes is a method that balance classes for each subset before applying cross-validation

Input: 1) the list of training instances

Output: a list that balances the instances


We know that 342 survived and 545 died
In CV we'll use 9 subsets with 89 samples and one with 86

For the 9 subsets with 89 samples we'll have:

34 instances of survived
55 instances of death

For the 1 subset with 86 samples we'll have 

36 instances of survived
50 instances of death

'''
def balance_classes(instances):
    
    survived = []
    deceased = []
    
    for elem in instances:
        
        if (elem.survived == 1):
            survived.append(elem)
        else:
            deceased.append(elem)
    
    
    temp = []
    survived_index = 0
    death_index = 0
    
    #First 9 susbsets
    for i in range(9):
        
        for j in range(34):
            temp.append(survived[survived_index])
            survived_index += 1
            
        
        for k in range(55):
            
            temp.append(deceased[death_index])
            death_index += 1
    
    
    
    #Last subset
    for j in range(36):
        temp.append(survived[survived_index])
        survived_index+=1
    
    for k in range(50):
        temp.append(deceased[death_index])
        death_index += 1
    
    
    return temp

In [9]:
'''
compute_accuracy is a function that computes Naive Bayes prediction accuracy over the test set

Input: 1) subset: a list of test instances

       2) y_params: the parameters for y, which includes P(y = survived) and P(y = deceased)
       
       3) p_class_params: the parameters for p_class, which includes P(p_class = 1 | survived), P(p_class = 1 | deceased),
              P(p_class = 2 | survived), P(p_class = 2 | deceased), P(p_class = 3 | survived) and P(p_class = 3 | deceased)
        
       4) sex_params: the paramaters for sex, which includes P(sex = male | survived), P(sex = male | deceased), 
                      P(sex = female, | survived), and P(sex = female | deceased)
    
       5) age_params: the parameters for age, which includes mean_age_survived, mean_age_deceased, variance_age_survived, and
                      variance_age_deceased
       
       6) siblings_spouses_params: the paramaters for siblings_spouses, which includes mean_sibsps_survived,
                                    variance_sibsps_survived, mean_sibsps_deceased, variance_sibsps_deceased
        
       7) parents_children_params: the paramters for parents_children, which includes mean_parchd_survived,
                                   variance_parchd_survived, mean_parchd_deceased, variance_parchd_deceased
        
       8) fare_params: the paramaters for fare, which includes mean_fare_survived, variance_fare_survived, 
                       mean_fare_deceased, variance_fare_deceased

       
Output: the prediction accuracy for the subset
'''
def compute_accuracy(subset, y_params,p_class_params,sex_params,age_params, sibling_spouse_params, parents_children_params,fare_params ):
    
    num_correct = 0
    
    for elem in subset:
        
        pred = prediction(elem, y_params,p_class_params,sex_params,age_params, sibling_spouse_params, parents_children_params,fare_params)
        
        if(pred == elem.survived):
            num_correct += 1
    
    
    return float(num_correct)/ float(len(subset))

In [13]:
'''
cross_validation is a method that performs 10-fold cross validation 

Input: 1) instances - the list of instances on which we will perform CV

Output: the average accuracy of 10-fold cross validation
'''
def cross_validation(instances):
    
    #Divide into 10 subsets 
    subset_1 = []
    subset_2 = []
    subset_3 = []
    subset_4 = []
    subset_5 = []
    subset_6 = []
    subset_7 = []
    subset_8 = []
    subset_9 = []
    subset_10 = []
    subset_list = []
    
    
    #Call balance_classes method (defined in next cell)
    balanced_dist = balance_classes(instances)
    
   
    #Since there are 887 samples we'll have 9 subsets that have 89 samples and 1 that has 86
    training_instance_list = []
   
    j = 0
    for i in range(10):
        
        temp = []
        
        for element in balanced_dist:
            temp.append(element)
        
        if(i == 9):
            for l in range(86):
                del temp[j]
        else:
            for l in range(89):
                del temp[j]
        
        training_instance_list.append(temp)
        j += 89
    
    
        
    for i in range(len(balanced_dist)):
        
        if(i < 89):
            subset_1.append(balanced_dist[i])
           
        elif(i < 178):
            subset_2.append(balanced_dist[i])
        
        elif(i < 267):
            subset_3.append(balanced_dist[i])
        
        elif(i < 356):
            subset_4.append(balanced_dist[i])
        
        elif(i < 445):
            subset_5.append(balanced_dist[i])
    
        elif(i < 534):
            subset_6.append(balanced_dist[i])
            
        elif(i < 623):
            subset_7.append(balanced_dist[i])

        elif(i < 712):
            subset_8.append(balanced_dist[i])
    
        elif(i < 801):
            subset_9.append(balanced_dist[i])
            
        else:
            subset_10.append(balanced_dist[i])
            
    
    subset_list.append(subset_1)
    subset_list.append(subset_2)
    subset_list.append(subset_3)
    subset_list.append(subset_4)
    subset_list.append(subset_5)
    subset_list.append(subset_6)
    subset_list.append(subset_7)
    subset_list.append(subset_8)
    subset_list.append(subset_9)
    subset_list.append(subset_10)
    
    total_accuracy = 0
    
    for i in range(10):
        
        num_survived = 0
        num_deceased = 0
        t_instances = training_instance_list[i]
        subset = subset_list[i]
        total = len(t_instances)
        
        for elem in t_instances:
    
            if(elem.survived == 1):
                num_survived += 1
            else:
                 num_deceased += 1


        prob_survived = float(num_survived) / float(total)
        prob_deceased = float(num_deceased) / float(total)

        #p_class = 1
        num_p_class_1_survived, num_p_class_1_deceased = compute_discrete_conditional(t_instances, 'p_class', 1)
        prob_p_class_1_survived = float(num_p_class_1_survived + 1) / float(num_survived + 3)
        prob_p_class_1_deceased = float(num_p_class_1_deceased + 1) / float(num_deceased + 3)

        #p_class = 2
        num_p_class_2_survived, num_p_class_2_deceased = compute_discrete_conditional(t_instances, 'p_class', 2)
        prob_p_class_2_survived = float(num_p_class_2_survived + 1) / float(num_survived + 3)
        prob_p_class_2_deceased = float(num_p_class_2_deceased + 1) / float(num_deceased + 3)

        #p_class = 3
        num_p_class_3_survived, num_p_class_3_deceased = compute_discrete_conditional(t_instances, 'p_class', 3)
        prob_p_class_3_survived = float(num_p_class_3_survived + 1) / float(num_survived + 3)
        prob_p_class_3_deceased = float(num_p_class_3_deceased + 1) / float(num_deceased + 3)
        
    
        #Male
        num_male_survived, num_male_deceased = compute_discrete_conditional(t_instances, 'sex', 0)
        prob_male_given_survived = float(num_male_survived + 1) / float(num_survived + 2)
        prob_male_given_deceased = float(num_male_deceased + 1) / float(num_deceased + 2)


        #Female
        num_female_survived, num_female_deceased = compute_discrete_conditional(t_instances, 'sex', 1)
        prob_female_given_survived = float(num_female_survived + 1) / float(num_survived + 2)
        prob_female_given_deceased = float(num_female_deceased + 1) / float(num_deceased + 2)
    
        #Age
        age_survived, age_deceased = compute_mean_estimate(t_instances, 'age')
        mean_age_survived = age_survived / num_survived
        mean_age_deceased = age_deceased / num_deceased


        var_age_total_survived, var_age_total_deceased = compute_variance_estimate(t_instances, 'age', mean_age_survived, mean_age_deceased)

        variance_age_survived = var_age_total_survived / (num_survived)
        variance_age_deceased = var_age_total_deceased / (num_deceased)
        
        
        #Siblings Spouses
        
        num_sibsps_0_survived, num_sibsps_0_deceased = compute_discrete_conditional(t_instances, 'siblings_Spouses', 0)
        prob_0_sibsps_survived = float(num_sibsps_0_survived + 1) / float(num_survived + 3)
        prob_0_sibsps_deceased = float(num_sibsps_0_deceased + 1) / float(num_deceased + 3)

        num_sibsps_1_survived, num_sibsps_1_deceased = compute_discrete_conditional(t_instances, 'siblings_Spouses', 1)
        prob_1_sibsps_survived = float(num_sibsps_1_survived + 1) / float(num_survived + 3)
        prob_1_sibsps_deceased = float(num_sibsps_1_deceased + 1) / float(num_deceased + 3)

        num_sibsps_2_survived, num_sibsps_2_deceased = compute_discrete_conditional(t_instances, 'siblings_Spouses', 2)
        prob_2_sibsps_survived = float(num_sibsps_2_survived + 1) / float(num_survived + 3)
        prob_2_sibsps_deceased = float(num_sibsps_2_deceased + 1) / float(num_deceased + 3)
        
        
        #Parents Children
        num_parchd_0_survived, num_parchd_0_deceased = compute_discrete_conditional(t_instances, 'parents_Children', 0)
        prob_0_parchd_survived = float(num_parchd_0_survived + 1) / float(num_survived + 4)
        prob_0_parchd_deceased = float(num_parchd_0_deceased + 1) / float(num_deceased + 4)

        num_parchd_1_survived, num_parchd_1_deceased = compute_discrete_conditional(t_instances, 'parents_Children', 1)
        prob_1_parchd_survived = float(num_parchd_1_survived + 1) / float(num_survived + 4)
        prob_1_parchd_deceased = float(num_parchd_1_deceased + 1) / float(num_deceased + 4)

        num_parchd_2_survived, num_parchd_2_deceased = compute_discrete_conditional(t_instances, 'parents_Children', 2)
        prob_2_parchd_survived = float(num_parchd_2_survived + 1) / float(num_survived + 4)
        prob_2_parchd_deceased = float(num_parchd_2_deceased + 1) / float(num_deceased + 4)

        num_parchd_3_survived, num_parchd_3_deceased = compute_discrete_conditional(t_instances, 'parents_Children', 3)
        prob_3_parchd_survived = float(num_parchd_3_survived + 1) / float(num_survived + 4)
        prob_3_parchd_deceased = float(num_parchd_3_deceased + 1) / float(num_deceased + 4)
    
    
        #Fare
        fare_survived, fare_deceased = compute_mean_estimate(t_instances, 'fare')
        mean_fare_survived = fare_survived / num_survived
        mean_fare_deceased = fare_deceased / num_deceased


        var_fare_total_survived, var_fare_total_deceased = compute_variance_estimate(t_instances, 'fare', mean_fare_survived, mean_fare_deceased)

        variance_fare_survived = var_fare_total_survived / (num_survived)
        variance_fare_deceased = var_fare_total_deceased / (num_deceased)
    
    
        y_params = [prob_survived, prob_deceased]

        p_class_params = [[prob_p_class_1_survived, prob_p_class_2_survived, prob_p_class_3_survived], 
                  [prob_p_class_1_deceased, prob_p_class_2_deceased, prob_p_class_3_deceased]]

        sex_params = [[prob_male_given_survived, prob_female_given_survived ], [prob_male_given_deceased,prob_female_given_deceased]]

        age_params = [[mean_age_survived, variance_age_survived], [mean_age_deceased,variance_age_deceased]]

        sibling_spouse_params = [[prob_0_sibsps_survived, prob_1_sibsps_survived, prob_2_sibsps_survived ],
                        [prob_0_sibsps_deceased, prob_1_sibsps_deceased, prob_2_sibsps_deceased]]

        parents_children_params = [[prob_0_parchd_survived, prob_1_parchd_survived, prob_2_parchd_survived, prob_0_parchd_survived],
                          [prob_0_parchd_deceased, prob_1_parchd_deceased, prob_2_parchd_deceased, prob_3_parchd_deceased]]

        fare_params = [[mean_fare_survived, variance_fare_survived ], [mean_fare_deceased,variance_fare_deceased ]]

        
        accuracy = compute_accuracy(subset, y_params,p_class_params,sex_params,age_params, sibling_spouse_params, parents_children_params,fare_params)
        
        total_accuracy += accuracy
    
    print('Total Accuracy: ' + str(total_accuracy/10))

In [18]:
cross_validation(train_instances)

Total Accuracy: 0.7565194669453879
