# Naive Bayes

Referenece : https://www.freecodecamp.org/news/how-naive-bayes-classifiers-work/

In [11]:
class NaiveBayesClassifier:
    
    def __init__(self, X, y):
        self.X, self.y = X, y 
        self.N = len(self.X) # Training set length
        self.dim = len(self.X[0]) # Dimension of the vector of features

        self.attrs = [[] for _ in range(self.dim)] 

        self.output_dom = {} # Output classes with the number of ocurrences in the training set. In this case we have only 2 classes

        self.data = []
        
        
        for i in range(len(self.X)):
            for j in range(self.dim):
                if not self.X[i][j] in self.attrs[j]:
                    self.attrs[j].append(self.X[i][j])
                    
            
            if not self.y[i] in self.output_dom.keys():
                self.output_dom[self.y[i]] = 1
            
            else:
                self.output_dom[self.y[i]] += 1
                
            self.data.append([self.X[i], self.y[i]])
            
            

    def classify(self, entry):

        solve = None # Final result
        max_arg = -1 # partial maximum

        for y in self.output_dom.keys():

            prob = self.output_dom[y]/self.N # P(y)

            for i in range(self.dim):
                cases = [x for x in self.data if x[0][i] == entry[i] and x[1] == y] # all rows with Xi = xi
                n = len(cases)
                prob *= n/self.N # P *= P(Xi = xi)
                
            # if we have a greater prob for this output than the partial maximum...
            if prob > max_arg:
                max_arg = prob
                solve = y

        return solve

In [12]:
import pandas as pd

data = pd.read_csv('train.csv')

print(data.head())

   PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
0            1         0       3  ...   7.2500   NaN         S
1            2         1       1  ...  71.2833   C85         C
2            3         1       3  ...   7.9250   NaN         S
3            4         1       1  ...  53.1000  C123         S
4            5         0       3  ...   8.0500   NaN         S

[5 rows x 12 columns]


In [13]:
y = list(map(lambda v: 'yes' if v == 1 else 'no', data['Survived'].values)) 

X = data[['Pclass', 'Sex', 'Age']].values # features values

In [15]:
print(len(y)) # >> 887

# We'll take 600 examples to train and the rest to the validation process
y_train = y[:700]
y_val = y[700:]

X_train = X[:700]
X_val = X[700:]

891


In [16]:
## Creating the Naive Bayes Classifier instance with the training data

nbc = NaiveBayesClassifier(X_train, y_train)


total_cases = len(y_val) # size of validation set

# Well classified examples and bad classified examples
good = 0
bad = 0

for i in range(total_cases):
    predict = nbc.classify(X_val[i])
#     print(y_val[i] + ' --------------- ' + predict)
    if y_val[i] == predict:
        good += 1
    else:
        bad += 1

print('TOTAL EXAMPLES:', total_cases)
print('RIGHT:', good)
print('WRONG:', bad)
print('ACCURACY:', good/total_cases)

TOTAL EXAMPLES: 191
RIGHT: 148
WRONG: 43
ACCURACY: 0.774869109947644
