### Implementing a Naive Bayes Classifier

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the weather observations from the tab-delimited text file.
df = pd.read_csv('weather1.txt', sep='\t')

In [5]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Cloudy,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [6]:
# Separate the label column ('Play') from the predictor columns.
y = df['Play']
X = df.drop(columns=['Play'])

In [7]:
# Show up to the first 20 rows of the data set.
df.head(20)

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Cloudy,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Cloudy,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [8]:
# Predictor names in column order; post_prob pairs input values with
# these names by position, so inputs must follow the same order.
features = X.columns
print(features)

Index(['Outlook', 'Temp', 'Humidity', 'Windy'], dtype='object')


In [122]:
#finding prior probabilities and likelihood
# pred_prior_prob[(feature, value)]        -> P(feature = value)
# cond_prob[(outcome, (feature, value))]   -> P(feature = value | outcome)
cond_prob = {}
pred_prior_prob = {}

n_rows = len(X)
# The class masks and class sizes do not depend on the feature being
# scanned, so they are built once instead of inside the innermost loop.
class_masks = {outcome: (y == outcome) for outcome in np.unique(y)}
class_counts = {outcome: int(mask.sum()) for outcome, mask in class_masks.items()}

for feature in features:
    for feat_val in sorted(np.unique(X[feature])):
        value_mask = X[feature] == feat_val
        # int(...) keeps the stored values plain Python floats, as before.
        pred_prior_prob[(feature, feat_val)] = int(value_mask.sum()) / n_rows
        for outcome, class_mask in class_masks.items():
            joint_count = int((value_mask & class_mask).sum())
            cond_prob[(outcome, (feature, feat_val))] = joint_count / class_counts[outcome]

# NOTE(review): no smoothing is applied, so a value never seen with a class
# gets likelihood 0.0 (e.g. ('No', ('Outlook', 'Cloudy'))) and forces that
# class's score to zero. Additive (Laplace) smoothing is the usual remedy;
# it is left out here so the numbers shown below stay unchanged.

In [132]:
# Display the likelihood table: P(feature = value | outcome) for every combination.
print(cond_prob)

{('No', ('Outlook', 'Cloudy')): 0.0, ('Yes', ('Outlook', 'Cloudy')): 0.4444444444444444, ('No', ('Outlook', 'Rain')): 0.4, ('Yes', ('Outlook', 'Rain')): 0.3333333333333333, ('No', ('Outlook', 'Sunny')): 0.6, ('Yes', ('Outlook', 'Sunny')): 0.2222222222222222, ('No', ('Temp', 'Cool')): 0.2, ('Yes', ('Temp', 'Cool')): 0.3333333333333333, ('No', ('Temp', 'Hot')): 0.4, ('Yes', ('Temp', 'Hot')): 0.2222222222222222, ('No', ('Temp', 'Mild')): 0.4, ('Yes', ('Temp', 'Mild')): 0.4444444444444444, ('No', ('Humidity', 'High')): 0.8, ('Yes', ('Humidity', 'High')): 0.3333333333333333, ('No', ('Humidity', 'Normal')): 0.2, ('Yes', ('Humidity', 'Normal')): 0.6666666666666666, ('No', ('Windy', 'Strong')): 0.6, ('Yes', ('Windy', 'Strong')): 0.3333333333333333, ('No', ('Windy', 'Weak')): 0.4, ('Yes', ('Windy', 'Weak')): 0.6666666666666666}


In [124]:
# Display the marginal probability of each feature value, P(feature = value).
pred_prior_prob

{('Outlook', 'Cloudy'): 0.2857142857142857,
 ('Outlook', 'Rain'): 0.35714285714285715,
 ('Outlook', 'Sunny'): 0.35714285714285715,
 ('Temp', 'Cool'): 0.2857142857142857,
 ('Temp', 'Hot'): 0.2857142857142857,
 ('Temp', 'Mild'): 0.42857142857142855,
 ('Humidity', 'High'): 0.5,
 ('Humidity', 'Normal'): 0.5,
 ('Windy', 'Strong'): 0.42857142857142855,
 ('Windy', 'Weak'): 0.5714285714285714}

In [125]:
# Prior probability of each class: its share of all labelled rows.
class_prior_prob = {}
for label in np.unique(y):
    rows_with_label = y[y == label]
    class_prior_prob[label] = len(rows_with_label) / len(y)
class_prior_prob

{'No': 0.35714285714285715, 'Yes': 0.6428571428571429}

In [127]:
#finding posterior probability 
def post_prob(outcome, input_values, *, feature_names=None, class_priors=None,
              likelihoods=None, predictor_priors=None):
    """Score `outcome` for one observation under the naive Bayes assumption.

    Computes ``P(outcome) * prod_i P(x_i | outcome) / prod_i P(x_i)``.
    The denominator is the product of the marginal feature probabilities,
    not the true evidence term, so scores for different outcomes can be
    compared with each other but are not guaranteed to sum to 1.

    Parameters
    ----------
    outcome : hashable
        Class label to score; must be a key of the class-prior table.
    input_values : sequence
        One value per feature, in the same order as the feature names.
    feature_names, class_priors, likelihoods, predictor_priors : optional
        Explicit tables to use. Each defaults to the notebook-level
        `features`, `class_prior_prob`, `cond_prob` and `pred_prior_prob`
        respectively, so existing two-argument calls behave as before.

    Returns
    -------
    float
        The (unnormalised) posterior score.

    Raises
    ------
    ValueError
        If the number of input values differs from the number of features.
    KeyError
        If `outcome` is unknown or a feature value was never observed.

    Side effects
    ------------
    Prints the numerator ``P(outcome) * prod_i P(x_i | outcome)``.
    """
    # Fall back to the tables built earlier in the notebook.
    feature_names = features if feature_names is None else feature_names
    class_priors = class_prior_prob if class_priors is None else class_priors
    likelihoods = cond_prob if likelihoods is None else likelihoods
    predictor_priors = pred_prior_prob if predictor_priors is None else predictor_priors

    # Values are matched to features by position, so a length mismatch would
    # otherwise either crash with an IndexError or silently drop extra values.
    if len(input_values) != len(feature_names):
        raise ValueError(
            "expected %d input values (one per feature), got %d"
            % (len(feature_names), len(input_values))
        )

    class_prior = class_priors[outcome]
    cp = 1
    pred_prior = 1
    for feature, value in zip(feature_names, input_values):
        if (feature, value) not in predictor_priors:
            raise KeyError(
                "value %r was never observed for feature %r" % (value, feature)
            )
        cp *= likelihoods[(outcome, (feature, value))]
        pred_prior *= predictor_priors[(feature, value)]
    print("Numerator: ", cp*class_prior)
    return(cp*class_prior/pred_prior)


In [129]:
# Score the "Yes" class; values follow the feature order Outlook, Temp, Humidity, Windy.
post_prob("Yes", ['Rain', 'Hot', 'High', 'Weak'])

Numerator:  0.010582010582010581


0.36296296296296293

In [130]:
# Score the "No" class for the same observation.
post_prob("No", ['Rain', 'Hot', 'High', 'Weak'])

Numerator:  0.01828571428571429


0.6272000000000002

In [134]:
# Classify one observation by comparing the two class scores.
sample = ['Rain', 'Hot', 'High', 'Weak']
# "Yes" is scored before "No" so the printed numerators keep their order.
yes_score = post_prob("Yes", sample)
no_score = post_prob("No", sample)
verdict = (
    "Condition is fit for playing"
    if yes_score > no_score
    else "The given Condition is not fit for playing"
)
print(verdict)

Numerator:  0.010582010582010581
Numerator:  0.01828571428571429
The given Condition is not fit for playing
