# Naive Bayes - Golf Dataset

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import GaussianNB


In [9]:
# Loading the dataset
df = pd.read_csv("data/golf-dataset.csv")
df

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


## Bayes Theorem

$P(A|B) = \dfrac{P(B|A)P(A)}{P(B)}$

The goal is to predict for:

Should we play golf if:
- **Outlook = Sunny,  Temperature = Cool,  Humidity = High,  Windy = True**

Mathematically:
- **P(Yes | Outlook = Sunny, Temperature = Cool, Humidity = High, Windy = True)**

### Checking the types of variables

In [8]:
print(df.dtypes)

Outlook      object
Temp         object
Humidity     object
Windy          bool
Play Golf    object
dtype: object


## Naives Bayes Manually

In [16]:
# Calculating the probability of playing or not playing golf separatly
p_yes = df[df["Play Golf"]== "Yes"].shape[0] / df.shape[0]
p_no = df[df["Play Golf"]== "No"].shape[0] / df.shape[0]


p_yes, p_no

(0.6428571428571429, 0.35714285714285715)

##### Calculating the probs manually
Example: P(Outlook = Sunny | play = Yes)
- p_outlook_sunny_yes = 3/9
- p_temp_cool_yes = 3/9
- p_humidity_high_yes = 3/9
- p_wind_true_yes = 3/9

P(Outlook = Sunny | play = No)

- p_outlook_sunny_no = 2/5
- p_temp_cool_no = 1/5
- p_humidity_high_no = 4/5
- p_wind_true_no = 3/5

### Calculating the probs by using pandas


In [26]:
# P(outlook == sunny | play == yes)
P_outlook_sunny_yes =  df[(df["Play Golf"] == "Yes") & (df["Outlook"] == "Sunny")].shape[0] / df[df["Play Golf"] == "Yes"].shape[0]
print("P(Outlook == Sunny | Play == Yes): ", P_outlook_sunny_yes)

# P(temp == cool | play == yes)
P_temp_cool_yes  = df[(df["Play Golf"] == "Yes") & (df["Temp"] == "Cool")].shape[0] / df[df["Play Golf"] == "Yes"].shape[0] 
print("P(Temp == Cool | Play == Yes): ", P_temp_cool_yes)

# P(humidity == high | play == yes)
P_humidity_high_yes  = df[(df["Play Golf"] == "Yes") & (df["Humidity"] == "High")].shape[0] / df[df["Play Golf"] == "Yes"].shape[0] 
print("P(Humidity == High | Play == Yes): ", P_humidity_high_yes)

# P(windy == True | play == yes)
P_windy_true_yes  = df[(df["Play Golf"] == "Yes") & (df["Windy"] == True)].shape[0] / df[df["Play Golf"] == "Yes"].shape[0] 
print("P(Windy == True | Play == Yes): ", P_windy_true_yes)

P(Outlook == Sunny | Play == Yes):  0.3333333333333333
P(Temp == Cool | Play == Yes):  0.3333333333333333
P(Humidity == High | Play == Yes):  0.3333333333333333
P(Windy == True | Play == Yes):  0.3333333333333333


In [23]:
# P(outlook == sunny | play == no)
P_outlook_sunny_no =  df[(df["Play Golf"] == "No") & (df["Outlook"] == "Sunny")].shape[0] / df[df["Play Golf"] == "No"].shape[0]
print("P(Outlook == Sunny | Play == No) :", P_outlook_sunny_no)

# P(temp == cool | play ==no)
P_temp_cool_no  = df[(df["Play Golf"] == "No") & (df["Temp"] == "Cool")].shape[0] / df[df["Play Golf"] == "No"].shape[0] 
print("P(Temp == Cool | Play == No) :", P_temp_cool_no)

# P(humidity == high | play == no)
P_humidity_high_no  = df[(df["Play Golf"] == "No") & (df["Humidity"] == "High")].shape[0] / df[df["Play Golf"] == "No"].shape[0] 
print("P(Humidity == High | Play == No) :", P_humidity_high_no)

# P(windy == True | play == no)
P_windy_true_no = df[(df["Play Golf"] == "No") & (df["Windy"] == True)].shape[0] / df[df["Play Golf"] == "No"].shape[0] 
print("P(Windy == True | Play == No) :", P_windy_true_no)

P(Outlook == Sunny | Play == No) : 0.4
P(Temp == Cool | Play == No) : 0.2
P(Humidity == High | Play == No) : 0.8
P(Windy == True | Play == No) : 0.6




$P(A|B) = \dfrac{P(B|A)P(A)}{P(B)}$

$P(Yes|Sunny, Cool, High, Windy) = \dfrac{P(Sunny, Cool, High, Windy|Yes)P(Yes)}{P(Sunny, Cool, High, Windy)}$

$P(No|Sunny, Cool, High, Windy) = \dfrac{P(Sunny, Cool, High, Windy|Yes)P(No)}{P(Sunny, Cool, High, Windy)}$

In [27]:
# P(Sunny, Cool, High, Windy | Yes)
p_sunny_cool_high_windy_yes = P_outlook_sunny_yes * P_temp_cool_yes * P_humidity_high_yes * P_windy_true_yes

p_sunny_cool_high_windy_yes

0.012345679012345678

In [29]:
# P(Sunny, Cool, High, Windy | No)

p_sunny_cool_high_windy_no = P_outlook_sunny_no * P_temp_cool_no * P_humidity_high_no * P_windy_true_no
p_sunny_cool_high_windy_no

0.03840000000000001

In [33]:
# normalizing the resulted value by
# P(Outlook = Sunny, Temperature = Cool, Humidity = High, Windy = True)

p_sunny = df[df["Outlook"] == "Sunny"].shape[0] / df.shape[0]
p_cool = df[df["Temp"] == "Cool"].shape[0] / df.shape[0]
p_high = df[df["Humidity"] == "High"].shape[0] / df.shape[0]
p_windy = df[df["Windy"] == True].shape[0] / df.shape[0]

# Calculating the denominator
p_sunny_cool_high_windy = p_sunny * p_cool * p_high * p_windy
p_sunny_cool_high_windy

0.35714285714285715 0.2857142857142857 0.5 0.42857142857142855


0.021865889212827987

In [34]:
# Applying the Bayes Theorem
p_final_yes = p_sunny_cool_high_windy_yes * p_yes / p_sunny_cool_high_windy
p_final_yes

0.36296296296296293

In [35]:
p_final_no = p_sunny_cool_high_windy_no * p_no / p_sunny_cool_high_windy
p_final_no

0.6272000000000002

### Naives Bayes by using SciKit Learn

In [44]:
# We need to convert the categorical variables into codes
columns = ['Outlook', 'Temp', 'Humidity', 'Windy', 'Play Golf']

# encoders = {"Outlook": le_object, "Temp": le2_object}
encoders = {}

# Creating label enconders for each variable
for col in columns:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
    
df

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,1,1,0,0,0
1,1,1,0,1,0
2,0,1,0,0,1
3,2,2,0,0,1
4,2,0,1,0,1
5,2,0,1,1,0
6,0,0,1,1,1
7,1,2,0,0,0
8,1,0,1,0,1
9,2,2,1,0,1


In [37]:
# Showing the encoder for each variable
encoders

{'Outlook': LabelEncoder(),
 'Temp': LabelEncoder(),
 'Humidity': LabelEncoder(),
 'Windy': LabelEncoder(),
 'Play Golf': LabelEncoder()}

In [38]:
# Insatitating the Naives Bayes model in this case we used Gaussian
nb = GaussianNB()

# Partitioning the data into features and target
X = df.drop(columns=["Play Golf"])
y = df["Play Golf"]

# Fitting the model
nb.fit(X, y)

nb

In [39]:
# Let's use a new data to test the model
data = ['Sunny', 'Cool', 'High', 'True']
columns = ['Outlook', 'Temp', 'Humidity', 'Windy']

# Converting into a DataFrame
df_test = pd.DataFrame(data=[data], columns=columns)
df_test

Unnamed: 0,Outlook,Temp,Humidity,Windy
0,Sunny,Cool,High,True


In [40]:
# Applying the enconders to each variable
for col in columns:
    df_test[col] = encoders[col].transform(df_test[col])
    
df_test

Unnamed: 0,Outlook,Temp,Humidity,Windy
0,2,0,0,1


In [41]:
# Predicting by using the model
prediction = nb.predict(df_test)
prediction

array([0])

In [42]:
# Transforming the results into a raw data
encoders["Play Golf"].inverse_transform(prediction)

array(['No'], dtype=object)

In [43]:
# Checking the probability result
prediction_proba = nb.predict_proba(df_test)
prediction_proba

array([[0.80106965, 0.19893035]])