<a href="https://colab.research.google.com/github/NugPath23/DIF62130_Kelas-B_25_2311531008/blob/main/praktikum3/BernoulliNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bernoulli Naive Bayes

**Step 1. Import library**

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

#load dataset
dataset_dict = {
    'Outlook': ['sunny', 'sunny', 'overcast', 'rainy', 'rainy', 'rainy', 'overcast', 'sunny', 'sunny', 'rainy', 'sunny', 'overcast', 'overcast', 'rainy', 'sunny', 'overcast', 'rainy', 'sunny', 'sunny', 'rainy', 'overcast', 'rainy', 'sunny', 'overcast', 'sunny', 'overcast', 'rainy', 'overcast'],
    'Temperature': [85.0, 80.0, 83.0, 70.0, 68.0, 65.0, 64.0, 72.0, 69.0, 75.0, 75.0, 72.0, 81.0, 71.0, 81.0, 74.0, 76.0, 78.0, 82.0, 67.0, 85.0, 73.0, 88.0, 77.0, 79.0, 80.0, 66.0, 84.0],
    'Humidity': [85.0, 90.0, 78.0, 96.0, 80.0, 70.0, 65.0, 95.0, 70.0, 80.0, 70.0, 90.0, 75.0, 80.0, 88.0, 92.0, 85.0, 75.0, 92.0, 90.0, 85.0, 88.0, 65.0, 70.0, 60.0, 95.0, 70.0, 78.0],
    'Wind': [False, True, False, False, False, True, True, False, False, False, True, True, False, True, True, False, False, True, False, True, True, False, True, False, False, True, False, False],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes']
}
df = pd.DataFrame(dataset_dict)

df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play
0,sunny,85.0,85.0,False,No
1,sunny,80.0,90.0,True,No
2,overcast,83.0,78.0,False,Yes
3,rainy,70.0,96.0,False,Yes
4,rainy,68.0,80.0,False,Yes


**Step 2. data preprocessing**

In [36]:
#one hot encode 'outlook' column
df = pd.get_dummies(df, columns=['Outlook'], prefix='', prefix_sep='', dtype=int)

#convert 'windy' (bool) and 'play' (binary) columns to binary indicators
df['Wind'] = df['Wind'].astype(int)
df['Play'] = (df['Play'] == 'Yes').astype(int)

df.head()

Unnamed: 0,Temperature,Humidity,Wind,Play,overcast,rainy,sunny
0,85.0,85.0,0,0,0,0,1
1,80.0,90.0,1,0,0,0,1
2,83.0,78.0,0,1,1,0,0
3,70.0,96.0,0,1,0,1,0
4,68.0,80.0,0,1,0,1,0


Encoding


In [37]:
#one hot encode the categorized columns and drop them after
#define cetegories for 'temperature' and 'humidity' for dataframe
df['Temperature'] = pd.cut(df['Temperature'], bins=[0, 80, 100], labels=['Warm', 'Hot'])
df['Humidity'] = pd.cut(df['Humidity'], bins=[0, 75, 100], labels=['Dry', 'Humid'])

#one hot encode the categorize columns
one_hot_columns = pd.get_dummies(df[['Temperature', 'Humidity']], drop_first=True, dtype=int)

# drop the categorized columns from df
df = df.drop(['Temperature', 'Humidity'], axis=1)

#concatenate the one-hot encoded columns with the original dataframes
df = pd.concat([df, one_hot_columns], axis=1)

df.head()

Unnamed: 0,Wind,Play,overcast,rainy,sunny,Temperature_Hot,Humidity_Humid
0,0,0,0,0,1,1,1
1,1,0,0,0,1,0,1
2,0,1,1,0,0,1,1
3,0,1,0,1,0,0,1
4,0,1,0,1,0,0,1


In [38]:
# split data into training and testing sets
X, y = df.drop(columns='Play'), df['Play']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=False)
#X_train.head()

**Main Mechanism**

- Hitung probabilitas setiap kelas dalam data pelatihan

- untuk setiap fitur dan kelas, hitung probabilitas fitur tersebut bernilai 1 dan 0 berdasarkan kelasnya

- untuk data baru: utk setiap kelas, kalikan probabilitasnya dengan probabilitas setiap nilai fitur (0 dan 1) untuk kelas tersebut

- prediksi kelas dengan probabilitas hasil tertinggi

# Training Steps

In [39]:
from fractions import Fraction

def calc_target_prob(attr):
  total_counts = attr.value_counts().sum()
  prob_series = attr.value_counts().apply(lambda x: Fraction(x, total_counts).limit_denominator())
  return prob_series

print(calc_target_prob(y_train))

Play
1    11/19
0     8/19
Name: count, dtype: object


**Feature Probability Calculation**

In [44]:
def hitung_rasio(kejadian, total):
  return kejadian / total

temp_one = hitung_rasio(X_train['Temperature_hot'].value_counts()[1], X_train.value_counts().sum())
print(f"Ratio of 1s in 'Temperature_hot' column: {temp_one}")
temp_zero = hitung_rasio(X_train['Temperature_hot'].value_counts()[0], X_train.value_counts().sum())
print(f"Ratio of 0s in 'Temperature_hot' column: {temp_zero}")
print(" ")

humid_one = hitung_rasio(X_train['Humidity_humid'].value_counts()[1], X_train.value_counts().sum())
print(f"Ratio of 1s in 'Humidity_hot' column: {humid_one}")
humid_zero = hitung_rasio(X_train['Humidity_humid'].value_counts()[0], X_train.value_counts().sum())
print(f"Ratio of 0s in 'Humidity_hot' column: {humid_zero}")
print(" ")

wind_one = hitung_rasio(X_train['Wind'].value_counts()[1], X_train.value_counts().sum())
print(f"Ratio of 1s in 'Wind' column: {wind_one}")
wind_zero = hitung_rasio(X_train['Wind'].value_counts()[0], X_train.value_counts().sum())
print(f"Ratio of 0s in 'Wind' column: {wind_zero}")
print(" ")

overcast_one = hitung_rasio(X_train['overcast'].value_counts()[1], X_train.value_counts().sum())
print(f"Ratio of 1s in 'overcast' column: {overcast_one}")
overcast_zero = hitung_rasio(X_train['overcast'].value_counts()[0], X_train.value_counts().sum())
print(f"Ratio of 0s in 'overcast' column: {overcast_zero}")
print(" ")

rainy_one = hitung_rasio(X_train['rainy'].value_counts()[1], X_train.value_counts().sum())
print(f"Ratio of 1s in 'rainy' column: {rainy_one}")
rainy_zero = hitung_rasio(X_train['rainy'].value_counts()[0], X_train.value_counts().sum())
print(f"Ratio of 0s in 'rainy' column: {rainy_zero}")
print(" ")

sunny_one = hitung_rasio(X_train['sunny'].value_counts()[1], X_train.value_counts().sum())
print(f"Ratio of 1s in 'sunny' column: {sunny_one}")
sunny_zero = hitung_rasio(X_train['sunny'].value_counts()[0], X_train.value_counts().sum())
print(f"Ratio of 0s in 'sunny' column: {sunny_zero}")

KeyError: 'Temperature_hot'