Import Statements

In [None]:
from sklearn.naive_bayes import CategoricalNB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import cross_val_score, KFold
import random
from CompletedMLAlgorithms.NaiveBayes import CategoricalNaiveBayes

Create the Dataset

In [29]:
# Define the possible values for each column
weather_values = ["Sunny", "Overcast", "Rain"]
temp_values = ["Hot", "Mild", "Cool"]
humidity_values = ["High", "Normal"]
windy_values = ["True", "False"]

# Set the desired number of samples
num_samples = 5000

# Generate random data for features
weather_data = random.choices(weather_values, k=num_samples)
temp_data = random.choices(temp_values, k=num_samples)
humidity_data = random.choices(humidity_values, k=num_samples)
windy_data = random.choices(windy_values, k=num_samples)

# Create the target variable 'Play' with a clear logical relationship
# A good relationship for Naive Bayes would be:
# - If it's Overcast, Play is always 'Yes'.
# - If it's Rainy and not Windy, Play is 'Yes'.
# - Otherwise, it's a mix of 'Yes' and 'No' to create some variance.

play_data = []
for i in range(num_samples):
    if weather_data[i] == "Overcast":
        play_data.append("Yes")
    elif weather_data[i] == "Rain" and windy_data[i] == "False":
        play_data.append("Yes")
    elif weather_data[i] == "Sunny" and windy_data[i] == "False":
        play_data.append("No")
    else:
        play_data.append(random.choice(["Yes", "No"]))

# Create the DataFrame
df = pd.DataFrame(
    {
        "Weather": weather_data,
        "Temperature": temp_data,
        "Humidity": humidity_data,
        "Windy": windy_data,
        "Play": play_data,
    }
)

In [30]:
encoder = OrdinalEncoder(dtype=int)
df_1 = encoder.fit_transform(df)
df = pd.DataFrame(data=df_1, columns=df.columns)

Seperate the Train and Test Data

In [31]:
# Train-test split
df, testdf = train_test_split(df, test_size=0.2, random_state=41, stratify=df["Play"])

# stratify=df["Play"]

df_target = df["Play"]
testdf_target = testdf["Play"]

df = df.drop(columns=["Play"])
testdf = testdf.drop(columns=["Play"])

print("=== TRAIN DATA ===")
print(df)

print("\n=== TEST DATA ===")
print(testdf)

=== TRAIN DATA ===
      Weather  Temperature  Humidity  Windy
3448        1            0         1      0
3188        1            2         1      1
1964        0            2         1      0
1758        2            1         0      0
3242        0            2         0      1
...       ...          ...       ...    ...
1929        1            1         0      0
4212        1            0         1      1
2576        0            1         0      0
108         2            0         0      1
1439        1            2         0      0

[4000 rows x 4 columns]

=== TEST DATA ===
      Weather  Temperature  Humidity  Windy
3684        0            1         0      0
3846        0            1         1      0
2475        1            0         0      1
721         0            0         1      1
4743        0            2         0      1
...       ...          ...       ...    ...
743         1            0         0      1
3676        0            2         0      1
1258        2

Cross-Validation

In [32]:
model1 = CategoricalNB(alpha=1)

# cv = KFold(n_splits=5, shuffle=True, random_state=69)
# print(cross_val_score(model1, df, df_target, cv=cv, scoring="accuracy"))

Final Prediction

In [33]:
model1.fit(df, df_target)

In [34]:
model2 = CategoricalNaiveBayes(smoothing_param=1)
model2.fit(df, df_target)

In [35]:
import numpy as np

print(accuracy_score(model1.predict(testdf), testdf_target))
print(f1_score(model1.predict(testdf), testdf_target))
print(roc_auc_score(model1.predict(testdf), testdf_target))

print(accuracy_score(model2.predict(testdf), testdf_target))
print(f1_score(model2.predict(testdf), testdf_target))
print(roc_auc_score(model2.predict(testdf), testdf_target))


print(np.unique(model2.predict(testdf), return_counts=True))

print(model2.lookup_table)

print(len(df))

0.839
0.8766283524904215
0.8190993788819876
0.839
0.8766283524904215
0.8190993788819876
(array([0, 1]), array([356, 644]))
[[[np.float64(0.0007347538574577516), np.float64(0.2637766348273328), np.float64(0.7354886113152094)], [np.float64(0.49073724007561437), np.float64(0.37353497164461247), np.float64(0.13572778827977316)]], [[np.float64(0.334313005143277), np.float64(0.328434974283615), np.float64(0.337252020573108)], [np.float64(0.3285444234404537), np.float64(0.33119092627599245), np.float64(0.34026465028355385)]], [[np.float64(0.49264705882352944), np.float64(0.5073529411764706)], [np.float64(0.4977307110438729), np.float64(0.5022692889561271)]], [[np.float64(0.4904411764705882), np.float64(0.5095588235294117)], [np.float64(0.48411497730711045), np.float64(0.5158850226928896)]]]
4000


In [36]:
print(model1.predict(testdf))
print(accuracy_score(model2.predict(testdf), testdf_target))
print(model2.predict(testdf))
print(testdf_target.to_numpy())

[1 1 1 1 1 0 1 0 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 0 1 1 0
 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 0
 1 1 1 0 0 1 0 1 1 0 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0 1 0 1 1 0 0 1 0 0 1 1 0 1 1 0 1 1 0 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1
 1 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1 1 0 0 0 1 1
 1 0 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 0 1
 0 0 1 1 0 0 0 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 1 0 0 1
 0 1 1 0 1 1 0 0 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1
 1 0 0 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 0 0 0 0 1 0 0 0 1 0 0 0
 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1
 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 0 0
 0 0 1 0 0 0 0 0 1 1 0 1 