In [57]:
import numpy as np
import pandas as pd
from collections import defaultdict
import math


In [58]:
df = pd.read_csv('weather_forecast.csv')


In [59]:
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [60]:
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")


Rows: 14, Columns: 5


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Outlook      14 non-null     object
 1   Temperature  14 non-null     object
 2   Humidity     14 non-null     object
 3   Windy        14 non-null     object
 4   Play         14 non-null     object
dtypes: object(5)
memory usage: 688.0+ bytes


In [62]:
print("\nDistinct values for each feature:")
print("+--------------+----------------------+-------+")
print("| Features     | Distinct Values       | Count |")
print("+--------------+----------------------+-------+")

for col in df.columns[:-1]:
    unique_vals = df[col].unique()
    count = len(unique_vals)
    values = ", ".join(map(str, unique_vals))
    print(f"| {col.ljust(10)}   | {values.ljust(20)} | {count} |")

print("+------------+----------------------+-------+")



Distinct values for each feature:
+--------------+----------------------+-------+
| Features     | Distinct Values       | Count |
+--------------+----------------------+-------+
| Outlook      | Sunny, Overcast, Rain | 3 |
| Temperature   | Hot, Mild, Cool      | 3 |
| Humidity     | High, Normal         | 2 |
| Windy        | Weak, Strong         | 2 |
+------------+----------------------+-------+


In [63]:
print(df[df.columns[-1]].value_counts())

Play
Yes    9
No     5
Name: count, dtype: int64


In [64]:

def get_count(data, key_func):
    """Helper function to count occurrences based on a given key function."""
    counts = defaultdict(int)
    for item in data:
        counts[key_func(item)] += 1
    return counts

def NaiveBayes(df):
    N, cols = df.shape
    f_names = df.columns[:-1]  # All features except last column
    l_name = df.columns[-1]  # Last column is the target variable (label)

    # Compute class label prior probabilities
    labels = df[l_name].value_counts().to_dict()
    cl_p = {label: count / N for label, count in labels.items()}

    print("\nClass-label Prior Probabilities:")
    for label, count in labels.items():
        print(f"P({label}) = {count}/{N} = {round(count/N, 4)}")

    def fit():
        model = {}
        print("\nFeature-wise Conditional Probabilities (Likelihood):")

        for f_name in f_names:
            print(f"\nFeature: {f_name}")
            vals = df[[f_name, l_name]].values
            f_count = get_count(vals, lambda item: str(item[0]) + str(item[1]))  # String keys for dict

            for feature in df[f_name].unique():
                for label, count in labels.items():
                    aib = f_count.get(str(feature) + str(label), 0)  # Get count of feature given label
                    prob = aib / count if count != 0 else 0
                    model[(f_name, feature, label)] = prob
                    print(f"P({f_name} = {feature} | {l_name} = {label}) = {round(prob, 4)}")

        def predict(x_qs):
            sigma_cl = {}
            for label, prior in cl_p.items():
                sigma = math.log(prior)  # Start with prior probability
                for idx, x_q in enumerate(x_qs):
                    cp = model.get((f_names[idx], x_q, label), 1e-10)  # Avoid zero probability
                    sigma += math.log(cp)
                sigma_cl[label] = sigma
            return max(sigma_cl, key=sigma_cl.get)  # Return label with highest probability

        return predict

    return fit

# Load the dataset from CSV
df = pd.read_csv('weather_forecast.csv')

# Train Naïve Bayes Model
fit = NaiveBayes(df)
predict = fit()

# Example test case
test_sample = ['Rain', 'Cool']  # Modify based on your dataset's features
predicted_label = predict(test_sample)
print("\nPredicted Class Label:", predicted_label)



Class-label Prior Probabilities:
P(Yes) = 9/14 = 0.6429
P(No) = 5/14 = 0.3571

Feature-wise Conditional Probabilities (Likelihood):

Feature: Outlook
P(Outlook = Sunny | Play = Yes) = 0.2222
P(Outlook = Sunny | Play = No) = 0.6
P(Outlook = Overcast | Play = Yes) = 0.4444
P(Outlook = Overcast | Play = No) = 0.0
P(Outlook = Rain | Play = Yes) = 0.3333
P(Outlook = Rain | Play = No) = 0.4

Feature: Temperature
P(Temperature = Hot | Play = Yes) = 0.2222
P(Temperature = Hot | Play = No) = 0.4
P(Temperature = Mild | Play = Yes) = 0.4444
P(Temperature = Mild | Play = No) = 0.4
P(Temperature = Cool | Play = Yes) = 0.3333
P(Temperature = Cool | Play = No) = 0.2

Feature: Humidity
P(Humidity = High | Play = Yes) = 0.3333
P(Humidity = High | Play = No) = 0.8
P(Humidity = Normal | Play = Yes) = 0.6667
P(Humidity = Normal | Play = No) = 0.2

Feature: Windy
P(Windy = Weak | Play = Yes) = 0.6667
P(Windy = Weak | Play = No) = 0.4
P(Windy = Strong | Play = Yes) = 0.3333
P(Windy = Strong | Play = No) = 

In [47]:
print("\nQuery point:", query_point)
print("Output:", predicted_label)


Query point: ['Sunny', 'Cool', 'Low', 'Strong']
Output: Yes


In [53]:
import math
def calculate_log_probabilities(query_point):
    log_probs = {}

    for cls in class_counts.keys():
        log_prob = math.log(class_prob[cls])
        
        for feature, value in zip(features, query_point):
            prob = conditional_prob.get((feature, value, cls), 1e-1)
            log_prob += math.log(prob)
        
        log_probs[cls] = log_prob
    
    return log_probs

query_point = ['Sunny', 'Cool', 'Low', 'Strong']
log_probs = calculate_log_probabilities(query_point)

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No  | {query_point}) = {round(log_probs['No'], 4)}")
print("\nQuery point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Sunny', 'Cool', 'Low', 'Strong']) = -6.2691
P(Play = No  | ['Sunny', 'Cool', 'Low', 'Strong']) = -5.9225

Query point: ['Sunny', 'Cool', 'Low', 'Strong']
Output: No


In [56]:
query_point = ['Sunny', 'Cool', 'Low', 'Weak']

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No  | {query_point}) = {round(log_probs['No'], 4)}")
print("\nQuery point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] < log_probs['No'] else "No")


P(Play = Yes | ['Sunny', 'Cool', 'Low', 'Weak']) = -6.2691
P(Play = No  | ['Sunny', 'Cool', 'Low', 'Weak']) = -5.9225

Query point: ['Sunny', 'Cool', 'Low', 'Weak']
Output: Yes


In [86]:
query_point = ['Windy', 'Cool', 'High', 'Strong']
log_probs = calculate_log_probabilities(query_point)

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No | {query_point}) = {round(log_probs['No'], 4)}")
print("Query point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Windy', 'Cool', 'High', 'Strong']) = -5.8745
P(Play = No | ['Windy', 'Cool', 'High', 'Strong']) = -5.1244
Query point: ['Windy', 'Cool', 'High', 'Strong']
Output: No


In [87]:
query_point = ['Windy', 'Cool', 'Low', 'Strong']
log_probs = calculate_log_probabilities(query_point)

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No | {query_point}) = {round(log_probs['No'], 4)}")
print("Query point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Windy', 'Cool', 'Low', 'Strong']) = -7.2608
P(Play = No | ['Windy', 'Cool', 'Low', 'Strong']) = -6.7338
Query point: ['Windy', 'Cool', 'Low', 'Strong']
Output: No


In [88]:
query_point = ['Windy', 'Cool', 'High', 'Weak']
log_probs = calculate_log_probabilities(query_point)

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No | {query_point}) = {round(log_probs['No'], 4)}")
print("Query point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Windy', 'Cool', 'High', 'Weak']) = -5.3149
P(Play = No | ['Windy', 'Cool', 'High', 'Weak']) = -5.4121
Query point: ['Windy', 'Cool', 'High', 'Weak']
Output: Yes


In [67]:

features = df.columns[:-1]
target = df.columns[-1]

# Calculate prior probabilities
class_counts = df[target].value_counts().to_dict()
total_samples = len(df)
priors = {cls: log(count / total_samples) for cls, count in class_counts.items()}

# Calculate conditional probabilities
conditional_probs = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))

for feature in features:
    for cls in class_counts.keys():
        feature_counts = df[df[target] == cls][feature].value_counts().to_dict()
        total_cls = class_counts[cls]
        for value in feature_counts:
            conditional_probs[feature][value][cls] = log((feature_counts[value] + 1) / (total_cls + 2))  # Laplace smoothing

# Function to calculate log probabilities
def calculate_log_probabilities(query_point):
    log_probs = priors.copy()
    for feature, value in zip(features, query_point):
        for cls in class_counts.keys():
            log_probs[cls] += conditional_probs[feature][value].get(cls, log(1 / (class_counts[cls] + 2)))  # Handle missing values
    return log_probs

# Test Case: Humidity = Low, Wind = Weak
query_point = ['Sunny', 'Cool', 'Low', 'Weak']
log_probs = calculate_log_probabilities(query_point)

# Print Output
print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No | {query_point}) = {round(log_probs['No'], 4)}")
print("Query point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Sunny', 'Cool', 'Low', 'Weak']) = -5.6026
P(Play = No | ['Sunny', 'Cool', 'Low', 'Weak']) = -5.6352
Query point: ['Sunny', 'Cool', 'Low', 'Weak']
Output: Yes


In [76]:
query_point = ['Sunny', 'Cool', 'Low', 'Strong']
log_probs = calculate_log_probabilities(query_point)

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No | {query_point}) = {round(log_probs['No'], 4)}")
print("Query point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Sunny', 'Cool', 'Low', 'Strong']) = -6.1622
P(Play = No | ['Sunny', 'Cool', 'Low', 'Strong']) = -5.3475
Query point: ['Sunny', 'Cool', 'Low', 'Strong']
Output: No


In [77]:
query_point = ['Sunny', 'Cool', 'High', 'Strong']
log_probs = calculate_log_probabilities(query_point)

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No | {query_point}) = {round(log_probs['No'], 4)}")
print("Query point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Sunny', 'Cool', 'High', 'Strong']) = -4.7759
P(Play = No | ['Sunny', 'Cool', 'High', 'Strong']) = -3.7381
Query point: ['Sunny', 'Cool', 'High', 'Strong']
Output: No


In [78]:
query_point = ['Windy', 'Cool', 'High', 'Strong']
log_probs = calculate_log_probabilities(query_point)

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No | {query_point}) = {round(log_probs['No'], 4)}")
print("Query point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Windy', 'Cool', 'High', 'Strong']) = -5.8745
P(Play = No | ['Windy', 'Cool', 'High', 'Strong']) = -5.1244
Query point: ['Windy', 'Cool', 'High', 'Strong']
Output: No


In [79]:
query_point = ['Windy', 'Hot', 'High', 'Strong']
log_probs = calculate_log_probabilities(query_point)

print(f"P(Play = Yes | {query_point}) = {round(log_probs['Yes'], 4)}")
print(f"P(Play = No | {query_point}) = {round(log_probs['No'], 4)}")
print("Query point:", query_point)
print("Output:", "Yes" if log_probs['Yes'] > log_probs['No'] else "No")


P(Play = Yes | ['Windy', 'Hot', 'High', 'Strong']) = -6.1622
P(Play = No | ['Windy', 'Hot', 'High', 'Strong']) = -4.7189
Query point: ['Windy', 'Hot', 'High', 'Strong']
Output: No
