In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from math import log

# 1. Fetch the raw records; the file has no header row, so names are supplied.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
columns = [
    'class',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
df = pd.read_csv(url, names=columns)

print(df)

# 2. Drop the rows whose 'stalk-root' is the '?' placeholder.
df = df.loc[df['stalk-root'].ne('?')]

# 3. Integer-encode every column by hand.
# Codes follow the order in which each category first appears in the column.
label_encoders = {}
encoded_columns = {}
for col in df.columns:
    unique_vals = df[col].unique()
    mapping = dict(zip(unique_vals, range(len(unique_vals))))
    label_encoders[col] = mapping
    encoded_columns[col] = df[col].map(mapping)
# Built from Series, so the filtered frame's row index is carried over.
encoded_df = pd.DataFrame(encoded_columns)

# 4. Separate the feature matrix from the target vector.
y_all = encoded_df['class'].to_numpy()
X_all = encoded_df.drop(columns='class').to_numpy()
classes = np.unique(y_all)
n_classes = len(classes)

# 5. Naive Bayes Training and Prediction
def train_naive_bayes(X_train, y_train):
    """Fit a categorical Naive Bayes model with add-one (Laplace) smoothing.

    The set of class labels is taken from ``y_train`` itself (in sorted
    order), so the function does not rely on any module-level state.

    Args:
        X_train: 2-D array of encoded categorical features, one row per sample.
        y_train: 1-D sequence of class labels, same length as ``X_train``.

    Returns:
        A tuple ``(priors, conditionals, class_counts, feature_values)``:
          * ``priors[c]`` -- fraction of training samples with label ``c``.
          * ``conditionals[c][j][v]`` -- smoothed estimate
            ``(count + 1) / (class_counts[c] + k_j)`` where ``k_j`` is the
            number of distinct values seen for feature ``j``.
          * ``class_counts[c]`` -- number of training samples with label ``c``.
          * ``feature_values[j]`` -- set of values seen for feature ``j``.

    Raises:
        ValueError: if the training set is empty or ``X_train`` and
            ``y_train`` differ in length.
    """
    n_samples = len(X_train)
    if n_samples == 0:
        raise ValueError("train_naive_bayes requires at least one training sample")
    if len(y_train) != n_samples:
        raise ValueError(
            f"X_train has {n_samples} rows but y_train has {len(y_train)} labels"
        )

    n_features = X_train.shape[1]
    class_counts = defaultdict(int)
    # Flat counter keyed by (label, feature index, value).
    feature_counts = defaultdict(int)
    feature_values = [set() for _ in range(n_features)]

    for row, label in zip(X_train, y_train):
        class_counts[label] += 1
        for j, val in enumerate(row):
            feature_counts[label, j, val] += 1
            feature_values[j].add(val)

    # Sorted so the class order (and therefore tie-breaking at prediction
    # time) is deterministic.
    labels = sorted(class_counts)
    priors = {c: class_counts[c] / n_samples for c in labels}

    conditionals = {}
    for c in labels:
        total_c = class_counts[c]
        conditionals[c] = {
            j: {
                # .get avoids inserting zero entries into the counter.
                xj: (feature_counts.get((c, j, xj), 0) + 1) / (total_c + len(seen))
                for xj in seen
            }
            for j, seen in enumerate(feature_values)
        }
    return priors, conditionals, class_counts, feature_values

# 6. Predict function
def predict(X, priors, conditionals, class_counts, feature_values):
    """Predict the most probable class for each row of ``X``.

    Scores are accumulated in log space. The candidate classes are the keys
    of ``priors`` (in their stored order), so the function only depends on
    the model passed in. Classes whose prior is zero are never predicted;
    taking ``log(0)`` for them would raise ``ValueError``.

    Args:
        X: 2-D array of encoded categorical features, one row per sample.
        priors: mapping ``class -> prior probability``.
        conditionals: mapping ``class -> feature index -> value -> probability``.
        class_counts: mapping ``class -> number of training samples``.
        feature_values: per-feature collection of values seen in training;
            only its length is used, for smoothing unseen values.

    Returns:
        ``numpy.ndarray`` holding one predicted class label per row of ``X``.

    Raises:
        ValueError: if no class in ``priors`` has a positive prior.
    """
    n_features = X.shape[1]

    # Precompute, per usable class, the log prior and the smoothed fallback
    # probability for a value never seen in training for each feature.
    class_models = []
    for c, prior in priors.items():
        if prior <= 0:
            continue
        fallbacks = [
            1 / (class_counts[c] + len(feature_values[j]))
            for j in range(n_features)
        ]
        class_models.append((c, log(prior), conditionals[c], fallbacks))
    if not class_models:
        raise ValueError("predict requires at least one class with a positive prior")

    predictions = []
    for instance in X:
        log_posteriors = {}
        for c, log_prior, cond_c, fallbacks in class_models:
            log_prob = log_prior
            for j in range(n_features):
                log_prob += log(cond_c[j].get(instance[j], fallbacks[j]))
            log_posteriors[c] = log_prob
        # max() returns the first maximal key, so ties go to the earliest class.
        predictions.append(max(log_posteriors, key=log_posteriors.get))
    return np.array(predictions)

# Task 1: train and evaluate on every feature column.
# NOTE(review): the split is sequential (first 60% train, rest test) with no
# shuffling; if the file's row order is not random this biases the reported
# accuracy -- confirm this is intended.
split_idx = int(0.6 * X_all.shape[0])
X_train_all, X_test_all = np.split(X_all, [split_idx])
y_train_all, y_test_all = np.split(y_all, [split_idx])

priors_all, conds_all, counts_all, values_all = train_naive_bayes(
    X_train_all, y_train_all
)
y_pred_all = predict(
    X_test_all, priors_all, conds_all, counts_all, values_all
)
accuracy_all = (y_pred_all == y_test_all).mean()
print("Task 1 Accuracy (All Features):", accuracy_all)

# Task 2: repeat the experiment on a hand-picked subset of 10 features.
selected_features = [
    'odor',
    'gill-color',
    'spore-print-color',
    'ring-type',
    'gill-size',
    'bruises',
    'population',
    'habitat',
    'cap-color',
    'cap-surface',
]
X_10 = encoded_df[selected_features].to_numpy()
y_10 = encoded_df['class'].to_numpy()

split_idx = int(0.6 * X_10.shape[0])
X_train_10, X_test_10 = np.split(X_10, [split_idx])
y_train_10, y_test_10 = np.split(y_10, [split_idx])

priors_10, conds_10, counts_10, values_10 = train_naive_bayes(
    X_train_10, y_train_10
)
y_pred_10 = predict(X_test_10, priors_10, conds_10, counts_10, values_10)
accuracy_10 = (y_pred_10 == y_test_10).mean()
print("Task 2 Accuracy (10 Features):", accuracy_10)


     class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0        p         x           s         n       t    p               f   
1        e         x           s         y       t    a               f   
2        e         b           s         w       t    l               f   
3        p         x           y         w       t    p               f   
4        e         x           s         g       f    n               f   
...    ...       ...         ...       ...     ...  ...             ...   
8119     e         k           s         n       f    n               a   
8120     e         x           s         n       f    n               a   
8121     e         f           s         n       f    n               a   
8122     p         k           y         n       f    y               f   
8123     e         x           s         n       f    n               a   

     gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0               c         n