# In [None]:
#ASS17. Implement the Naïve Bayes algorithm from scratch to solve a real-world classification problem
#such as email spam detection, sentiment analysis, or disease diagnosis.

# ==========================================================
# Email Spam Detection using Naïve Bayes (Fully From Scratch)
# ==========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ----------------------------------------------------------
# 1. Load and Prepare Data
# ----------------------------------------------------------
df = pd.read_csv("email.csv")

# "Email No." and "Prediction" are excluded from the features; the target is
# re-attached as the last column so that a single dropna() removes every row
# with a missing feature or a missing label.
feature_frame = df.drop(columns=["Prediction", "Email No."])
data = pd.concat([feature_frame, df["Prediction"]], axis=1).dropna()

y = data["Prediction"].values
X = data.drop(columns=["Prediction"]).values

# Seeded shuffle of the row positions, then a 70/30 cut.
np.random.seed(42)
indices = np.arange(len(X))
np.random.shuffle(indices)
split = int(0.7 * len(X))
train_rows, test_rows = indices[:split], indices[split:]
X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

# ----------------------------------------------------------
# 2. Naïve Bayes Implementation (From Scratch)
# ----------------------------------------------------------
class NaiveBayesScratch:
    """Gaussian Naïve Bayes classifier written with NumPy only.

    For every class, each feature is modelled as an independent normal
    distribution whose mean and variance are estimated from the training
    rows of that class.
    """

    def fit(self, X, y):
        """Estimate class priors and per-class feature means and variances.

        Args:
            X: 2-D NumPy array, one row per sample, one column per feature.
            y: 1-D NumPy array of class labels aligned with the rows of X.
        """
        self.classes = np.unique(y)
        self.class_priors = {c: np.mean(y == c) for c in self.classes}
        self.mean = {}
        self.var = {}
        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = X_c.mean(axis=0)
            self.var[c] = X_c.var(axis=0) + 1e-9  # avoid zero variance

    def _pdf(self, class_idx, x):
        """Return the per-feature Gaussian density of x under class class_idx.

        Kept for callers that want raw densities; prediction itself uses
        _log_pdf because these values underflow to 0 for distant samples.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

    def _log_pdf(self, class_idx, x):
        """Return the per-feature Gaussian log-density of x under class_idx.

        Computed directly in log space, so no exp() is taken and nothing
        underflows however far x lies from the class mean.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def _predict_single(self, x):
        """Return the label with the highest log-posterior for one sample x."""
        eps = 1e-9
        posteriors = []
        for c in self.classes:
            prior = np.log(self.class_priors[c] + eps)
            # Summing exact log-densities keeps the distance information that
            # a probability floor applied after exp() would discard, so
            # far-away samples no longer tie between classes.
            likelihood = np.sum(self._log_pdf(c, x))
            posteriors.append(prior + likelihood)
        return self.classes[np.argmax(posteriors)]

    def predict(self, X):
        """Return an array with the predicted label for every row of X."""
        return np.array([self._predict_single(x) for x in X])

# ----------------------------------------------------------
# 3. Train and Predict
# ----------------------------------------------------------
nb_model = NaiveBayesScratch()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

# ----------------------------------------------------------
# 4. Evaluate Manually
# ----------------------------------------------------------
# Index the confusion matrix over every label that occurs in either the ground
# truth or the predictions; a label that is predicted but absent from y_test
# would otherwise raise KeyError in the counting loop.
classes = np.unique(list(y_test) + list(y_pred))
label_to_index = {label: i for i, label in enumerate(classes)}
cm = np.zeros((len(classes), len(classes)), dtype=int)

for actual, pred in zip(y_test, y_pred):
    cm[label_to_index[actual], label_to_index[pred]] += 1

# Per-class counts: diagonal = correct, column remainder = false positives,
# row remainder = false negatives.
tp = np.diag(cm)
fp = cm.sum(axis=0) - tp
fn = cm.sum(axis=1) - tp

if len(classes) == 2:
    # Binary case: report metrics for the second (positive) label only, and
    # take all three counts from that same class. Mixing the positive-class
    # tp with both classes' fp/fn makes precision and recall come out equal.
    tp, fp, fn = tp[1], fp[1], fn[1]

accuracy = np.trace(cm) / np.sum(cm)
# With more than two classes these are macro averages over the classes;
# the 1e-9 term guards against division by zero for empty classes.
precision = np.mean(tp / (tp + fp + 1e-9))
recall = np.mean(tp / (tp + fn + 1e-9))
f1 = 2 * precision * recall / (precision + recall + 1e-9)

print("\nConfusion Matrix (rows=actual, cols=predicted):")
print(classes)
print(cm)
print("\nEvaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-Score : {f1:.4f}")

# ----------------------------------------------------------
# 5. Visualize Confusion Matrix
# ----------------------------------------------------------
# Heatmap of the confusion matrix with the raw count written in every cell.
plt.figure(figsize=(5,4))
plt.imshow(cm, cmap="Blues")
plt.title("Confusion Matrix - Naïve Bayes Spam Classifier (From Scratch)")
plt.xlabel("Predicted Label")
plt.ylabel("Actual Label")
tick_positions = np.arange(len(classes))
plt.xticks(tick_positions, classes)
plt.yticks(tick_positions, classes)

for i, row_counts in enumerate(cm):
    for j, count in enumerate(row_counts):
        # White text on dark cells, black text on light ones.
        text_color = "white" if count > cm.max()/2 else "black"
        plt.text(j, i, count, ha="center", va="center", color=text_color)

plt.colorbar()
plt.tight_layout()
plt.show()


# ==============================================================
# Naïve Bayes Sentiment Analysis from Scratch + Visualization
# Dataset: sentiment.csv (columns: text, sentiment)
# ==============================================================

import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
import math
import matplotlib.pyplot as plt

# --------------------------------------------------------------
# 1. Load Dataset
# --------------------------------------------------------------
# Keep only the two columns the model uses, then discard incomplete and
# repeated rows.
df = (
    pd.read_csv("sentiment.csv", usecols=['text', 'sentiment'])
    .dropna()
    .drop_duplicates()
)

print("Dataset Loaded Successfully! Shape:", df.shape)
print("Unique Sentiments:", df['sentiment'].unique())

# --------------------------------------------------------------
# 2. Preprocess Function
# --------------------------------------------------------------
def preprocess(text):
    """Tokenize text into lower-case alphabetic words.

    The value is converted with str(), lower-cased, stripped of every
    character that is not a-z or whitespace, and split on whitespace.

    Args:
        text: Value to tokenize; any object accepted by str().

    Returns:
        list of str: The remaining tokens (possibly empty).
    """
    return re.sub(r'[^a-z\s]', '', str(text).lower()).split()

# --------------------------------------------------------------
# 3. Manual Train-Test Split (70–30)
# --------------------------------------------------------------
# Seeded permutation of row positions; the first 70% go to training.
np.random.seed(42)
indices = np.random.permutation(len(df))
split = int(0.7 * len(df))
train_idx = indices[:split]
test_idx = indices[split:]

train_data, test_data = df.iloc[train_idx], df.iloc[test_idx]

print(f"\nTrain size: {len(train_data)}, Test size: {len(test_data)}")

# --------------------------------------------------------------
# 4. Train Naïve Bayes Model (from scratch)
# --------------------------------------------------------------
def train_naive_bayes(data):
    """Gather the counts a word-count Naïve Bayes text model needs.

    Args:
        data: DataFrame with a 'text' column and a 'sentiment' label column.

    Returns:
        tuple: (vocab, class_word_counts, total_words, priors) where
            vocab is the set of all tokens seen,
            class_word_counts maps label -> Counter of token frequencies,
            total_words maps label -> total number of tokens for that label,
            priors maps label -> fraction of documents carrying that label.
    """
    vocab = set()
    class_counts = Counter()
    class_word_counts = defaultdict(Counter)

    # Walk the two columns in lockstep, one document at a time.
    for label, text in zip(data['sentiment'], data['text']):
        tokens = preprocess(text)
        class_counts[label] += 1
        class_word_counts[label].update(tokens)
        vocab.update(tokens)

    total_docs = sum(class_counts.values())
    priors = {}
    total_words = {}
    for label, doc_count in class_counts.items():
        priors[label] = doc_count / total_docs
        total_words[label] = sum(class_word_counts[label].values())

    return vocab, class_word_counts, total_words, priors

# Fit on the training split. The results are kept as module-level names
# because word_likelihood() and predict() below read them directly.
vocab, class_word_counts, total_words, priors = train_naive_bayes(train_data)
# Number of distinct training tokens; added to the denominator by the
# Laplace smoothing in word_likelihood().
vocab_size = len(vocab)

# --------------------------------------------------------------
# 5. Likelihood with Laplace Smoothing
# --------------------------------------------------------------
def word_likelihood(word, label):
    """Return the add-one smoothed estimate of P(word | label).

    Reads the module-level class_word_counts, total_words and vocab_size.

    Args:
        word: Token whose likelihood is wanted.
        label: Class label to condition on.

    Returns:
        float: (count of word in label + 1) / (tokens in label + vocab size).
    """
    smoothed_count = class_word_counts[label][word] + 1
    smoothed_total = total_words[label] + vocab_size
    return smoothed_count / smoothed_total

# --------------------------------------------------------------
# 6. Prediction Function
# --------------------------------------------------------------
def predict(text):
    """Return the label with the highest log-posterior score for text.

    Tokens that are not in the training vocabulary are ignored. Reads the
    module-level priors and vocab and calls word_likelihood().

    Args:
        text: Raw document; tokenized with preprocess().

    Returns:
        The best-scoring label among the keys of priors.
    """
    # The vocabulary filter does not depend on the label, so apply it once.
    known_words = [w for w in preprocess(text) if w in vocab]

    scores = {}
    for label, prior in priors.items():
        log_score = math.log(prior)
        for w in known_words:
            log_score += math.log(word_likelihood(w, label))
        scores[label] = log_score
    return max(scores, key=scores.get)

# --------------------------------------------------------------
# 7. Evaluate Model
# --------------------------------------------------------------
y_true = test_data['sentiment'].tolist()
y_pred = [predict(t) for t in test_data['text']]

# Use every label seen in the ground truth OR the predictions. The model can
# predict a class that happens to be missing from the test split, and that
# label would otherwise raise KeyError while filling the confusion matrix.
labels = sorted(set(y_true) | set(y_pred))
label_to_index = {label: i for i, label in enumerate(labels)}

# Confusion Matrix (rows = actual, columns = predicted)
cm = np.zeros((len(labels), len(labels)), dtype=int)
for actual, pred in zip(y_true, y_pred):
    cm[label_to_index[actual], label_to_index[pred]] += 1

# Accuracy / Precision / Recall / F1
correct = sum(a == b for a, b in zip(y_true, y_pred))
accuracy = correct / len(y_true)

precision, recall, f1 = {}, {}, {}
for label in labels:
    i = label_to_index[label]
    tp = cm[i, i]
    fp = cm[:, i].sum() - tp  # predicted as this label but actually another
    fn = cm[i, :].sum() - tp  # actually this label but predicted as another
    # The 1e-9 term keeps the ratios finite for labels with no predictions
    # or no actual samples.
    precision[label] = tp / (tp + fp + 1e-9)
    recall[label] = tp / (tp + fn + 1e-9)
    f1[label] = 2 * precision[label] * recall[label] / (precision[label] + recall[label] + 1e-9)

print("\nConfusion Matrix (rows=actual, cols=predicted):")
print(labels)
print(cm)
print(f"\nAccuracy: {accuracy:.4f}")
for label in labels:
    # str() keeps the report working when labels are not strings.
    print(f"{str(label).capitalize()} → Precision: {precision[label]:.4f}, Recall: {recall[label]:.4f}, F1: {f1[label]:.4f}")

# --------------------------------------------------------------
# 8. Visualization
# --------------------------------------------------------------

# (A) Confusion Matrix Heatmap
plt.figure(figsize=(6,5))
plt.imshow(cm, cmap="Blues")
plt.title("Confusion Matrix — Naïve Bayes (From Scratch)")
plt.xlabel("Predicted Label")
plt.ylabel("Actual Label")
tick_positions = np.arange(len(labels))
plt.xticks(tick_positions, labels)
plt.yticks(tick_positions, labels)

# Write the count into each cell; white text on dark cells, black on light.
for i, row_counts in enumerate(cm):
    for j, count in enumerate(row_counts):
        text_color = "white" if count > cm.max()/2 else "black"
        plt.text(j, i, count, ha="center", va="center", color=text_color)
plt.colorbar()
plt.show()

# (B) Class Distribution in Training Data
plt.figure(figsize=(5,4))
train_counts = train_data['sentiment'].value_counts()
train_counts.plot(kind='bar', color=['skyblue','salmon','lightgreen'])
plt.title("Training Data Sentiment Distribution")
plt.ylabel("Count")
plt.xlabel("Sentiment Class")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


# ==========================================================
# Disease Diagnosis using Naïve Bayes (Implemented From Scratch)
# ==========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ----------------------------------------------------------
# 1. Load Dataset
# ----------------------------------------------------------
df = pd.read_csv("disease.csv")
print("Dataset Loaded Successfully! Shape:", df.shape)
print("\nFirst 5 Rows:\n", df.head())

# ----------------------------------------------------------
# 2. Data Preprocessing
# ----------------------------------------------------------
# Split Blood Pressure into Systolic/Diastolic.
# errors='coerce' turns unparseable readings into NaN; those rows are removed
# further down, after the categorical encoding.
bp_split = df["Blood_Pressure_mmHg"].str.split("/", expand=True)
df["BP_Systolic"] = pd.to_numeric(bp_split[0], errors='coerce')
df["BP_Diastolic"] = pd.to_numeric(bp_split[1], errors='coerce')

# Drop irrelevant columns
df = df.drop(columns=["Patient_ID", "Blood_Pressure_mmHg", "Treatment_Plan"])

# Encode all categorical columns automatically.
# The string-dtype check additionally covers pandas configurations where text
# columns are not stored with the "object" dtype.
for col in df.columns:
    if df[col].dtype == "object" or pd.api.types.is_string_dtype(df[col]):
        df[col] = df[col].astype("category").cat.codes

# Every column is numeric at this point. Remove rows that still contain NaN
# (for example a coerced blood-pressure reading): one NaN in a feature makes
# that feature's class mean and variance NaN, which turns every posterior
# into NaN. Missing categorical values were already encoded as -1 above and
# are therefore kept.
df = df.dropna()

# Separate features and target
X = df.drop(columns=["Diagnosis"]).values
y = df["Diagnosis"].values

# ----------------------------------------------------------
# 3. Manual Train/Test Split (70/30)
# ----------------------------------------------------------
# Seeded shuffle of the row positions, then a 70/30 cut.
np.random.seed(42)
indices = np.arange(len(X))
np.random.shuffle(indices)
split = int(0.7 * len(X))
X_train, X_test = X[indices[:split]], X[indices[split:]]
y_train, y_test = y[indices[:split]], y[indices[split:]]

# ----------------------------------------------------------
# 4. Naïve Bayes Implementation (From Scratch)
# ----------------------------------------------------------
class NaiveBayesScratch:
    """Gaussian Naïve Bayes classifier written with NumPy only.

    For every class, each feature is modelled as an independent normal
    distribution whose mean and variance are estimated from the training
    rows of that class.
    """

    def fit(self, X, y):
        """Estimate class priors and per-class feature means and variances.

        Args:
            X: 2-D NumPy array, one row per sample, one column per feature.
            y: 1-D NumPy array of class labels aligned with the rows of X.
        """
        self.classes = np.unique(y)
        self.class_priors = {c: np.mean(y == c) for c in self.classes}
        self.mean = {}
        self.var = {}
        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = X_c.mean(axis=0)
            self.var[c] = X_c.var(axis=0) + 1e-9  # avoid zero variance

    def _pdf(self, class_idx, x):
        """Return the per-feature Gaussian density of x under class class_idx.

        Kept for callers that want raw densities; prediction itself uses
        _log_pdf because these values underflow to 0 for distant samples.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

    def _log_pdf(self, class_idx, x):
        """Return the per-feature Gaussian log-density of x under class_idx.

        Computed directly in log space, so no exp() is taken and nothing
        underflows however far x lies from the class mean.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def _predict_single(self, x):
        """Return the label with the highest log-posterior for one sample x."""
        eps = 1e-9
        posteriors = []
        for c in self.classes:
            prior = np.log(self.class_priors[c] + eps)
            # Summing exact log-densities keeps the distance information that
            # a probability floor applied after exp() would discard, so
            # far-away samples no longer tie between classes.
            likelihood = np.sum(self._log_pdf(c, x))
            posteriors.append(prior + likelihood)
        return self.classes[np.argmax(posteriors)]

    def predict(self, X):
        """Return an array with the predicted label for every row of X."""
        return np.array([self._predict_single(x) for x in X])

# ----------------------------------------------------------
# 5. Train and Predict
# ----------------------------------------------------------
nb_model = NaiveBayesScratch()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

# ----------------------------------------------------------
# 6. Evaluate Manually
# ----------------------------------------------------------
# Index the confusion matrix over every label that occurs in either the ground
# truth or the predictions; a diagnosis that is predicted but absent from the
# test split would otherwise raise KeyError in the counting loop.
classes = np.unique(list(y_test) + list(y_pred))
label_to_index = {label: i for i, label in enumerate(classes)}
cm = np.zeros((len(classes), len(classes)), dtype=int)

for actual, pred in zip(y_test, y_pred):
    cm[label_to_index[actual], label_to_index[pred]] += 1

# Per-class counts: diagonal = correct, column remainder = false positives,
# row remainder = false negatives.
tp = np.diag(cm)
fp = cm.sum(axis=0) - tp
fn = cm.sum(axis=1) - tp

accuracy = np.trace(cm) / np.sum(cm)
# Precision and recall are averaged over the classes; the 1e-9 term guards
# against division by zero for classes with no predictions or no samples.
precision = np.mean(tp / (tp + fp + 1e-9))
recall = np.mean(tp / (tp + fn + 1e-9))
f1 = 2 * precision * recall / (precision + recall + 1e-9)

print("\nConfusion Matrix:")
print(cm)
print("\nEvaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-Score : {f1:.4f}")

# ----------------------------------------------------------
# 7. Visualize Confusion Matrix
# ----------------------------------------------------------
# Heatmap of the confusion matrix with the raw count written in every cell.
plt.figure(figsize=(5,4))
plt.imshow(cm, cmap="Blues")
plt.title("Confusion Matrix - Naïve Bayes Disease Diagnosis (From Scratch)")
plt.xlabel("Predicted Label")
plt.ylabel("Actual Label")
tick_positions = np.arange(len(classes))
plt.xticks(tick_positions, classes)
plt.yticks(tick_positions, classes)

for i, row_counts in enumerate(cm):
    for j, count in enumerate(row_counts):
        # White text on dark cells, black text on light ones.
        text_color = "white" if count > cm.max()/2 else "black"
        plt.text(j, i, count, ha="center", va="center", color=text_color)

plt.colorbar()
plt.tight_layout()
plt.show()

# KeyboardInterrupt: