In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# --- Cell 1: Load and Split Data ---

# Only these two newsgroups are requested, so the task is a two-class problem.
categories = ['sci.med', 'sci.space']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Documents, their labels, and the class names used later in the report.
X, y, target_names = data.data, data.target, data.target_names

# 70/30 train/test split; the fixed random_state keeps it repeatable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# --- Cell 2: Vectorize Text Data ---

# Word-count features with English stop words dropped. The vectorizer is
# fitted on the training documents and then applied unchanged to the test
# documents, so both matrices share the same columns.
vectorizer = CountVectorizer(stop_words='english')
train_counts = vectorizer.fit_transform(X_train_text)
test_counts = vectorizer.transform(X_test_text)

# The cells below operate on dense arrays, so both matrices are expanded here.
# NOTE(review): dense (n_documents x n_features) arrays can be large for a
# text vocabulary -- confirm the memory cost is acceptable for this corpus.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

# --- Cell 3: Train Naive Bayes ---

alpha = 1.0  # additive (Laplace) smoothing strength
n_samples, n_features = X_train.shape
classes_ = np.unique(y_train)
n_classes = len(classes_)

# One log-prior per class and one row of per-word log-probabilities per class.
class_log_prior_ = np.zeros(n_classes)
feature_log_prob_ = np.zeros((n_classes, n_features))

for row, label in enumerate(classes_):
    in_class = y_train == label

    # log(P(c)): share of training documents that carry this label.
    class_log_prior_[row] = np.log(in_class.sum() / n_samples)

    # How often each vocabulary word occurs across this class's documents.
    per_word_counts = X_train[in_class].sum(axis=0)

    # log(P(w|c)) with add-alpha smoothing: every word gets `alpha` extra
    # counts, so the class total grows by alpha * n_features.
    smoothed_counts = per_word_counts + alpha
    smoothed_total = per_word_counts.sum() + alpha * n_features
    feature_log_prob_[row] = np.log(smoothed_counts / smoothed_total)

# --- Cell 4: Predict on Test Data ---

# Score every (test document, class) pair in log space:
#   log(P(c|d)) = log(P(c)) + sum_w( count(w,d) * log(P(w|c)) )
# The matrix product supplies the sum over words; the prior is broadcast
# across rows.
log_likelihood = X_test @ feature_log_prob_.T
log_probs = log_likelihood + class_log_prior_

# Each document is assigned the class whose column has the highest score.
best_class_idx = log_probs.argmax(axis=1)
y_pred = classes_[best_class_idx]

# --- Cell 5: Evaluate Performance ---


def _safe_divide(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields 0.0 where the denominator is 0.

    Plain division produces NaN (plus a RuntimeWarning) for a class that was
    never predicted (precision), has no true samples (recall), or has both
    precision and recall equal to zero (F1). Reporting 0.0 keeps the averages
    below finite.

    Both arguments must have the same shape; the result is a float array of
    that shape.
    """
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )


# Calculate Accuracy
accuracy = np.mean(y_test == y_pred)
print("--- Model Evaluation Metrics ---")
print(f"Accuracy: {accuracy:.4f}")

# Calculate Confusion Matrix
# The report cell looks metrics up by position in `target_names`, so the
# matrix gets one row/column per entry of `target_names` (label k <-> row k).
# Deriving the label set from y_test alone would fail with an IndexError when
# a predicted label is absent from y_test, and would shift rows relative to
# `target_names` when a class is missing from y_test.
cm_classes = np.arange(len(target_names))
n_cm_classes = len(cm_classes)
cm = np.zeros((n_cm_classes, n_cm_classes), dtype=int)

# Rows are true labels, columns are predicted labels. np.add.at accumulates
# correctly when the same (true, predicted) pair occurs many times, which a
# plain fancy-indexed `+=` would not.
np.add.at(cm, (np.asarray(y_test), np.asarray(y_pred)), 1)

print("\n--- Confusion Matrix ---")
print(cm)

# Calculate Classification Report
true_positives = np.diag(cm)
precision = _safe_divide(true_positives, np.sum(cm, axis=0))
recall = _safe_divide(true_positives, np.sum(cm, axis=1))
f1_score = _safe_divide(2 * (precision * recall), precision + recall)
support = np.sum(cm, axis=1)

# Guard the scalar ratios as well so an empty evaluation set gives 0.0, not NaN.
total_support = np.sum(support)
overall_accuracy = np.sum(true_positives) / total_support if total_support else 0.0

# Macro averages: every class counts equally.
avg_precision = np.mean(precision)
avg_recall = np.mean(recall)
avg_f1 = np.mean(f1_score)

# Weighted averages: every class counts in proportion to its support.
weighted_precision = np.sum(precision * support) / total_support if total_support else 0.0
weighted_recall = np.sum(recall * support) / total_support if total_support else 0.0
weighted_f1 = np.sum(f1_score * support) / total_support if total_support else 0.0

# --- Cell 6: Display Classification Report ---

# Size the label column to the longest row label. A fixed width of 10 is too
# narrow for 'weighted avg' (12 characters), which pushed that row's numbers
# out of line with the column headers; a class name longer than 10 characters
# would do the same.
summary_labels = ['accuracy', 'macro avg', 'weighted avg']
label_width = max(10, *(len(str(label)) for label in [*target_names, *summary_labels]))
report_total = np.sum(support)

report_lines = [
    "",
    "--- Classification Report ---",
    "",
    f"{'':>{label_width}} {'precision':>10} {'recall':>10} {'f1-score':>10} {'support':>10}",
    "",
]

# One row per class; metrics are looked up by the class's position in target_names.
for i, name in enumerate(target_names):
    report_lines.append(
        f"{name:>{label_width}} {precision[i]:10.4f} {recall[i]:10.4f} {f1_score[i]:10.4f} {support[i]:10}"
    )

# Summary rows. Accuracy is a single number, so its precision/recall cells
# stay blank and the value sits under the f1-score column.
report_lines.append("")
report_lines.append(
    f"{'accuracy':>{label_width}} {'':>10} {'':>10} {overall_accuracy:10.4f} {report_total:10}"
)
report_lines.append(
    f"{'macro avg':>{label_width}} {avg_precision:10.4f} {avg_recall:10.4f} {avg_f1:10.4f} {report_total:10}"
)
report_lines.append(
    f"{'weighted avg':>{label_width}} {weighted_precision:10.4f} {weighted_recall:10.4f} {weighted_f1:10.4f} {report_total:10}"
)
report_lines.append("")  # keeps the trailing newline the report always ended with

report = "\n".join(report_lines)

print(report)



In [2]:
# Prints the literal string "henllo".
# NOTE(review): this looks like a leftover scratch/test cell unrelated to the
# analysis above (possibly a typo for "hello") -- consider deleting it before
# sharing the notebook.
print("henllo")

henllo
