17

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# --- Cell 1: Load and Split Data ---
# Two-category subset of the 20 Newsgroups corpus. The fixed random_state
# values make both the shuffle and the split below repeatable.

categories = ['sci.med', 'sci.space']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

X, y, target_names = data.data, data.target, data.target_names

# Hold out 30% of the documents for evaluation.
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_text, X_test_text, y_train, y_test = split

# --- Cell 2: Vectorize Text Data ---
# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training documents only and reused for the test documents.

vectorizer = CountVectorizer(stop_words='english')
train_counts = vectorizer.fit_transform(X_train_text)
test_counts = vectorizer.transform(X_test_text)

# Convert the sparse count matrices to dense ndarrays for the code below.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

# --- Cell 3: Train Naive Bayes ---
# Multinomial Naive Bayes fitted by hand on the dense count matrix.

alpha = 1.0  # Additive (Laplace) smoothing strength
n_samples, n_features = X_train.shape
classes_ = np.unique(y_train)
n_classes = len(classes_)

class_log_prior_ = np.zeros(n_classes)
feature_log_prob_ = np.zeros((n_classes, n_features))

for row, label in enumerate(classes_):
    class_docs = X_train[y_train == label]

    # log P(c): share of the training documents that carry this label.
    class_log_prior_[row] = np.log(class_docs.shape[0] / n_samples)

    # Per-word counts inside this class and their grand total.
    per_word_counts = class_docs.sum(axis=0)
    total_count = class_docs.sum()

    # log P(w|c) = log((count(w, c) + alpha) / (total(c) + alpha * n_features))
    smoothed = (per_word_counts + alpha) / (total_count + alpha * n_features)
    feature_log_prob_[row, :] = np.log(smoothed)

# --- Cell 4: Predict on Test Data ---

# Unnormalised log posterior for every (document, class) pair:
#   log P(c) + sum_w( count(w, d) * log P(w|c) )
log_likelihood = X_test @ feature_log_prob_.T
log_probs = log_likelihood + class_log_prior_

# For each document keep the class whose score is largest.
best_class_idx = log_probs.argmax(axis=1)
y_pred = classes_[best_class_idx]

# --- Cell 5: Evaluate Performance ---

# Accuracy: fraction of test samples whose prediction equals the true label.
accuracy = np.mean(y_test == y_pred)
print("--- Model Evaluation Metrics ---")
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix: rows are true classes, columns are predicted classes.
# The label set is the union of true and predicted labels, so a predicted
# class that is absent from y_test still gets a column instead of making the
# index lookup fail.
cm_classes = np.union1d(y_test, y_pred)
n_cm_classes = len(cm_classes)
cm = np.zeros((n_cm_classes, n_cm_classes), dtype=int)

# cm_classes is sorted and contains every label, so searchsorted yields the
# exact row/column of each label; np.add.at accumulates repeated index pairs.
true_idx = np.searchsorted(cm_classes, y_test)
pred_idx = np.searchsorted(cm_classes, y_pred)
np.add.at(cm, (true_idx, pred_idx), 1)

print("\n--- Confusion Matrix ---")
print(cm)

# Per-class metrics. Ratios with a zero denominator (class never predicted or
# never present) are reported as 0.0 instead of NaN with a RuntimeWarning.
true_positives = np.diag(cm)
predicted_totals = np.sum(cm, axis=0)
support = np.sum(cm, axis=1)

precision = np.divide(true_positives, predicted_totals,
                      out=np.zeros(n_cm_classes), where=predicted_totals != 0)
recall = np.divide(true_positives, support,
                   out=np.zeros(n_cm_classes), where=support != 0)
precision_plus_recall = precision + recall
f1_score = np.divide(2 * (precision * recall), precision_plus_recall,
                     out=np.zeros(n_cm_classes), where=precision_plus_recall != 0)

overall_accuracy = np.sum(true_positives) / np.sum(cm)

# Macro averages: unweighted mean over classes.
avg_precision = np.mean(precision)
avg_recall = np.mean(recall)
avg_f1 = np.mean(f1_score)

# Weighted averages: mean over classes weighted by each class's support.
weighted_precision = np.sum(precision * support) / np.sum(support)
weighted_recall = np.sum(recall * support) / np.sum(support)
weighted_f1 = np.sum(f1_score * support) / np.sum(support)

# --- Cell 6: Display Classification Report ---

# Size the label column to the longest row label. A fixed width of 10 is too
# narrow for 'weighted avg' (12 characters), which pushed that row's numbers
# out of line with the rows above it.
summary_labels = ['accuracy', 'macro avg', 'weighted avg']
label_width = max(len(str(label)) for label in [*target_names, *summary_labels])
total_support = np.sum(support)

report = "\n--- Classification Report ---\n\n"
report += f"{'':>{label_width}} {'precision':>10} {'recall':>10} {'f1-score':>10} {'support':>10}\n\n"

# One row per class, in target_names order.
for i, name in enumerate(target_names):
    report += f"{name:>{label_width}} {precision[i]:10.4f} {recall[i]:10.4f} {f1_score[i]:10.4f} {support[i]:10}\n"

# Summary rows: overall accuracy, macro average and support-weighted average.
report += "\n"
report += f"{'accuracy':>{label_width}} {'':>10} {'':>10} {overall_accuracy:10.4f} {total_support:10}\n"
report += f"{'macro avg':>{label_width}} {avg_precision:10.4f} {avg_recall:10.4f} {avg_f1:10.4f} {total_support:10}\n"
report += f"{'weighted avg':>{label_width}} {weighted_precision:10.4f} {weighted_recall:10.4f} {weighted_f1:10.4f} {total_support:10}\n"

print(report)



--- Model Evaluation Metrics ---
Accuracy: 0.9848

--- Confusion Matrix ---
[[286   6]
 [  3 299]]

--- Classification Report ---

            precision     recall   f1-score    support

   sci.med     0.9896     0.9795     0.9845        292
 sci.space     0.9803     0.9901     0.9852        302

  accuracy                           0.9848        594
 macro avg     0.9850     0.9848     0.9848        594
weighted avg     0.9849     0.9848     0.9848        594



18

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# --- Cell 1: Load and Split Data ---
# Two-category subset of the 20 Newsgroups corpus. The fixed random_state
# values make both the shuffle and the split below repeatable.

categories = ['sci.med', 'sci.space']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

X, y, target_names = data.data, data.target, data.target_names

# Hold out 30% of the documents for evaluation.
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_text, X_test_text, y_train, y_test = split

# --- Cell 2: Vectorize Text Data ---
# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training documents only and reused for the test documents.

vectorizer = CountVectorizer(stop_words='english')
train_counts = vectorizer.fit_transform(X_train_text)
test_counts = vectorizer.transform(X_test_text)

# Convert the sparse count matrices to dense ndarrays for the code below.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

# --- Cell 3: Train Naive Bayes ---
# Multinomial Naive Bayes fitted by hand on the dense count matrix.

alpha = 1.0  # Additive (Laplace) smoothing strength
n_samples, n_features = X_train.shape
classes_ = np.unique(y_train)
n_classes = len(classes_)

class_log_prior_ = np.zeros(n_classes)
feature_log_prob_ = np.zeros((n_classes, n_features))

for row, label in enumerate(classes_):
    class_docs = X_train[y_train == label]

    # log P(c): share of the training documents that carry this label.
    class_log_prior_[row] = np.log(class_docs.shape[0] / n_samples)

    # Per-word counts inside this class and their grand total.
    per_word_counts = class_docs.sum(axis=0)
    total_count = class_docs.sum()

    # log P(w|c) = log((count(w, c) + alpha) / (total(c) + alpha * n_features))
    smoothed = (per_word_counts + alpha) / (total_count + alpha * n_features)
    feature_log_prob_[row, :] = np.log(smoothed)

# --- Cell 4: Predict on Test Data ---

# Unnormalised log posterior for every (document, class) pair:
#   log P(c) + sum_w( count(w, d) * log P(w|c) )
log_likelihood = X_test @ feature_log_prob_.T
log_probs = log_likelihood + class_log_prior_

# For each document keep the class whose score is largest.
best_class_idx = log_probs.argmax(axis=1)
y_pred = classes_[best_class_idx]

# --- Cell 5: Evaluate Performance ---

# Accuracy: fraction of test samples whose prediction equals the true label.
accuracy = np.mean(y_test == y_pred)
print("--- Model Evaluation Metrics ---")
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix: rows are true classes, columns are predicted classes.
# The label set is the union of true and predicted labels, so a predicted
# class that is absent from y_test still gets a column instead of making the
# index lookup fail.
cm_classes = np.union1d(y_test, y_pred)
n_cm_classes = len(cm_classes)
cm = np.zeros((n_cm_classes, n_cm_classes), dtype=int)

# cm_classes is sorted and contains every label, so searchsorted yields the
# exact row/column of each label; np.add.at accumulates repeated index pairs.
true_idx = np.searchsorted(cm_classes, y_test)
pred_idx = np.searchsorted(cm_classes, y_pred)
np.add.at(cm, (true_idx, pred_idx), 1)

print("\n--- Confusion Matrix ---")
print(cm)

# Per-class metrics. Ratios with a zero denominator (class never predicted or
# never present) are reported as 0.0 instead of NaN with a RuntimeWarning.
true_positives = np.diag(cm)
predicted_totals = np.sum(cm, axis=0)
support = np.sum(cm, axis=1)

precision = np.divide(true_positives, predicted_totals,
                      out=np.zeros(n_cm_classes), where=predicted_totals != 0)
recall = np.divide(true_positives, support,
                   out=np.zeros(n_cm_classes), where=support != 0)
precision_plus_recall = precision + recall
f1_score = np.divide(2 * (precision * recall), precision_plus_recall,
                     out=np.zeros(n_cm_classes), where=precision_plus_recall != 0)

overall_accuracy = np.sum(true_positives) / np.sum(cm)

# Macro averages: unweighted mean over classes.
avg_precision = np.mean(precision)
avg_recall = np.mean(recall)
avg_f1 = np.mean(f1_score)

# Weighted averages: mean over classes weighted by each class's support.
weighted_precision = np.sum(precision * support) / np.sum(support)
weighted_recall = np.sum(recall * support) / np.sum(support)
weighted_f1 = np.sum(f1_score * support) / np.sum(support)

# --- Cell 6: Display Classification Report ---

# Size the label column to the longest row label. A fixed width of 10 is too
# narrow for 'weighted avg' (12 characters), which pushed that row's numbers
# out of line with the rows above it.
summary_labels = ['accuracy', 'macro avg', 'weighted avg']
label_width = max(len(str(label)) for label in [*target_names, *summary_labels])
total_support = np.sum(support)

report = "\n--- Classification Report ---\n\n"
report += f"{'':>{label_width}} {'precision':>10} {'recall':>10} {'f1-score':>10} {'support':>10}\n\n"

# One row per class, in target_names order.
for i, name in enumerate(target_names):
    report += f"{name:>{label_width}} {precision[i]:10.4f} {recall[i]:10.4f} {f1_score[i]:10.4f} {support[i]:10}\n"

# Summary rows: overall accuracy, macro average and support-weighted average.
report += "\n"
report += f"{'accuracy':>{label_width}} {'':>10} {'':>10} {overall_accuracy:10.4f} {total_support:10}\n"
report += f"{'macro avg':>{label_width}} {avg_precision:10.4f} {avg_recall:10.4f} {avg_f1:10.4f} {total_support:10}\n"
report += f"{'weighted avg':>{label_width}} {weighted_precision:10.4f} {weighted_recall:10.4f} {weighted_f1:10.4f} {total_support:10}\n"

print(report)




--- Model Evaluation Metrics ---
Accuracy: 0.9848

--- Confusion Matrix ---
[[286   6]
 [  3 299]]

--- Classification Report ---

            precision     recall   f1-score    support

   sci.med     0.9896     0.9795     0.9845        292
 sci.space     0.9803     0.9901     0.9852        302

  accuracy                           0.9848        594
 macro avg     0.9850     0.9848     0.9848        594
weighted avg     0.9849     0.9848     0.9848        594



19

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# --- Cell 1: Generate Synthetic Data ---
# 500 synthetic messages: 40% spam (label 1) and 60% not spam (label 0).
# The np.random calls must stay in this order; reordering them would change
# the generated values even with the same seed.

np.random.seed(42)
data_size = 500
spam_count = int(data_size * 0.4)
not_spam_count = data_size - spam_count

# Not-spam messages: fewer words, few special characters, few capitals.
features_not_spam = pd.DataFrame({
    'word_count': np.random.normal(loc=150, scale=40, size=not_spam_count),
    'special_chars': np.random.normal(loc=10, scale=5, size=not_spam_count),
    'capitals_ratio': np.random.uniform(low=0.01, high=0.1, size=not_spam_count),
})
labels_not_spam = pd.Series(np.full(not_spam_count, 0, dtype=int))

# Spam messages: more words, more special characters, more capitals.
features_spam = pd.DataFrame({
    'word_count': np.random.normal(loc=250, scale=50, size=spam_count),
    'special_chars': np.random.normal(loc=30, scale=10, size=spam_count),
    'capitals_ratio': np.random.uniform(low=0.2, high=0.5, size=spam_count),
})
labels_spam = pd.Series(np.full(spam_count, 1, dtype=int))

# Stack not-spam rows first, then spam rows, with a fresh 0..n-1 index.
X = pd.concat([features_not_spam, features_spam], ignore_index=True)
y = pd.concat([labels_not_spam, labels_spam], ignore_index=True)

# The normal draws can go below zero; floor every feature at 0.
X = X.clip(lower=0)

# --- Cell 2: Preprocess Data ---

# The SVM training code works with labels in {-1, 1} rather than {0, 1}.
svm_label_of = {0: -1, 1: 1}
y_svm = y.map(svm_label_of).values

# 70/30 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_svm,
    test_size=0.3,
    random_state=42,
    stratify=y_svm,
)

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Cell 3: SVM "from scratch" Implementation (Sequential) ---

# --- Hyperparameters ---
learning_rate = 0.001
lambda_param = 0.01  # L2 regularisation strength
n_iters = 1000       # Number of full passes over the training set

# --- Initialization ---
n_samples, n_features = X_train_scaled.shape
w = np.zeros(n_features)  # Weight vector
b = 0                     # Bias term

# --- Gradient Descent Training Loop ---
# Weights and bias are updated after every single training sample.
for _ in range(n_iters):
    for idx, x_i in enumerate(X_train_scaled):
        y_i = y_train[idx]
        margin = y_i * (np.dot(x_i, w) + b)

        # The regularisation term always contributes 2 * lambda * w.
        dw = 2 * lambda_param * w
        db = 0

        # The hinge term contributes only when the margin constraint
        # y_i * (w . x_i + b) >= 1 is not satisfied.
        if not margin >= 1:
            dw = dw - y_i * x_i
            db = -y_i

        w = w - learning_rate * dw
        b = b - learning_rate * db

# --- Cell 4: Predict ---

# Decision scores: w . x + b
linear_output = np.dot(X_test_scaled, w) + b

# Threshold the scores into integer labels in {-1, 1}. np.sign is avoided
# because it returns 0 for a score of exactly 0, which is not a valid label.
# A zero score goes to the negative class, which is where the {0, 1} mapping
# below already placed it.
y_pred = np.where(linear_output > 0, 1, -1)

# --- Cell 5: Evaluate Performance ---

# Map labels back to {0, 1} for metrics
y_test_mapped = (y_test + 1) // 2  # {-1 -> 0, 1 -> 1}
y_pred_mapped = (y_pred + 1) // 2  # {-1 -> 0, 1 -> 1}

# --- Cell 6: Display Results ---

accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
cm = confusion_matrix(y_test_mapped, y_pred_mapped)
report = classification_report(y_test_mapped, y_pred_mapped, target_names=['Not Spam (0)', 'Spam (1)'])

print("--- Model Evaluation Results (SVM From Scratch) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\n--- Confusion Matrix ---")
print(cm)
print("\n--- Classification Report ---")
print(report)



--- Model Evaluation Results (SVM From Scratch) ---
Accuracy: 1.0000

--- Confusion Matrix ---
[[90  0]
 [ 0 60]]

--- Classification Report ---
              precision    recall  f1-score   support

Not Spam (0)       1.00      1.00      1.00        90
    Spam (1)       1.00      1.00      1.00        60

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150



20


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# --- Cell 1: Generate Synthetic Data ---
# Synthetic student-performance dataset with 500 rows.
# The np.random calls must stay in this order; reordering them would change
# the generated values even with the same seed.
np.random.seed(42)
data_size = 500

# Features
study_time = np.random.uniform(low=1, high=20, size=data_size)       # hours/week
absences = np.random.randint(low=0, high=30, size=data_size)         # days, 0..29
internal_scores = np.random.uniform(low=0, high=50, size=data_size)  # 0-50

# Target: Pass (1) or Fail (0).
# Each feature is rescaled by its maximum; scores and study time add to the
# score, absences subtract from it, and Gaussian noise is added on top.
# Despite its name, `probability` is an unbounded score, not a probability.
noise = np.random.normal(loc=0, scale=0.2, size=data_size)
merit = internal_scores / 50 + study_time / 20
penalty = absences / 30
probability = merit - penalty + noise
y = (probability > 0.8).astype(int)  # Score above 0.8 means Pass

X = pd.DataFrame({
    'study_time': study_time,
    'absences': absences,
    'internal_scores': internal_scores,
})

# --- Cell 2: Preprocess Data ---

# The SVM training code works with labels in {-1, 1}: 0 becomes -1 and every
# other label is kept as it is.
y_svm = np.where(y == 0, -1, y)

# 70/30 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_svm,
    test_size=0.3,
    random_state=42,
    stratify=y_svm,
)

# 1. Standardise the features; the scaler is fitted on the training split
#    only and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Degree-2 polynomial expansion (squares and pairwise products, no constant
#    column). A *linear* SVM is then trained on the expanded features, which
#    lets it learn a quadratic decision boundary in the original features.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# --- Cell 3: SVM "from scratch" Implementation (Sequential) ---
# Linear SVM update rule applied to the polynomial-expanded features
# (X_train_poly).

# --- Hyperparameters ---
learning_rate = 0.001
lambda_param = 0.01  # L2 regularisation strength
n_iters = 1000       # Number of full passes over the training set

# --- Initialization ---
n_samples, n_features = X_train_poly.shape
w = np.zeros(n_features)  # Weight vector
b = 0                     # Bias term

# --- Gradient Descent Training Loop ---
# Weights and bias are updated after every single training sample.
for _ in range(n_iters):
    for idx, x_i in enumerate(X_train_poly):
        y_i = y_train[idx]
        margin = y_i * (np.dot(x_i, w) + b)

        # The regularisation term always contributes 2 * lambda * w.
        dw = 2 * lambda_param * w
        db = 0

        # The hinge term contributes only when the margin constraint
        # y_i * (w . x_i + b) >= 1 is not satisfied.
        if not margin >= 1:
            dw = dw - y_i * x_i
            db = -y_i

        w = w - learning_rate * dw
        b = b - learning_rate * db

# --- Cell 4: Predict ---

# Decision scores on the *polynomial* test features: w . x + b
linear_output = np.dot(X_test_poly, w) + b

# Threshold the scores into integer labels in {-1, 1}. np.sign is avoided
# because it returns 0 for a score of exactly 0, which is not a valid label.
# A zero score goes to the negative class, which is where the {0, 1} mapping
# below already placed it.
y_pred = np.where(linear_output > 0, 1, -1)

# --- Cell 5: Evaluate Performance ---

# Map labels back to {0, 1} for metrics. With inputs in {-1, 1} the result of
# (label + 1) // 2 is always 0 or 1, so no further clean-up is needed.
y_test_mapped = (y_test + 1) // 2  # {-1 -> 0, 1 -> 1}
y_pred_mapped = (y_pred + 1) // 2  # {-1 -> 0, 1 -> 1}


# --- Cell 6: Display Results ---

accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
cm = confusion_matrix(y_test_mapped, y_pred_mapped)

# Per-class scores in label order [0, 1]; a class with no predictions or no
# samples scores 0 instead of triggering a warning.
per_class_options = dict(average=None, labels=[0, 1], zero_division=0)
precision = precision_score(y_test_mapped, y_pred_mapped, **per_class_options)
recall = recall_score(y_test_mapped, y_pred_mapped, **per_class_options)
f1 = f1_score(y_test_mapped, y_pred_mapped, **per_class_options)

print("--- Model Evaluation Results (SVM From Scratch w/ Poly Features) ---")
print(f"Accuracy: {accuracy:.4f}")
print("\n--- Confusion Matrix ---")
print(cm)
print("\n--- Performance Metrics ---")

# Left-aligned table with one row per class.
target_names = ['Fail (0)', 'Pass (1)']
print(f"{'':<10} {'precision':<10} {'recall':<10} {'f1-score':<10}")
print("-" * 40)
for class_name, class_precision, class_recall, class_f1 in zip(target_names, precision, recall, f1):
    print(f"{class_name:<10} {class_precision:<10.4f} {class_recall:<10.4f} {class_f1:<10.4f}")

# Scores of the "Pass" class (label 1), repeated on their own.
precision_pass, recall_pass, f1_pass = precision[1], recall[1], f1[1]

print("\n--- Key Metrics (for 'Pass' class) ---")
print(f"Precision: {precision_pass:.4f}")
print(f"Recall:    {recall_pass:.4f}")
print(f"F1-Score:  {f1_pass:.4f}")




--- Model Evaluation Results (SVM From Scratch w/ Poly Features) ---
Accuracy: 0.8867

--- Confusion Matrix ---
[[92  7]
 [10 41]]

--- Performance Metrics ---
           precision  recall     f1-score  
----------------------------------------
Fail (0)   0.9020     0.9293     0.9154    
Pass (1)   0.8542     0.8039     0.8283    

--- Key Metrics (for 'Pass' class) ---
Precision: 0.8542
Recall:    0.8039
F1-Score:  0.8283


21


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc

# --- Cell 1: Load Data ---
data = load_breast_cancer()
X, y = data.data, data.target

# --- Cell 2: Preprocess Data ---

# The SVM training code works with labels in {-1, 1}:
#   Malignant (0) -> -1
#   Benign    (1) ->  1
y_svm = np.where(y == 0, -1, y)

# 70/30 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_svm,
    test_size=0.3,
    random_state=42,
    stratify=y_svm,
)

# 1. Standardise the features; the scaler is fitted on the training split
#    only and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Degree-2 polynomial expansion (squares and pairwise products, no constant
#    column). A linear SVM is then trained on the expanded features, which
#    lets it learn a quadratic decision boundary in the original features.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# --- Cell 3: SVM "from scratch" Implementation (Sequential) ---
# Linear SVM update rule applied to the polynomial-expanded features.

# --- Hyperparameters ---
learning_rate = 0.0001  # Smaller step size than 0.001, chosen for stability
lambda_param = 0.01     # L2 regularisation strength
n_iters = 1000          # Number of full passes over the training set

# --- Initialization ---
n_samples, n_features = X_train_poly.shape
w = np.zeros(n_features)  # Weight vector
b = 0                     # Bias term

# --- Gradient Descent Training Loop ---
# Weights and bias are updated after every single training sample.
for _ in range(n_iters):
    for idx, x_i in enumerate(X_train_poly):
        y_i = y_train[idx]
        margin = y_i * (np.dot(x_i, w) + b)

        # The regularisation term always contributes 2 * lambda * w.
        dw = 2 * lambda_param * w
        db = 0

        # The hinge term contributes only when the margin constraint
        # y_i * (w . x_i + b) >= 1 is not satisfied.
        if not margin >= 1:
            dw = dw - y_i * x_i
            db = -y_i

        w = w - learning_rate * dw
        b = b - learning_rate * db

# --- Cell 4: Predict ---

# Decision scores: w . x + b. The continuous scores are also the input to the
# ROC curve computed further down.
y_scores = np.dot(X_test_poly, w) + b

# Threshold the scores into integer labels in {-1, 1}. np.sign is avoided
# because it returns 0 for a score of exactly 0, which is not a valid label.
# A zero score goes to the negative class, which is where the {0, 1} mapping
# below already placed it.
y_pred = np.where(y_scores > 0, 1, -1)

# --- Cell 5: Evaluate Performance ---

# Map labels back to {0, 1} for metrics. With inputs in {-1, 1} the result of
# (label + 1) // 2 is always 0 or 1, so no further clean-up is needed.
y_test_mapped = (y_test + 1) // 2  # {-1 -> 0, 1 -> 1}
y_pred_mapped = (y_pred + 1) // 2  # {-1 -> 0, 1 -> 1}

# --- Confusion Matrix and Accuracy ---
accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
cm = confusion_matrix(y_test_mapped, y_pred_mapped)

# Header, accuracy and a legend describing the matrix layout.
summary_lines = [
    "--- Model Evaluation Results (SVM From Scratch w/ Poly Features) ---",
    f"Accuracy: {accuracy:.4f}",
    "\n--- Confusion Matrix ---",
    "True Neg (Malignant) | False Pos (Benign)",
    "False Neg (Malignant)| True Pos (Benign)",
]
print("\n".join(summary_lines))
print(cm)

# --- ROC Curve and AUC ---
# The ROC curve is built from the {0, 1} test labels and the continuous
# decision scores rather than the thresholded predictions.
fpr, tpr, thresholds = roc_curve(y_test_mapped, y_scores)
roc_auc = auc(fpr, tpr)

print(f"\nAUC (Area Under Curve): {roc_auc:.4f}")

# --- Cell 6: Plot ROC Curve ---
fig, ax = plt.subplots(figsize=(8, 6))

# Model ROC curve plus the diagonal from (0, 0) to (1, 1) for reference.
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # Headroom above 1.0 so the top of the curve is visible
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
ax.grid(True)
plt.show()



IndentationError: expected an indented block after 'for' statement on line 50 (ipython-input-531363624.py, line 53)