# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# --- Cell 1: Generate Synthetic Data ---

np.random.seed(42)  # fixed seed so the synthetic dataset is reproducible
data_size = 500
spam_count = int(data_size * 0.4)  # 40% spam keeps the classes fairly balanced
not_spam_count = data_size - spam_count


def _draw_features(n_rows, word_mean, word_std, char_mean, char_std, caps_low, caps_high):
    """Sample ``n_rows`` synthetic messages as a three-column DataFrame.

    Columns are drawn in a fixed order (word_count, special_chars,
    capitals_ratio) so the global NumPy random stream is consumed
    deterministically.
    """
    word_count = np.random.normal(word_mean, word_std, n_rows)
    special_chars = np.random.normal(char_mean, char_std, n_rows)
    capitals_ratio = np.random.uniform(caps_low, caps_high, n_rows)
    return pd.DataFrame({
        'word_count': word_count,
        'special_chars': special_chars,
        'capitals_ratio': capitals_ratio,
    })


# Class 0 (not spam): lower word counts, fewer special characters, few capitals.
features_not_spam = _draw_features(not_spam_count, 150, 40, 10, 5, 0.01, 0.1)
labels_not_spam = pd.Series(np.full(not_spam_count, 0, dtype=int))

# Class 1 (spam): higher word counts, more special characters, more capitals.
features_spam = _draw_features(spam_count, 250, 50, 30, 10, 0.2, 0.5)
labels_spam = pd.Series(np.full(spam_count, 1, dtype=int))

# Stack the two classes (not-spam rows first) under a fresh 0..n-1 index.
X = pd.concat([features_not_spam, features_spam], ignore_index=True)
y = pd.concat([labels_not_spam, labels_spam], ignore_index=True)

# Normal draws can fall below zero; floor every negative feature value at 0.
X = X.mask(X < 0, 0)

# --- Cell 2: Preprocess Data ---

# The hinge-loss training below works with labels in {-1, 1} instead of {0, 1}.
svm_label_of = {0: -1, 1: 1}
y_svm = y.map(svm_label_of).to_numpy()

# Hold out 30% of the rows for testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_svm,
    test_size=0.3,
    random_state=42,
    stratify=y_svm,
)

# Standardise features with statistics learned from the training split only,
# then apply that same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Cell 3: SVM "from scratch" Implementation (Sequential) ---

# --- Hyperparameters ---
learning_rate = 0.001
lambda_param = 0.01  # L2 regularization strength (C = 1/lambda_param)
n_iters = 1000

# --- Initialization ---
n_samples, n_features = X_train_scaled.shape
w = np.zeros(n_features)  # weight vector, one entry per feature
b = 0  # bias term

# --- Gradient Descent Training Loop ---
# Per-sample subgradient steps on the L2-regularized hinge loss; samples are
# visited in their stored order on every pass.
for _ in range(n_iters):
    for x_i, y_i in zip(X_train_scaled, y_train):
        # Functional margin y_i * (w . x_i + b); the hinge term is active
        # whenever this is not at least 1.
        margin = y_i * (np.dot(x_i, w) + b)
        violates_margin = not (margin >= 1)

        # The regularization term always contributes 2 * lambda * w.
        dw = 2 * lambda_param * w
        db = 0
        if violates_margin:
            # Inside the margin or misclassified: add the hinge subgradient
            # (-y_i * x_i for the weights, -y_i for the bias).
            dw = dw - y_i * x_i
            db = -y_i

        # Step against the gradient.
        w = w - learning_rate * dw
        b = b - learning_rate * db

# --- Cell 4: Predict ---

# Decision score w . x + b for every test row.
linear_output = np.dot(X_test_scaled, w) + b

# Threshold the score into an integer class label in {-1, 1}, matching y_test.
# np.sign is deliberately not used: it returns 0.0 for a score of exactly zero
# and always yields floats, so predictions could fall outside the {-1, 1}
# label set. Scores that are not strictly positive go to the negative class.
y_pred = np.where(linear_output > 0, 1, -1)

# --- Cell 5: Evaluate Performance ---

# Undo the SVM label encoding so metrics are computed on {0, 1} labels:
# floor((label + 1) / 2) sends -1 -> 0 and 1 -> 1.
y_test_mapped, y_pred_mapped = (
    np.floor_divide(labels + 1, 2) for labels in (y_test, y_pred)
)

# --- Cell 6: Display Results ---

# Compute every metric first, then print them as one report.
class_names = ['Not Spam (0)', 'Spam (1)']
accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
cm = confusion_matrix(y_test_mapped, y_pred_mapped)
report = classification_report(
    y_test_mapped,
    y_pred_mapped,
    target_names=class_names,
)

print("--- Model Evaluation Results (SVM From Scratch) ---")
print(f"Accuracy: {accuracy:.4f}")
# Each remaining section is a heading line followed by its payload.
for heading, payload in (
    ("\n--- Confusion Matrix ---", cm),
    ("\n--- Classification Report ---", report),
):
    print(heading)
    print(payload)

