In [1]:
import numpy as np
from IPython.display import HTML,Javascript, display

training_spam = np.loadtxt(open("data/training_spam.csv"), delimiter=",").astype(int)
print("Shape of the spam training data set:", training_spam.shape)
print(training_spam)

Shape of the spam training data set: (1000, 55)
[[1 0 0 ... 0 0 0]
 [0 0 1 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [1 1 1 ... 1 1 0]
 [1 0 0 ... 1 1 1]]


In [2]:
testing_spam = np.loadtxt(open("data/testing_spam.csv"), delimiter=",").astype(int)
print("Shape of the spam testing data set:", testing_spam.shape)
print(testing_spam)

Shape of the spam testing data set: (500, 55)
[[1 0 0 ... 1 1 1]
 [1 1 0 ... 1 1 1]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]]


In [3]:
import numpy as np
from itertools import product
import time

class SpamClassifier:
    def __init__(self, k, learning_rate=0.01, epochs=1000):
        self.k = k  # Unused for logistic regression but kept for compatibility
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def train(self, X_train, y_train):
        # Initialize weights and bias
        n_features = X_train.shape[1]
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        # Gradient Descent
        for _ in range(self.epochs):
            linear_model = np.dot(X_train, self.weights) + self.bias
            predictions = self.sigmoid(linear_model)
            
            # Compute gradients
            dw = np.dot(X_train.T, (predictions - y_train)) / len(y_train)
            db = np.sum(predictions - y_train) / len(y_train)
            
            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            
    def predict(self, X_test):
        linear_model = np.dot(X_test, self.weights) + self.bias
        predictions = self.sigmoid(linear_model)
        return np.where(predictions >= 0.5, 1, 0)

def evaluate_model(classifier, X_val, y_val):
    predictions = classifier.predict(X_val)
    accuracy = np.mean(predictions == y_val)
    return accuracy

def hyperparameter_tuning(X_train, y_train, X_val, y_val, learning_rates, epochs_list):
    best_accuracy = 0
    best_params = {}
    
    for lr, epochs in product(learning_rates, epochs_list):
        start_time = time.time()
        classifier = SpamClassifier(k=1, learning_rate=lr, epochs=epochs)
        classifier.train(X_train, y_train)
        accuracy = evaluate_model(classifier, X_val, y_val)
        end_time = time.time()

        print(f"Learning Rate: {lr}, Epochs: {epochs}, Accuracy: {accuracy:.4f}, Time: {end_time - start_time:.2f}s")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'learning_rate': lr, 'epochs': epochs}
    
    return best_params, best_accuracy

# Load training and testing data
training_spam = np.loadtxt(open("data/training_spam.csv"), delimiter=",").astype(int)
testing_spam = np.loadtxt(open("data/testing_spam.csv"), delimiter=",").astype(int)

print("Shape of the spam training data set:", training_spam.shape)
print("Shape of the spam testing data set:", testing_spam.shape)

# Split into features and labels
X = training_spam[:, 1:]  # Features
y = training_spam[:, 0]   # Labels

# Split training data into training and validation sets (e.g., 80% train, 20% validation)
split_index = int(0.8 * len(X))
X_train, X_val = X[:split_index], X[split_index:]
y_train, y_val = y[:split_index], y[split_index:]

X_test = testing_spam[:, 1:]    # Test features
y_test = testing_spam[:, 0]     # Test labels (if available)

# Hyperparameter Tuning
learning_rates = [0.001, 0.01, 0.1]
epochs_list = [500, 1000, 1500]

best_params, best_accuracy = hyperparameter_tuning(X_train, y_train, X_val, y_val, learning_rates, epochs_list)
print(f"Best Hyperparameters: {best_params}, Best Validation Accuracy: {best_accuracy:.4f}")

# Train with Best Hyperparameters
classifier = SpamClassifier(k=1, learning_rate=best_params['learning_rate'], epochs=best_params['epochs'])
classifier.train(X_train, y_train)

# Evaluate on Test Set
predictions = classifier.predict(X_test)
test_accuracy = np.mean(predictions == y_test) if len(y_test) > 0 else None

if test_accuracy is not None:
    print(f"Test Accuracy: {test_accuracy:.4f}")
else:
    print("No labels provided for the test set.")

# Show Predictions
print("Predictions on Test Set:", predictions)


Shape of the spam training data set: (1000, 55)
Shape of the spam testing data set: (500, 55)
Learning Rate: 0.001, Epochs: 500, Accuracy: 0.9200, Time: 0.13s
Learning Rate: 0.001, Epochs: 1000, Accuracy: 0.9200, Time: 0.24s
Learning Rate: 0.001, Epochs: 1500, Accuracy: 0.9300, Time: 0.35s
Learning Rate: 0.01, Epochs: 500, Accuracy: 0.9350, Time: 0.12s
Learning Rate: 0.01, Epochs: 1000, Accuracy: 0.9350, Time: 0.21s
Learning Rate: 0.01, Epochs: 1500, Accuracy: 0.9400, Time: 0.35s
Learning Rate: 0.1, Epochs: 500, Accuracy: 0.9450, Time: 0.11s
Learning Rate: 0.1, Epochs: 1000, Accuracy: 0.9500, Time: 0.21s
Learning Rate: 0.1, Epochs: 1500, Accuracy: 0.9500, Time: 0.36s
Best Hyperparameters: {'learning_rate': 0.1, 'epochs': 1000}, Best Validation Accuracy: 0.9500
Test Accuracy: 0.9220
Predictions on Test Set: [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 1
 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1
 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0

In [9]:
import numpy as np
from itertools import product
import time

class XGBoostSpamClassifier:
    def __init__(self, k, learning_rate=0.1, epochs=100):
        self.k = k  # Compatibility with template
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.trees = []

    def train(self, X_train, y_train):
        predictions = np.zeros(len(y_train))
        for _ in range(self.epochs):
            residuals = y_train - predictions
            tree = self.simple_tree(X_train, residuals)
            predictions += self.learning_rate * tree.predict(X_train)
            self.trees.append(tree)

    def simple_tree(self, X, y):
        feature_idx = np.argmax(np.var(X, axis=0))
        threshold = np.median(X[:, feature_idx])

        class Tree:
            def __init__(self, left_value, right_value, feature_idx, threshold):
                self.left_value = left_value
                self.right_value = right_value
                self.feature_idx = feature_idx
                self.threshold = threshold

            def predict(self, X):
                return np.where(X[:, self.feature_idx] < self.threshold, self.left_value, self.right_value)

        left_value = np.mean(y[X[:, feature_idx] < threshold])
        right_value = np.mean(y[X[:, feature_idx] >= threshold])

        return Tree(left_value, right_value, feature_idx, threshold)

    def predict(self, X_test):
        predictions = np.zeros(X_test.shape[0])
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X_test)
        return np.where(predictions >= 0.5, 1, 0)


def evaluate_model(classifier, X_val, y_val):
    predictions = classifier.predict(X_val)
    accuracy = np.mean(predictions == y_val)
    return accuracy

def hyperparameter_tuning(X_train, y_train, X_val, y_val, learning_rates, epochs_list):
    best_accuracy = 0
    best_params = {}
    
    for lr, epochs in product(learning_rates, epochs_list):
        start_time = time.time()
        classifier = XGBoostSpamClassifier(k=1, learning_rate=lr, epochs=epochs)
        classifier.train(X_train, y_train)
        accuracy = evaluate_model(classifier, X_val, y_val)
        end_time = time.time()

        print(f"Learning Rate: {lr}, Epochs: {epochs}, Accuracy: {accuracy:.4f}, Time: {end_time - start_time:.2f}s")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'learning_rate': lr, 'epochs': epochs}
    
    return best_params, best_accuracy


In [None]:
import numpy as np
from itertools import product
import time

# ==============================
# EVALUATION FUNCTION
# ==============================
def evaluate_model(classifier, X_val, y_val):
    predictions = classifier.predict(X_val)
    accuracy = np.mean(predictions == y_val)
    return accuracy

# ==============================
# HYPERPARAMETER TUNING FUNCTION
# ==============================
def hyperparameter_tuning(ClassifierClass, X_train, y_train, X_val, y_val, learning_rates, epochs_list):
    best_accuracy = 0
    best_params = {}
    
    for lr, epochs in product(learning_rates, epochs_list):
        start_time = time.time()
        classifier = ClassifierClass(k=1, learning_rate=lr, epochs=epochs)
        classifier.train(X_train, y_train)
        accuracy = evaluate_model(classifier, X_val, y_val)
        end_time = time.time()

        print(f"{ClassifierClass.__name__} - LR: {lr}, Epochs: {epochs}, Accuracy: {accuracy:.4f}, Time: {end_time - start_time:.2f}s")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'learning_rate': lr, 'epochs': epochs}
    
    print(f"Best Params for {ClassifierClass.__name__}: {best_params}, Best Accuracy: {best_accuracy:.4f}")
    return best_params, best_accuracy

# ==============================
# SIMPLE DECISION TREE (FOR BOOSTING MODELS)
# ==============================
class SimpleDecisionTree:
    def fit(self, X, y):
        feature_idx = np.argmax(np.var(X, axis=0))
        threshold = np.median(X[:, feature_idx])
        self.feature_idx = feature_idx
        self.threshold = threshold
        self.left_value = np.mean(y[X[:, feature_idx] < threshold])
        self.right_value = np.mean(y[X[:, feature_idx] >= threshold])

    def predict(self, X):
        return np.where(X[:, self.feature_idx] < self.threshold, self.left_value, self.right_value)


# ==============================
# 1️⃣ XGBoost Classifier
# ==============================
class XGBoostSpamClassifier:
    def __init__(self, k, learning_rate=0.1, epochs=100):
        self.k = k
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.trees = []

    def train(self, X_train, y_train):
        predictions = np.zeros(len(y_train))
        for _ in range(self.epochs):
            residuals = y_train - predictions
            tree = SimpleDecisionTree()
            tree.fit(X_train, residuals)
            predictions += self.learning_rate * tree.predict(X_train)
            self.trees.append(tree)

    def predict(self, X_test):
        predictions = np.zeros(X_test.shape[0])
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X_test)
        return np.where(predictions >= 0.5, 1, 0)


# ==============================
# 2️⃣ LightGBM Classifier
# ==============================
class LightGBMSpamClassifier:
    def __init__(self, k, learning_rate=0.1, epochs=100):
        self.k = k
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.trees = []

    def train(self, X_train, y_train):
        predictions = np.zeros(len(y_train))
        for _ in range(self.epochs):
            residuals = y_train - predictions
            tree = SimpleDecisionTree()
            tree.fit(X_train, residuals)
            predictions += self.learning_rate * tree.predict(X_train)
            self.trees.append(tree)

    def predict(self, X_test):
        predictions = np.zeros(X_test.shape[0])
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X_test)
        return np.where(predictions >= 0.5, 1, 0)


# ==============================
# 3️⃣ CatBoost Classifier
# ==============================
class CatBoostSpamClassifier:
    def __init__(self, k, learning_rate=0.1, epochs=100):
        self.k = k
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.trees = []

    def train(self, X_train, y_train):
        predictions = np.zeros(len(y_train))
        for _ in range(self.epochs):
            residuals = y_train - predictions
            tree = SimpleDecisionTree()
            tree.fit(X_train, residuals)
            predictions += self.learning_rate * tree.predict(X_train)
            self.trees.append(tree)

    def predict(self, X_test):
        predictions = np.zeros(X_test.shape[0])
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X_test)
        return np.where(predictions >= 0.5, 1, 0)


# ==============================
# 4️⃣ SVM Classifier
# ==============================
class SVMSpamClassifier:
    def __init__(self, k, learning_rate=0.001, epochs=1000, lambda_param=0.01):
        self.k = k
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.lambda_param = lambda_param
        self.weights = None
        self.bias = None

    def train(self, X_train, y_train):
        y_train = np.where(y_train <= 0, -1, 1)
        n_features = X_train.shape[1]
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.epochs):
            for idx, x_i in enumerate(X_train):
                condition = y_train[idx] * (np.dot(x_i, self.weights) - self.bias) >= 1
                if condition:
                    self.weights -= self.learning_rate * (2 * self.lambda_param * self.weights)
                else:
                    self.weights -= self.learning_rate * (2 * self.lambda_param * self.weights - np.dot(x_i, y_train[idx]))
                    self.bias -= self.learning_rate * y_train[idx]

    def predict(self, X_test):
        linear_output = np.dot(X_test, self.weights) - self.bias
        return np.where(linear_output >= 0, 1, 0)


# ==============================
# 5️⃣ Neural Network Classifier
# ==============================
class NeuralNetworkSpamClassifier:
    def __init__(self, k, learning_rate=0.1, epochs=1000, hidden_size=10):
        self.k = k
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.hidden_size = hidden_size

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        return x * (1 - x)

    def train(self, X_train, y_train):
        n_features = X_train.shape[1]
        self.weights_input_hidden = np.random.randn(n_features, self.hidden_size) * 0.01
        self.weights_hidden_output = np.random.randn(self.hidden_size, 1) * 0.01

        for _ in range(self.epochs):
            hidden_layer = self.sigmoid(np.dot(X_train, self.weights_input_hidden))
            output_layer = self.sigmoid(np.dot(hidden_layer, self.weights_hidden_output))

            error = y_train.reshape(-1, 1) - output_layer
            output_delta = error * self.sigmoid_derivative(output_layer)
            hidden_delta = output_delta.dot(self.weights_hidden_output.T) * self.sigmoid_derivative(hidden_layer)

            self.weights_hidden_output += hidden_layer.T.dot(output_delta) * self.learning_rate
            self.weights_input_hidden += X_train.T.dot(hidden_delta) * self.learning_rate

    def predict(self, X_test):
        hidden_layer = self.sigmoid(np.dot(X_test, self.weights_input_hidden))
        output_layer = self.sigmoid(np.dot(hidden_layer, self.weights_hidden_output))
        return np.where(output_layer >= 0.5, 1, 0).flatten()


# ==============================
# LOAD SPAM DATA (SIMULATED FOR NOW)
# ==============================
np.random.seed(42)
X = np.random.rand(1000, 55)  # 1000 samples, 20 features
y = np.random.randint(0, 2, 1000)

split_index = int(0.8 * len(X))
X_train, X_val = X[:split_index], X[split_index:]
y_train, y_val = y[:split_index], y[split_index:]

X_test = np.random.rand(200, 55)
y_test = np.random.randint(0, 2, 200)


# ==============================
# HYPERPARAMETER SETTINGS
# ==============================
learning_rates = [0.001, 0.01, 0.1]
epochs_list = [500, 1000, 1500]


# ==============================
# RUN AND DISPLAY ACCURACY FOR EACH MODEL
# ==============================

print("\n🔍 Running Hyperparameter Tuning for Each Model...\n")

# 1️⃣ XGBoost
hyperparameter_tuning(XGBoostSpamClassifier, X_train, y_train, X_val, y_val, learning_rates, epochs_list)

# 2️⃣ LightGBM
hyperparameter_tuning(LightGBMSpamClassifier, X_train, y_train, X_val, y_val, learning_rates, epochs_list)

# 3️⃣ CatBoost
hyperparameter_tuning(CatBoostSpamClassifier, X_train, y_train, X_val, y_val, learning_rates, epochs_list)

# 4️⃣ SVM
hyperparameter_tuning(SVMSpamClassifier, X_train, y_train, X_val, y_val, learning_rates, epochs_list)

# 5️⃣ Neural Network
hyperparameter_tuning(NeuralNetworkSpamClassifier, X_train, y_train, X_val, y_val, learning_rates, epochs_list)



🔍 Running Hyperparameter Tuning for Each Model...

XGBoostSpamClassifier - LR: 0.001, Epochs: 500, Accuracy: 0.3900, Time: 0.05s
XGBoostSpamClassifier - LR: 0.001, Epochs: 1000, Accuracy: 0.3900, Time: 0.11s
XGBoostSpamClassifier - LR: 0.001, Epochs: 1500, Accuracy: 0.3900, Time: 0.15s
XGBoostSpamClassifier - LR: 0.01, Epochs: 500, Accuracy: 0.5150, Time: 0.04s
XGBoostSpamClassifier - LR: 0.01, Epochs: 1000, Accuracy: 0.5150, Time: 0.09s
XGBoostSpamClassifier - LR: 0.01, Epochs: 1500, Accuracy: 0.5150, Time: 0.14s
XGBoostSpamClassifier - LR: 0.1, Epochs: 500, Accuracy: 0.5150, Time: 0.05s
XGBoostSpamClassifier - LR: 0.1, Epochs: 1000, Accuracy: 0.5150, Time: 0.09s
XGBoostSpamClassifier - LR: 0.1, Epochs: 1500, Accuracy: 0.5150, Time: 0.14s
Best Params for XGBoostSpamClassifier: {'learning_rate': 0.01, 'epochs': 500}, Best Accuracy: 0.5150
LightGBMSpamClassifier - LR: 0.001, Epochs: 500, Accuracy: 0.3900, Time: 0.05s
LightGBMSpamClassifier - LR: 0.001, Epochs: 1000, Accuracy: 0.3900, T

({'learning_rate': 0.001, 'epochs': 500}, 0.61)