RANDOM FOREST

In [6]:
import csv
import random
import math
from collections import Counter

# Load dataset from CSV
def load_dataset(file_path):
    data = []
    with open(file_path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            text = row['Message']
            label = 1 if row['Category'].lower() == 'spam' else 0  # Spam = 1, Ham = 0
            data.append((text.lower().split(), label))
    return data

# Convert text to feature vector
def preprocess(data):
    vocab = set()
    processed_data = []
    for tokens, label in data:
        vocab.update(tokens)
        processed_data.append((tokens, label))
    return processed_data, list(vocab)

def text_to_features(tokens, vocab):
    token_set = set(tokens)
    return [1 if word in token_set else 0 for word in vocab]

def split_data(data, split_ratio=0.8):
    random.shuffle(data)
    split_index = int(len(data) * split_ratio)
    return data[:split_index], data[split_index:]

# Decision Tree
class DecisionTree:
    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, data, features):
        self.tree = self._build_tree(data, features, depth=0)

    def _build_tree(self, data, features, depth):
        labels = [label for _, label in data]
        if len(set(labels)) == 1 or depth == self.max_depth:
            return Counter(labels).most_common(1)[0][0]

        best_feature = random.choice(features)
        left_split = [(x, y) for x, y in data if x[best_feature] == 1]
        right_split = [(x, y) for x, y in data if x[best_feature] == 0]

        if not left_split or not right_split:
            return Counter(labels).most_common(1)[0][0]

        return {
            "feature": best_feature,
            "left": self._build_tree(left_split, features, depth + 1),
            "right": self._build_tree(right_split, features, depth + 1),
        }

    def predict(self, x):
        node = self.tree
        while isinstance(node, dict):
            if x[node["feature"]] == 1:
                node = node["left"]
            else:
                node = node["right"]
        return node

# Random Forest
class RandomForest:
    def __init__(self, n_trees, max_depth):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.trees = []

    def fit(self, data):
        for _ in range(self.n_trees):
            bootstrap_sample = [random.choice(data) for _ in range(len(data))]
            features = random.sample(range(len(data[0][0])), int(math.sqrt(len(data[0][0]))))
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(bootstrap_sample, features)
            self.trees.append(tree)

    def predict(self, x):
        votes = [tree.predict(x) for tree in self.trees]
        return Counter(votes).most_common(1)[0][0]

# Load and preprocess data
file_path = "/content/mail_data.csv"
raw_data = load_dataset(file_path)
processed_data, vocab = preprocess(raw_data)
features_data = [(text_to_features(tokens, vocab), label) for tokens, label in processed_data]

# Split into training and testing
train_data, test_data = split_data(features_data)

# Train Random Forest
forest = RandomForest(n_trees=10, max_depth=3)
forest.fit(train_data)

# Evaluate on test data
correct = 0
for x, y in test_data:
    if forest.predict(x) == y:
        correct += 1

accuracy = correct / len(test_data)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.87


LOGISTIC REGRESSION

In [8]:
# Re-importing required libraries and reloading the dataset
import pandas as pd
import numpy as np
import re

# Reloading the dataset
file_path = '/content/mail_data.csv'
data = pd.read_csv(file_path)

# Step 1: Preprocess text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Split text into words (tokenization)
    return text.split()

data['Processed_Message'] = data['Message'].apply(preprocess_text)

# Step 2: Encode labels
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})

# Step 3: Create a bag-of-words representation
# Get a unique vocabulary from all messages
vocabulary = list(set(word for message in data['Processed_Message'] for word in message))
vocab_size = len(vocabulary)

# Create a word-to-index mapping
word_to_index = {word: i for i, word in enumerate(vocabulary)}

# Convert messages to a bag-of-words representation
def message_to_vector(message):
    vector = np.zeros(vocab_size)
    for word in message:
        if word in word_to_index:
            vector[word_to_index[word]] += 1
    return vector

# Apply to all messages
X = np.array(data['Processed_Message'].apply(message_to_vector).tolist())
y = data['Category'].values

# Display dimensions of X and y
X.shape, y.shape


# Step 4: Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Initialize weights and bias
weights = np.zeros(X_train.shape[1])
bias = 0
learning_rate = 0.01
num_iterations = 1000

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Logistic regression training
for i in range(num_iterations):
    # Compute predictions
    linear_model = np.dot(X_train, weights) + bias
    predictions = sigmoid(linear_model)

    # Compute gradients
    error = predictions - y_train
    dw = np.dot(X_train.T, error) / len(y_train)
    db = np.sum(error) / len(y_train)

    # Update weights and bias
    weights -= learning_rate * dw
    bias -= learning_rate * db

    # Optionally, calculate and print loss every 100 iterations
    if i % 100 == 0:
        loss = -np.mean(y_train * np.log(predictions) + (1 - y_train) * np.log(1 - predictions))
        print(f"Iteration {i}, Loss: {loss}")

# Step 6: Make predictions on the test set
linear_model_test = np.dot(X_test, weights) + bias
test_predictions = sigmoid(linear_model_test)
test_predictions = [1 if pred > 0.5 else 0 for pred in test_predictions]

# Step 7: Evaluate accuracy
accuracy = np.mean(test_predictions == y_test)
print(f"Test Accuracy: {accuracy}")


Iteration 0, Loss: 0.6931471805599453
Iteration 100, Loss: 0.5243828056531046
Iteration 200, Loss: 0.45309680502872984
Iteration 300, Loss: 0.4123740933931833
Iteration 400, Loss: 0.38423801769637306
Iteration 500, Loss: 0.36243978814891836
Iteration 600, Loss: 0.34439024208107427
Iteration 700, Loss: 0.3288658163361045
Iteration 800, Loss: 0.31521682437359166
Iteration 900, Loss: 0.3030565574937204
Test Accuracy: 0.8771300448430494


NAIVE BAYE'S

In [11]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

# Step 1: Load the dataset
file_path = '/content/mail_data.csv'
data = pd.read_csv(file_path)

# Step 2: Preprocess text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Split text into words
    return text.split()

# Apply preprocessing
data['Processed_Message'] = data['Message'].apply(preprocess_text)
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})

# Step 3: Separate spam and ham messages
spam_messages = data[data['Category'] == 1]['Processed_Message']
ham_messages = data[data['Category'] == 0]['Processed_Message']

# Step 4: Calculate word probabilities for each class
spam_word_counts = defaultdict(int)
ham_word_counts = defaultdict(int)
spam_total_words = 0
ham_total_words = 0

# Count word frequencies in spam and ham messages
for message in spam_messages:
    for word in message:
        spam_word_counts[word] += 1
        spam_total_words += 1

for message in ham_messages:
    for word in message:
        ham_word_counts[word] += 1
        ham_total_words += 1

# Calculate class probabilities
p_spam = len(spam_messages) / len(data)
p_ham = len(ham_messages) / len(data)

# Vocabulary size (for Laplace smoothing)
vocabulary = set(word for message in data['Processed_Message'] for word in message)
vocab_size = len(vocabulary)

# Step 5: Define a function to calculate likelihoods
def calculate_likelihood(message, class_word_counts, class_total_words, class_prob):
    likelihood = np.log(class_prob)  # Start with log of prior probability
    for word in message:
        # Apply Laplace smoothing
        word_prob = (class_word_counts[word] + 1) / (class_total_words + vocab_size)
        likelihood += np.log(word_prob)
    return likelihood

# Step 6: Predict function
def predict(message):
    # Preprocess message
    message = preprocess_text(message)
    # Calculate likelihoods for both classes
    spam_likelihood = calculate_likelihood(message, spam_word_counts, spam_total_words, p_spam)
    ham_likelihood = calculate_likelihood(message, ham_word_counts, ham_total_words, p_ham)
    # Return the class with the higher likelihood
    return 1 if spam_likelihood > ham_likelihood else 0

# Step 7: Evaluate accuracy
data['Prediction'] = data['Message'].apply(predict)
accuracy = np.mean(data['Prediction'] == data['Category'])
print(f"Accuracy: {accuracy}")


Accuracy: 0.9890524048815507
