https://www.kaggle.com/code/vucongtuanduong/time-hardmargin-svm-tfidf/output

In [None]:
import pandas as pd
import nltk
import numpy as np
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import SnowballStemmer
import time
import pickle

#Visualization
import matplotlib.pyplot as plt

#Feature Engineering
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
#Evaluation Metric
from sklearn.metrics import accuracy_score, confusion_matrix,f1_score, precision_score,recall_score,classification_report
import seaborn as sns
from scipy.sparse import csr_matrix

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
class HardMarginSVM:
    """
    Linear hard-margin SVM trained on the dual problem.

    The multipliers ``alpha`` are optimised by projected gradient *ascent* on
    ``sum(alpha) - 0.5 * alpha^T H alpha`` with
    ``H[i, j] = y_i * y_j * <x_i, x_j>``: each epoch takes one step along the
    gradient ``1 - H @ alpha`` and then clips the result to ``alpha >= 0``.
    The update does not enforce the equality constraint
    ``sum(alpha_i * y_i) = 0`` of the textbook dual.

    Attributes
    -------------
    eta : float
        Learning rate
    epoch : int
        Number of epochs
    random_state : int
        Random seed used to initialise ``alpha``
    is_trained : bool
        Training completion flag
    num_samples : int
        Number of training samples
    num_features : int
        Number of features
    w : NDArray[float]
        Parameter vector: (num_features, ) ndarray
    b : float
        Bias parameter
    alpha : NDArray[float]
        Lagrange multipliers: (num_samples, ) ndarray
    support_vectors : NDArray[int] or None
        Row indices of the training samples whose multiplier is non-zero
        (``None`` until ``fit`` has run)
    """
    def __init__(self, eta=0.001, epoch=1000, random_state=42):
        self.eta = eta
        self.epoch = epoch
        self.random_state = random_state
        self.is_trained = False
        self.support_vectors = None

    def fit(self, X, y):
        """
        Fit parameter vector to training data

        Parameters
        --------------
        X : NDArray[NDArray[float]]
            Training data: (num_samples, num_features) matrix. Objects with a
            ``toarray`` method (e.g. scipy sparse matrices) are densified.
        y : array-like
            Training labels: (num_samples) values taking exactly two distinct
            values. The larger value (in ``np.unique`` order) is the positive
            class; ``{0, 1}`` and ``{-1, 1}`` therefore behave as usual.

        Returns
        ----------
        self : HardMarginSVM

        Raises
        ----------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` disagree on the number of
            samples, or if ``y`` does not contain exactly two classes.
        """
        # Convert sparse matrix to dense if needed
        if hasattr(X, "toarray"):
            X = X.toarray()
        X = np.asarray(X)
        # Work on a plain positional array: a pandas Series would otherwise be
        # indexed by label (not position) and has no ``reshape``.
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError("X must be a 2-D (num_samples, num_features) matrix")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.num_samples = X.shape[0]
        self.num_features = X.shape[1]

        y_unique = np.unique(y)
        if len(y_unique) != 2:
            raise ValueError("Binary classification requires exactly 2 classes")

        # Encode the two classes as -1 / +1 whatever their original values are.
        y_signed = np.where(y == y_unique[1], 1, -1)

        self.w = np.zeros(self.num_features)
        self.b = 0

        rgen = np.random.RandomState(self.random_state)
        self.alpha = rgen.uniform(low=0.0, high=0.01, size=self.num_samples)

        # H depends only on the data, so build it once instead of every epoch.
        H = self._dual_hessian(X, y_signed)
        for _ in range(self.epoch):
            self._cycle(X, y_signed, H)

        sv_indices = np.flatnonzero(self.alpha != 0)
        self.support_vectors = sv_indices

        if sv_indices.size:
            X_sv = X[sv_indices]
            y_sv = y_signed[sv_indices]
            # w = sum_i alpha_i * y_i * x_i over the support vectors
            self.w = (self.alpha[sv_indices] * y_sv) @ X_sv
            # b = mean over the support vectors of (y_i - <w, x_i>)
            self.b = float(np.mean(y_sv - X_sv @ self.w))
        else:
            # Every multiplier was clipped to zero: keep the zero hyperplane
            # rather than dividing by an empty support set.
            self.w = np.zeros(self.num_features)
            self.b = 0.0

        self.is_trained = True
        return self

    def predict(self, X):
        """
        Return predictions

        Parameters
        --------------
        X : NDArray[NDArray[float]]
            Data to classify: (any, num_features) matrix

        Returns
        ----------
        result : NDArray[int]
            Classification results 0 or 1: (any, ) ndarray. 1 is the positive
            class (the larger of the two label values seen by ``fit``).

        Raises
        ----------
        Exception
            If the model has not been fitted yet.
        """
        if not self.is_trained:
            raise Exception('Model not trained yet')

        # Convert sparse matrix to dense if needed
        if hasattr(X, "toarray"):
            X = X.toarray()
        X = np.asarray(X)

        decision_values = X @ self.w + self.b

        result = np.where(decision_values >= 0, 1, 0)
        return result

    @staticmethod
    def _dual_hessian(X, y):
        """
        Build ``H[i, j] = y_i * y_j * <x_i, x_j>`` for labels ``y`` in {-1, +1}

        Parameters
        --------------
        X : NDArray[NDArray[float]]
            Training data: (num_samples, num_features) matrix
        y : NDArray[int]
            Signed labels: (num_samples, ) ndarray

        Returns
        ----------
        H : NDArray[NDArray[float]]
            (num_samples, num_samples) matrix
        """
        signed_rows = X * y.reshape(-1, 1)
        return signed_rows @ signed_rows.T

    def _cycle(self, X, y, H=None):
        """
        One projected gradient step on the multipliers ``alpha``

        Parameters
        --------------
        X : NDArray[NDArray[float]]
            Training data: (num_samples, num_features) matrix
        y : NDArray[float]
            Training labels in {-1, +1}: (num_samples) ndarray
        H : NDArray[NDArray[float]], optional
            Precomputed matrix from ``_dual_hessian``. When omitted it is
            rebuilt from ``X`` and ``y`` on this call.
        """
        if H is None:
            H = self._dual_hessian(np.asarray(X), np.asarray(y).ravel())

        grad = np.ones(self.num_samples) - H @ self.alpha

        # Ascent step followed by projection onto alpha >= 0
        self.alpha = np.clip(self.alpha + self.eta * grad, 0, None)


In [None]:
# Dataset sizes to benchmark: 1000 up to 9500 in steps of 500.
a = list(range(1000, 10000, 500))
print(a)

[1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500]


In [None]:
def _clean_email_text(text, stop_words, stemmer):
    """Strip non-letters and URLs, drop stop words, and stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw e-mail text.
    stop_words : set[str]
        Lower-case stop words to remove.
    stemmer : object
        Stemmer exposing ``stem(token)``.

    Returns
    -------
    str
        Space-joined stemmed tokens.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Runs after the character filter; the pattern still matches because the
    # leading "http" of a URL is left in place by that filter.
    text = re.sub(r"http\S+", "", text)
    # The stop-word list is lower-case, so compare case-insensitively;
    # otherwise "The", "And", ... survive and are lower-cased by the vectorizer.
    kept = [token for token in word_tokenize(text) if token.lower() not in stop_words]
    tokens = nltk.word_tokenize(' '.join(kept))
    return ' '.join(stemmer.stem(token) for token in tokens)


def calc_time_svm(sizes=None):
    """Benchmark ``HardMarginSVM`` on TF-IDF features for several dataset sizes.

    For every size the matching ``sampled_dataset{size}.csv`` is downloaded,
    its ``text`` column is cleaned, the data is split 80/20, TF-IDF features
    are fitted on the training part, and the time spent in ``fit`` is measured.
    A one-row metrics table is printed and written to
    ``hardmargin_svm_tfidf_{size}.csv`` after each size, so finished sizes are
    kept even if a later one is interrupted.

    Parameters
    ----------
    sizes : iterable of int, optional
        Dataset sizes to run. Defaults to the module-level list ``a``.

    Returns
    -------
    list[dict]
        One record per size with keys ``dataset_size``, ``class_name``,
        ``time`` (seconds spent in ``fit``), ``accuracy_score`` and
        ``f1_score``.
    """
    if sizes is None:
        sizes = a

    # Loop-invariant resources: build them once, not once per dataset.
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')

    results = []
    for i in sizes:
        df = pd.read_csv(f"https://media.githubusercontent.com/media/PTIT-Assignment-Projects/ai-svm-email-spam/refs/heads/main/dataset/sampled_dataset{i}.csv")

        # Missing texts become empty strings so the regex helpers do not fail.
        texts = df['text'].fillna('').astype(str).apply(
            lambda text: _clean_email_text(text, stop_words, stemmer)
        )

        # Train / test split
        X_train, X_test, y_train, y_test = train_test_split(
            texts, df['label'], test_size=0.2, random_state=1
        )

        # TF-IDF (vocabulary is fitted on the training part only)
        vectorizer = TfidfVectorizer()
        X_train_dense = vectorizer.fit_transform(X_train).toarray()
        X_test_dense = vectorizer.transform(X_test).toarray()

        svm_base = HardMarginSVM()
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments during a long fit.
        start_time = time.perf_counter()
        # Pass labels as a plain array: after the shuffle the Series index no
        # longer matches row positions.
        svm_base.fit(X_train_dense, y_train.to_numpy())
        elapsed = time.perf_counter() - start_time

        y_pred = svm_base.predict(X_test_dense)
        record = {
            'class_name': svm_base.__class__.__name__,
            'time': elapsed,
            'accuracy_score': accuracy_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred),
        }

        results_df = pd.DataFrame([record])
        print(results_df)
        results_df.to_csv(f'hardmargin_svm_tfidf_{i}.csv', index=False)

        # Collect the metrics so the caller actually receives them.
        results.append({'dataset_size': i, **record})
    return results

In [None]:
# Run the benchmark for every dataset size in `a`; each size prints a one-row
# metrics table and writes its own CSV file.
calc_time_svm()

      class_name        time  accuracy_score  f1_score
0  HardMarginSVM  188.271099           0.945   0.94686


      class_name        time  accuracy_score  f1_score
0  HardMarginSVM  483.728427            0.96  0.963415


      class_name         time  accuracy_score  f1_score
0  HardMarginSVM  1025.564333            0.96  0.961353


      class_name         time  accuracy_score  f1_score
0  HardMarginSVM  1769.558942           0.976  0.976378


      class_name         time  accuracy_score  f1_score
0  HardMarginSVM  2834.332839        0.966667  0.966887


      class_name         time  accuracy_score  f1_score
0  HardMarginSVM  4327.781785        0.967143  0.969935


      class_name         time  accuracy_score  f1_score
0  HardMarginSVM  6007.968148         0.96375  0.967306


      class_name         time  accuracy_score  f1_score
0  HardMarginSVM  8247.298119        0.962222  0.963907


      class_name          time  accuracy_score  f1_score
0  HardMarginSVM  10635.539112           0.969  0.970837
