In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import string
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from sklearn.metrics import confusion_matrix,accuracy_score, f1_score, precision_score,recall_score,classification_report
import time
from sklearn.svm import SVC
from google.colab import drive
import pickle
from scipy.sparse import csr_matrix

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
class SVM:
    def __init__(self, lambda_param=1e-4, epoch=1000, batch_size=256, tol=1e-4, random_state=42):
        self.lambda_param = lambda_param
        self.epoch = epoch
        self.batch_size = batch_size
        self.tol = tol
        self.random_state = random_state
        self.is_trained = False

    def fit(self, X, y):
        if hasattr(X, "toarray"):
            X = csr_matrix(X)
        
        self.num_samples, self.num_features = X.shape

        y_unique = np.unique(y)
        if len(y_unique) != 2:
            raise ValueError("Phân loại nhị phân cần 2 nhãn")
        if set(y_unique) == {0, 1}:
            y = np.where(y == 0, -1, 1)
        
        self.w = np.zeros(self.num_features, dtype=np.float32)
        self.b = 0.0

        np.random.seed(self.random_state)
        t = 0
        previous_objective = float("inf")

        for ep in range(1, self.epoch + 1):
            indices = np.random.permutation(self.num_samples)
            for start in range(0, self.num_samples, self.batch_size):
                t += 1
                end = start + self.batch_size
                batch_idx = indices[start:end]
                X_batch = X[batch_idx]
                y_batch = y[batch_idx]
                
                eta = 1.0 / (self.lambda_param * t)
                margins = y_batch * (X_batch.dot(self.w) + self.b)
                mask = margins < 1
                self.w *= (1 - eta * self.lambda_param)
                if np.any(mask):
                    X_violate = X_batch[mask]
                    y_violate = y_batch[mask]
                    self.w += (eta / self.batch_size) * np.dot(y_violate, X_violate.toarray() if hasattr(X_violate, "toarray") else X_violate)
                    self.b += (eta / self.batch_size) * np.sum(y_violate)
                norm_w = np.linalg.norm(self.w)
                factor = min(1, (1.0 / np.sqrt(self.lambda_param)) / (norm_w))
                self.w *= factor

            decision = X.dot(self.w) + self.b
            hinge_losses = np.maximum(0, 1 - y * decision)
            objective = 0.5 * self.lambda_param * np.dot(self.w, self.w) + np.mean(hinge_losses)
            
            if ep % 10 == 0:
                print(f"Epoch {ep}, Giá trị hàm mục tiêu: {objective:.4f}")
            
            if abs(previous_objective - objective) < self.tol:
                print(f"Dừng sớm tại epoch {ep}, giá trị hàm mục tiêu thay đổi: {abs(previous_objective - objective):.6f}")
                break
            previous_objective = objective

        self.is_trained = True
        return self

    def predict(self, X):
        if not self.is_trained:
            raise Exception("Mô hình chưa được huấn luỵen")
            
        if hasattr(X, "toarray"):
            X = csr_matrix(X)
            
        decision = X.dot(self.w) + self.b
        return np.where(decision >= 0, 1, 0)


In [3]:
a = [i for i in range(1000, 10000, 500)]
def calc_time_svm():
    for i in a:
        df = pd.read_csv(f"https://media.githubusercontent.com/media/PTIT-Projects/ttcs-svm-spam-email/refs/heads/main/dataset/sampled_dataset{i}.csv")
        print("Start preprocessing data")
        prep_start_time = time.time()
        df['text'] = df['text'].str.lower()
        
        # xoa ky tu khong phai ASCII
        df['text'] = df['text'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x) if isinstance(x, str) else x)
    
        # xoa khoang trang
        df['text'] = df['text'].apply(lambda x: re.sub(r'^\s+|\s+$', '', x).strip() if isinstance(x, str) else x)
    
        # xoa html, xml
        def remove_html_xml(text):
            try:
                soup = BeautifulSoup(text, 'html.parser')
                return soup.get_text()
            except:
                return text
        df['text'] = df['text'].apply(remove_html_xml)
        def remove_special_characters(word):
            return word.translate(str.maketrans('', '', string.punctuation))
        df['text'] = df['text'].apply(remove_special_characters)
        def remove_urls(text):
            return re.sub(r'http\S+|www\S+|\S+\.(com|net|org|edu|gov|mil|int|info|biz|co)\S+', '', text)
        df['text'] = df['text'].apply(remove_urls)
        def remove_emails(text):
            return re.sub(r'\S+@\S+', '', text)
        df['text'] = df['text'].apply(remove_emails)
        df['text'] = df['text'].apply(word_tokenize)
        ENGLISH_STOP_WORDS = set(stopwords.words('english'))
        def remove_stop_words(words):
            return [word for word in words if word not in ENGLISH_STOP_WORDS]
        df['text'] = df['text'].apply(remove_stop_words)
        stemmer = PorterStemmer()
        def stem_words(words):
            return [stemmer.stem(word) for word in words]
        df['text'] = df['text'].apply(stem_words)
        df['text'] = df['text'].apply(' '.join)
        X = df['text']
        y = df['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
        hashing_vectorizer = TfidfVectorizer()
        X_train_hashed = hashing_vectorizer.fit_transform(X_train)
        X_test_hashed = hashing_vectorizer.transform(X_test)
        print(X_train_hashed.shape)
        print("End preprocessing data")
        prep_end_time = time.time()
        svm_base = SVM()
        start_time = time.time()
        svm_base.fit(X_train_hashed, y_train)
        end_time = time.time()
        y_pred = svm_base.predict(X_test_hashed)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        result = {
        'class_name': svm_base.__class__.__name__,
        'n': len(df),
        'time': end_time - start_time,
        'prep_time': prep_end_time - prep_start_time ,
        'accuracy_score': accuracy,
        'f1_score': f1
        }
        results_df = pd.DataFrame([result])
        print(results_df)
        results_df.to_csv(f'svm_pegasos_{i}.csv', index=False)

In [4]:
calc_time_svm()

Start preprocessing data
(800, 15517)
End preprocessing data
Epoch 10, Giá trị hàm mục tiêu: 32.6296
Epoch 20, Giá trị hàm mục tiêu: 1.1482
Epoch 30, Giá trị hàm mục tiêu: 1.6400
Epoch 40, Giá trị hàm mục tiêu: 1.4125
Epoch 50, Giá trị hàm mục tiêu: 0.4692
Epoch 60, Giá trị hàm mục tiêu: 0.3155
Epoch 70, Giá trị hàm mục tiêu: 0.2233
Epoch 80, Giá trị hàm mục tiêu: 0.2225
Epoch 90, Giá trị hàm mục tiêu: 0.1400
Epoch 100, Giá trị hàm mục tiêu: 0.6319
Epoch 110, Giá trị hàm mục tiêu: 0.1472
Epoch 120, Giá trị hàm mục tiêu: 0.1190
Epoch 130, Giá trị hàm mục tiêu: 0.1022
Epoch 140, Giá trị hàm mục tiêu: 0.0883
Epoch 150, Giá trị hàm mục tiêu: 0.0779
Epoch 160, Giá trị hàm mục tiêu: 0.1303
Epoch 170, Giá trị hàm mục tiêu: 0.0809
Epoch 180, Giá trị hàm mục tiêu: 0.0704
Epoch 190, Giá trị hàm mục tiêu: 0.1427
Epoch 200, Giá trị hàm mục tiêu: 0.0667
Epoch 210, Giá trị hàm mục tiêu: 0.0611
Epoch 220, Giá trị hàm mục tiêu: 0.0556
Epoch 230, Giá trị hàm mục tiêu: 0.5965
Epoch 240, Giá trị hàm mục 