In [1]:
# @title 1. Setup, Data Loading & Cleaning (SVM Baseline for SST-2)
import os
import re
import time
import psutil
import joblib # Dùng để lưu model sklearn
import numpy as np
import pandas as pd
from google.colab import drive
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

# 1. Mount Drive
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/SST2_SVM'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# 2. Load Dataset (SST-2 from GLUE benchmark)
print("--- Loading SST-2 Dataset ---")
dataset = load_dataset("glue", "sst2")

# 3. Clean Text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Xóa URL
    text = re.sub(r'<br\s*/>', ' ', text) # Xóa HTML
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("--- Cleaning Dataset ---")
# SST-2 sử dụng cột 'sentence' chứa văn bản
# Lưu ý: Tập 'test' của GLUE không có nhãn (label=-1), nên ta dùng 'validation' để đánh giá
X_train_raw = [clean_text(item['sentence']) for item in dataset['train']]
y_train = dataset['train']['label']

X_test_raw = [clean_text(item['sentence']) for item in dataset['validation']]
y_test = dataset['validation']['label']

print(f"Data Prepared: {len(X_train_raw)} Train samples, {len(X_test_raw)} Validation samples")

Mounted at /content/drive
--- Loading SST-2 Dataset ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

--- Cleaning Dataset ---
Data Prepared: 67349 Train samples, 872 Validation samples


In [2]:
# @title 2. Training & Evaluation (SVM + TF-IDF)

# 1. Define Pipeline
# - TfidfVectorizer: Chuyển text sang vector (giữ nguyên config 50k features)
# - LinearSVC: Mô hình SVM
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=50000, ngram_range=(1, 2))),
    ('clf', LinearSVC(C=1.0, random_state=42, dual='auto'))
])

# 2. Training
print("--- Starting Training (SVM on SST-2) ---")
start_train_time = time.time()

# Train trên tập train
pipeline.fit(X_train_raw, y_train)

end_train_time = time.time()
training_time = end_train_time - start_train_time
print(f"Training completed in: {training_time:.2f} seconds")

# 3. Save Model
print(f"Saving model to {SAVE_PATH}...")
model_file = os.path.join(SAVE_PATH, 'svm_sst2_model.joblib')
joblib.dump(pipeline, model_file)
print("SVM Model saved successfully!")

# 4. Evaluation on Validation Set
print("\n--- Running Evaluation on Validation Set ---")

start_pred_time = time.time()
# Dự đoán nhãn
predictions = pipeline.predict(X_test_raw)
# Dự đoán điểm số cho ROC-AUC
decision_scores = pipeline.decision_function(X_test_raw)
end_pred_time = time.time()

# 5. Calculate Metrics
precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average='binary')
acc = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, decision_scores)

# 6. Efficiency Metrics
total_samples = len(y_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# Model Size
model_size = os.path.getsize(model_file) / (1024 * 1024) # MB

# RAM Usage
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2

# 7. Report
print("\n====== REPORT: SVM (Baseline) on SST-2 ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {acc:.4f}")
print(f"   - Precision: {precision:.4f}")
print(f"   - Recall:    {recall:.4f}")
print(f"   - F1-Score:  {f1:.4f}")
print(f"   - ROC-AUC:   {roc_auc:.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Model Size (Disk):  {model_size:.2f} MB")
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")

# Save Results CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Model Size (MB)"],
    "Value": [acc, f1, roc_auc, training_time, latency_per_sample, model_size]
})
results_df.to_csv(os.path.join(SAVE_PATH, 'sst2_svm_results.csv'), index=False)
print(f"\nReport saved to {os.path.join(SAVE_PATH, 'sst2_svm_results.csv')}")

--- Starting Training (SVM on SST-2) ---
Training completed in: 2.91 seconds
Saving model to /content/drive/My Drive/SLM_Research/SST2_SVM...
SVM Model saved successfully!

--- Running Evaluation on Validation Set ---

1. Classification Metrics:
   - Accuracy:  0.8188
   - Precision: 0.8043
   - Recall:    0.8514
   - F1-Score:  0.8271
   - ROC-AUC:   0.9072

2. Efficiency Metrics:
   - Training Time:      2.91 s
   - Inference Latency:  0.0519 ms/sample
   - Model Size (Disk):  2.33 MB
   - Peak RAM Usage:     365.36 MB

Report saved to /content/drive/My Drive/SLM_Research/SST2_SVM/sst2_svm_results.csv
