In [4]:
# @title 1. Setup, Data Loading & Cleaning (Logistic Regression for SST-2)
import os
import re
import time
import psutil
import joblib
import numpy as np
import pandas as pd
from google.colab import drive
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

# 1. Mount Drive
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/SST2_LogisticRegression'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# 2. Load Dataset (SST-2 from GLUE)
print("--- Loading SST-2 Dataset ---")
dataset = load_dataset("glue", "sst2")

# 3. Clean Text Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Xóa URL
    text = re.sub(r'<br\s*/>', ' ', text) # Xóa HTML
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("--- Cleaning Dataset ---")
# SST-2 dùng cột 'sentence'
# Train set
X_train_raw = [clean_text(item['sentence']) for item in dataset['train']]
y_train = dataset['train']['label']

# Validation set (Dùng làm test set vì GLUE test set không có nhãn)
X_test_raw = [clean_text(item['sentence']) for item in dataset['validation']]
y_test = dataset['validation']['label']

print(f"Data Prepared: {len(X_train_raw)} Train samples, {len(X_test_raw)} Validation samples")
print("Phần 1: Sẵn sàng!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- Loading SST-2 Dataset ---
--- Cleaning Dataset ---
Data Prepared: 67349 Train samples, 872 Validation samples
Phần 1: Sẵn sàng!


In [5]:
# @title 2. Training & Evaluation (Logistic Regression)

# 1. Define Pipeline
# - TfidfVectorizer: Chuyển text sang vector (50k features)
# - LogisticRegression: Mô hình phân lớp
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=50000, ngram_range=(1, 2))),
    ('clf', LogisticRegression(C=1.0, random_state=42, max_iter=1000, solver='liblinear'))
    # solver='liblinear' thường tốt cho dataset nhỏ/vừa và binary classification
])

# 2. Training
print("--- Starting Training (Logistic Regression on SST-2) ---")
start_train_time = time.time()

pipeline.fit(X_train_raw, y_train)

end_train_time = time.time()
training_time = end_train_time - start_train_time
print(f"Training completed in: {training_time:.2f} seconds")

# 3. Save Model
print(f"Saving model to {SAVE_PATH}...")
model_file = os.path.join(SAVE_PATH, 'logreg_sst2_model.joblib')
joblib.dump(pipeline, model_file)
print("Model saved successfully!")

# 4. Evaluation on Validation Set
print("\n--- Running Evaluation on Validation Set ---")

start_pred_time = time.time()
# Dự đoán nhãn
predictions = pipeline.predict(X_test_raw)
# Dự đoán xác suất (Logistic Regression hỗ trợ predict_proba)
# Lấy cột 1 là xác suất của lớp Positive
probs = pipeline.predict_proba(X_test_raw)[:, 1]
end_pred_time = time.time()

# 5. Calculate Metrics
precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average='binary')
acc = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probs)

# 6. Efficiency Metrics
total_samples = len(y_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# Model Size
model_size = os.path.getsize(model_file) / (1024 * 1024) # MB

# RAM Usage
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2

# 7. Report
print("\n====== REPORT: Logistic Regression on SST-2 ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {acc:.4f}")
print(f"   - Precision: {precision:.4f}")
print(f"   - Recall:    {recall:.4f}")
print(f"   - F1-Score:  {f1:.4f}")
print(f"   - ROC-AUC:   {roc_auc:.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Model Size (Disk):  {model_size:.2f} MB")
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")

# Save Results CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1", "ROC-AUC",
               "Training Time (s)", "Inference Latency (ms)", "Model Size (MB)",
               "Peak RAM (MB)"],
    "Value": [acc, precision, recall, f1, roc_auc,
              training_time, latency_per_sample, model_size, ram_usage]
})
results_file = os.path.join(SAVE_PATH, 'sst2_logreg_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nBáo cáo đầy đủ đã được lưu tại: {results_file}")

--- Starting Training (Logistic Regression on SST-2) ---
Training completed in: 3.04 seconds
Saving model to /content/drive/My Drive/SLM_Research/SST2_LogisticRegression...
Model saved successfully!

--- Running Evaluation on Validation Set ---

1. Classification Metrics:
   - Accuracy:  0.8085
   - Precision: 0.7879
   - Recall:    0.8536
   - F1-Score:  0.8195
   - ROC-AUC:   0.9084

2. Efficiency Metrics:
   - Training Time:      3.04 s
   - Inference Latency:  0.0515 ms/sample
   - Model Size (Disk):  2.33 MB
   - Peak RAM Usage:     353.57 MB

Báo cáo đầy đủ đã được lưu tại: /content/drive/My Drive/SLM_Research/SST2_LogisticRegression/sst2_logreg_results.csv
