In [1]:
# =====================================================
# BASELINE: SVM (LinearSVC) - WELFake
# Output Format: HuggingFace Style
# =====================================================

import os, re, psutil, pickle, time
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from google.colab import drive

# 1. MOUNT DRIVE & SETUP
# Mount Google Drive only when the mount point does not exist yet.
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError as mount_error:
        # Best-effort: the run continues, but the failure is reported instead
        # of being silently swallowed. Without a mounted Drive, the directory
        # created below is an ordinary local folder at the same path.
        print(
            f"WARNING: Google Drive mount failed ({mount_error}); "
            "outputs will be written to a local directory at the same path."
        )

# Directory that receives the pickled model and vectorizer.
OUTPUT_DIR = "/content/drive/MyDrive/WELFake_SVM_Baseline"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2. LOAD & CLEAN DATA
print("‚è≥ ƒêang t·∫£i dataset WELFake...")
dataset = load_dataset("davanstrien/WELFake")
# Convert the "train" split with the dataset's own bulk conversion instead of
# handing the Dataset object to the DataFrame constructor, which has to walk
# it row by row in Python.
df = dataset["train"].to_pandas()

def clean_text_ml(s):
    """Normalize raw text for bag-of-words style features.

    Non-string input yields an empty string. Otherwise the text is
    lower-cased, then http(s) URLs, angle-bracket tags and every character
    that is not ``a-z``, ``0-9`` or whitespace are removed (in that order),
    and finally runs of whitespace are collapsed to a single space with the
    ends trimmed.
    """
    if not isinstance(s, str):
        return ""

    cleaned = s.lower()
    # Order matters: URLs and tags must go before punctuation is stripped,
    # otherwise their delimiters would already be gone.
    removal_patterns = (
        r'https?://\S+',
        r'<.*?>',
        r'[^a-z0-9\s]',
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()

print("üßπ Pre-processing...")
# Join title and body (missing values become empty strings) and clean the result.
titles = df['title'].fillna('')
bodies = df['text'].fillna('')
df['content'] = (titles + " " + bodies).apply(clean_text_ml)
# Drop samples whose cleaned content is too short (50 characters or fewer).
is_long_enough = df['content'].str.len() > 50
df = df[is_long_enough]

# 3. SPLIT
print("‚úÇÔ∏è Chia t·∫≠p d·ªØ li·ªáu...")
# 80/20 train/test split, stratified on the label column, with a fixed seed.
labels = df['label']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['content'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# 4. TF-IDF
print("‚öôÔ∏è Vectorizing (TF-IDF)...")
# SVMs cope well with high-dimensional input, so max_features can be raised
# somewhat if desired.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=50000,
)
# The vocabulary is fitted on the training texts only; the test texts are
# transformed with that same fitted vocabulary.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# 5. TRAIN SVM
print("üöÄ Training SVM (LinearSVC)...")
# LinearSVC is much faster than SVC(kernel='linear') for text classification.
# It is wrapped in CalibratedClassifierCV so that predict_proba is available
# for the AUC computation.
svm_model = LinearSVC(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
)
clf = CalibratedClassifierCV(svm_model).fit(X_train, y_train)

# =====================================================
# 6. EVALUATION (HuggingFace Style Format)
# =====================================================
print("\nüéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can be coarse or jump with wall-clock changes.
start_time = time.perf_counter()

# Predict
y_pred = clf.predict(X_test)
# Column 1 of predict_proba is used as the positive-class score for AUC.
# NOTE(review): the original comment calls this class "Real" - confirm the
# label encoding against the dataset card.
y_prob = clf.predict_proba(X_test)[:, 1]

end_time = time.perf_counter()
runtime = end_time - start_time
# Guard against a zero-length interval so throughput never raises.
samples_per_second = len(y_test) / runtime if runtime > 0 else float('inf')

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
# Cast to a built-in float so every numeric entry in the results dict has the
# same type and prints uniformly.
auc = float(roc_auc_score(y_test, y_prob))

# Keys mirror the names used in HuggingFace-style evaluation output; entries
# with no SVM equivalent are filled with placeholder strings.
eval_results = {
    'eval_accuracy': accuracy,
    'eval_precision': precision,
    'eval_recall': recall,
    'eval_f1': f1,
    'eval_auc': auc,
    'eval_loss': 'N/A (SVM)',
    'eval_runtime': runtime,
    'eval_samples_per_second': samples_per_second,
    'eval_steps_per_second': 'N/A'
}

# Print the results dict framed by 50-character rules.
separator = "=" * 50
print("\n" + separator)
print("üìä K·∫æT QU·∫¢ SVM BASELINE:")
print(separator)
print(eval_results)
print(separator)

# 7. SAVE
print(f"\nüíæ ƒêang l∆∞u model v√†o {OUTPUT_DIR}...")
# Pickle the calibrated classifier and the fitted vectorizer side by side so
# both are available when the model is reloaded.
artifacts = (
    ("svm_model.pkl", clf),
    ("tfidf_vectorizer.pkl", vectorizer),
)
for filename, artifact in artifacts:
    target_path = os.path.join(OUTPUT_DIR, filename)
    with open(target_path, "wb") as handle:
        pickle.dump(artifact, handle)
print("‚úÖ Ho√†n t·∫•t!")

Mounted at /content/drive
‚è≥ ƒêang t·∫£i dataset WELFake...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-290868f0a36350(…):   0%|          | 0.00/152M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/72134 [00:00<?, ? examples/s]

üßπ Pre-processing...
‚úÇÔ∏è Chia t·∫≠p d·ªØ li·ªáu...
‚öôÔ∏è Vectorizing (TF-IDF)...
üöÄ Training SVM (LinearSVC)...

üéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...

üìä K·∫æT QU·∫¢ SVM BASELINE:
{'eval_accuracy': 0.976549996520771, 'eval_precision': 0.9765563445613139, 'eval_recall': 0.976549996520771, 'eval_f1': 0.9765486227470757, 'eval_auc': np.float64(0.9967980311318841), 'eval_loss': 'N/A (SVM)', 'eval_runtime': 0.1357572078704834, 'eval_samples_per_second': 105858.09936302152, 'eval_steps_per_second': 'N/A'}

üíæ ƒêang l∆∞u model v√†o /content/drive/MyDrive/WELFake_SVM_Baseline...
‚úÖ Ho√†n t·∫•t!
