In [2]:
# =====================================================
# BASELINE: LOGISTIC REGRESSION - WELFAKE
# Output Format: HuggingFace Style
# =====================================================

import os, re, psutil, pickle, time
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from google.colab import drive

# 1. MOUNT DRIVE & SETUP
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError: pass

OUTPUT_DIR = "/content/drive/MyDrive/WELFake_LogReg_Baseline"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2. LOAD & CLEAN DATA
print("‚è≥ ƒêang t·∫£i dataset WELFake...")
dataset = load_dataset("davanstrien/WELFake")
df = pd.DataFrame(dataset["train"])

def clean_text_ml(s):
    if not isinstance(s, str): return ""
    s = s.lower()
    s = re.sub(r'https?://\S+', '', s)
    s = re.sub(r'<.*?>', '', s)
    s = re.sub(r'[^a-z0-9\s]', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

print("üßπ Pre-processing...")
df['content'] = (df['title'].fillna('') + " " + df['text'].fillna('')).apply(clean_text_ml)
df = df[df['content'].str.len() > 50]

# 3. SPLIT
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['content'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# 4. TF-IDF
print("‚öôÔ∏è Vectorizing (TF-IDF)...")
vectorizer = TfidfVectorizer(max_features=50000, stop_words='english', ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# 5. TRAIN
print("üöÄ Training Logistic Regression...")
model = LogisticRegression(solver='liblinear', C=1.0, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# =====================================================
# 6. EVALUATION (HuggingFace Style Format)
# =====================================================
print("\nüéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...")

# B·∫Øt ƒë·∫ßu ƒëo th·ªùi gian d·ª± ƒëo√°n
start_time = time.time()

# D·ª± ƒëo√°n nh√£n v√† x√°c su·∫•t
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1] # L·∫•y x√°c su·∫•t l·ªõp 1 (Real)

# K·∫øt th√∫c ƒëo th·ªùi gian
end_time = time.time()
runtime = end_time - start_time
samples_per_second = len(y_test) / runtime

# T√≠nh to√°n c√°c ch·ªâ s·ªë
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_prob)

# T·∫°o dictionary k·∫øt qu·∫£ gi·ªëng HuggingFace
eval_results = {
    'eval_accuracy': accuracy,
    'eval_precision': precision,
    'eval_recall': recall,
    'eval_f1': f1,
    'eval_auc': auc,
    'eval_loss': 'N/A (LogReg)', # Logistic Regression sklearn kh√¥ng tr·∫£ v·ªÅ loss theo epoch nh∆∞ DL
    'eval_runtime': runtime,
    'eval_samples_per_second': samples_per_second,
    'eval_steps_per_second': 'N/A' # Kh√¥ng √°p d·ª•ng cho sklearn
}

print("\n" + "="*50)
print("üìä K·∫æT QU·∫¢ CU·ªêI C√ôNG:")
print("="*50)
print(eval_results)
print("="*50)

# 7. SAVE
with open(os.path.join(OUTPUT_DIR, "logreg_model.pkl"), "wb") as f:
    pickle.dump(model, f)
with open(os.path.join(OUTPUT_DIR, "tfidf_vectorizer.pkl"), "wb") as f:
    pickle.dump(vectorizer, f)
print(f"\n‚úÖ ƒê√£ l∆∞u model t·∫°i: {OUTPUT_DIR}")

‚è≥ ƒêang t·∫£i dataset WELFake...
üßπ Pre-processing...
‚öôÔ∏è Vectorizing (TF-IDF)...
üöÄ Training Logistic Regression...





üéØ ƒêANG ƒê√ÅNH GI√Å (TEST SET)...

üìä K·∫æT QU·∫¢ CU·ªêI C√ôNG:
{'eval_accuracy': 0.961310973488275, 'eval_precision': 0.9613805053894593, 'eval_recall': 0.961310973488275, 'eval_f1': 0.9613027825808775, 'eval_auc': np.float64(0.9934692191873555), 'eval_loss': 'N/A (LogReg)', 'eval_runtime': 0.01752948760986328, 'eval_samples_per_second': 819818.6005113976, 'eval_steps_per_second': 'N/A'}

‚úÖ ƒê√£ l∆∞u model t·∫°i: /content/drive/MyDrive/WELFake_LogReg_Baseline
