In [1]:
import pandas as pd
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reaching this line means every import above resolved; a failed import
# would have raised before the banner is printed.
print("Libraries Loaded Successfully! üöÄ")

Libraries Loaded Successfully! üöÄ


In [2]:
# --- CONFIGURATION ---
# One entry per model to train. Each entry records where the CSV lives, how
# to parse it ("sep", optional "encoding"), which columns hold the text and
# the label, and the file the fitted pipeline is written to.

# Settings common to every model that is trained from the SMS spam CSV.
_sms_source = {
    "path": "dataset/spam.csv",
    "encoding": "latin-1",
    "sep": ",",
    "col_text": "v2",
    "col_label": "v1",
}

models_config = {
    "sms": {**_sms_source, "filename": "models/sms_spam_model.pkl"},
    # Using SMS data for Email (Safe fallback): same source file and columns,
    # only the output filename differs.
    "email": {**_sms_source, "filename": "models/email_spam_model.pkl"},
    # This entry deliberately has no "encoding" key.
    "news": {
        "path": "dataset/WELFake_Dataset.csv",
        "sep": ",",
        "col_text": "title",
        "col_label": "label",
        "filename": "models/fake_news_model.pkl",
    },
}
print("Configuration Set! ‚úÖ")

Configuration Set! ‚úÖ


In [3]:
# Train one TF-IDF + linear-SVM pipeline per entry in `models_config` and
# persist each fitted pipeline with joblib. A missing dataset or a failure
# while training one model does not stop the others; every failure is
# recorded and reported at the end.
print("üöÄ Starting Fast Training...")

# Row cap applied to the large NEWS dataset so training stays fast.
NEWS_ROW_LIMIT = 3000
# SVC(probability=True) runs an internal, randomized cross-validation to
# calibrate probabilities; a fixed seed makes the saved models repeatable.
RANDOM_STATE = 42

# Keys of models that were skipped (missing file) or raised during training.
failed_models = []

for key, config in models_config.items():
    print("\n------------------------------------------------")
    print(f"üì° Processing Model: {key.upper()}")

    if not os.path.exists(config['path']):
        print(f"‚ùå Error: File not found at {config['path']}")
        failed_models.append(key)
        continue

    try:
        # 1. Load Data ("encoding" is optional in the config entry)
        read_kwargs = {"sep": config['sep']}
        if "encoding" in config:
            read_kwargs["encoding"] = config['encoding']
        df = pd.read_csv(config['path'], **read_kwargs)

        # --- THE SPEED FIX ---
        # If it is the huge NEWS dataset, only take the first rows.
        # .copy() detaches the slice from the full frame so the column
        # assignment below does not trigger SettingWithCopyWarning.
        if key == 'news':
            df = df.iloc[:NEWS_ROW_LIMIT].copy()
            print(f"   ‚ö†Ô∏è TRUNCATED data to {NEWS_ROW_LIMIT:,} rows for speed!")
        # ---------------------

        # 2. Prepare Columns
        text_col = config['col_text']
        label_col = config['col_label']

        # SMS/Email Standardization
        if key in ['sms', 'email']:
            # Fall back to positional names when the CSV header lacks 'v1'.
            if 'v1' not in df.columns:
                df.columns = ['v1', 'v2', 'u1', 'u2', 'u3']
            df['label_num'] = df['v1'].map({'ham': 0, 'spam': 1})
            text_col = 'v2'

        # News Standardization
        if key == 'news':
            # WELFake: 1=Real, 0=Fake. We want 0=Safe, 1=Danger.
            # NOTE(review): label polarity is taken from the original author's
            # comment -- confirm against the dataset documentation.
            df['label_num'] = df[label_col].map({1: 0, 0: 1})

        # 3. Clean: rows whose label did not match the mapping are NaN here
        # and are dropped together with rows that have no text.
        df = df.dropna(subset=[text_col, 'label_num'])
        X = df[text_col].astype(str)
        y = df['label_num'].astype(int)

        # 4. Train (No Split needed for final model, helps speed)
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
            ('svm', SVC(kernel='linear', probability=True, random_state=RANDOM_STATE))
        ])

        print(f"   ‚è≥ Training on {len(df)} rows...")
        pipeline.fit(X, y)

        # 5. Save (create the output directory first so a fresh checkout
        # without a models/ folder does not fail at the dump step)
        output_dir = os.path.dirname(config['filename'])
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        joblib.dump(pipeline, config['filename'])
        print(f"   üíæ Saved to {config['filename']}")

    except Exception as e:
        # Best effort: report the problem and carry on with the next model.
        print(f"‚ùå Error: {e}")
        failed_models.append(key)

# Only claim success when every model was actually trained and saved.
if failed_models:
    print(f"\nTraining finished, but these models failed: {', '.join(failed_models)}")
else:
    print("\n‚ú® All Models Trained Successfully!")

üöÄ Starting Fast Training...

------------------------------------------------
üì° Processing Model: SMS
   ‚è≥ Training on 5572 rows...
   üíæ Saved to models/sms_spam_model.pkl

------------------------------------------------
üì° Processing Model: EMAIL
   ‚è≥ Training on 5572 rows...
   üíæ Saved to models/email_spam_model.pkl

------------------------------------------------
üì° Processing Model: NEWS
   ‚ö†Ô∏è TRUNCATED data to 3,000 rows for speed!
   ‚è≥ Training on 2977 rows...
   üíæ Saved to models/fake_news_model.pkl

‚ú® All Models Trained Successfully!
