# In [None]:
# ml_modeling.py
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

def load_data(path='cleaned_news_dataset.csv'):
    """Load the cleaned dataset from a CSV file.

    Args:
        path: Location of the CSV file to read. Defaults to
            ``'cleaned_news_dataset.csv'`` in the current working directory,
            which is the file this function always read before the
            parameter existed.

    Returns:
        The file contents as a pandas DataFrame.
    """
    print("Loading cleaned dataset...")
    df = pd.read_csv(path)
    print(f"✅ Dataset loaded: {df.shape}")
    return df

def prepare_features(df):
    """Split the dataset into stratified training and test sets.

    Args:
        df: DataFrame with a ``combined_text`` column (model input) and a
            ``label`` column (target). The printed summary treats label
            ``0`` as fake news and label ``1`` as true news.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` from an 80/20 split
        with ``random_state=42``, stratified on the label.
    """
    X = df['combined_text']
    y = df['label']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Count by explicit comparison instead of value_counts()[k]: integer
    # indexing on a Series is ambiguous (label vs. position) and raises
    # KeyError when a class is missing from the training split.
    fake_count = int((y_train == 0).sum())
    true_count = int((y_train == 1).sum())

    print(f"📊 Data split:")
    print(f"   Training set: {X_train.shape[0]:,} samples")
    print(f"   Test set: {X_test.shape[0]:,} samples")
    print(f"   Fake news in train: {fake_count:,}")
    print(f"   True news in train: {true_count:,}")

    return X_train, X_test, y_train, y_test

def train_and_evaluate_models(X_train, y_train):
    """Cross-validate and fit each candidate text-classification pipeline.

    Two candidates are built, each a TF-IDF vectorizer followed by a
    classifier (Logistic Regression or Random Forest). Every candidate is
    scored with 5-fold cross-validation using F1, then fitted on the full
    training data.

    Args:
        X_train: Training texts.
        y_train: Training labels.

    Returns:
        Dict mapping model name to a dict with keys ``'pipeline'`` (the
        fitted pipeline), ``'cv_mean'``, ``'cv_std'`` and ``'cv_scores'``.
    """

    def make_vectorizer():
        # Each pipeline gets its own vectorizer instance with the same settings.
        return TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')

    candidates = {
        'Logistic Regression': Pipeline([
            ('tfidf', make_vectorizer()),
            ('lr', LogisticRegression(random_state=42, max_iter=1000)),
        ]),
        'Random Forest': Pipeline([
            ('tfidf', make_vectorizer()),
            ('rf', RandomForestClassifier(random_state=42, n_estimators=100)),
        ]),
    }

    summary = {}
    print("\n🤖 Training and evaluating models...")
    print("="*50)

    for model_name, model in candidates.items():
        fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')

        # Fit on the full training set so the returned pipeline can predict.
        model.fit(X_train, y_train)

        mean_f1 = fold_scores.mean()
        std_f1 = fold_scores.std()
        summary[model_name] = {
            'pipeline': model,
            'cv_mean': mean_f1,
            'cv_std': std_f1,
            'cv_scores': fold_scores,
        }

        # The reported spread is two standard deviations of the fold scores.
        print(f"{model_name:20} | F1 Score: {mean_f1:.4f} (±{std_f1*2:.4f})")

    return summary

def select_best_model(results, X_train, y_train):
    """Pick the entry with the highest mean cross-validation F1 score.

    Args:
        results: Dict mapping model name to a dict that holds at least
            ``'pipeline'`` and ``'cv_mean'``.
        X_train: Not used by this function; kept in the signature for callers.
        y_train: Not used by this function; kept in the signature for callers.

    Returns:
        Tuple ``(pipeline, model_name)`` of the winning entry.
    """
    # max() keeps the first maximal item, so ties go to the earliest entry.
    winner_name, winner = max(results.items(), key=lambda item: item[1]['cv_mean'])
    winner_pipeline = winner['pipeline']

    print(f"\n🏆 Best Model: {winner_name}")
    print(f"🏆 Cross-validation F1 Score: {winner['cv_mean']:.4f}")

    return winner_pipeline, winner_name

def hyperparameter_tuning(X_train, y_train):
    """Grid-search a TF-IDF + Logistic Regression pipeline.

    The tuned estimator is always Logistic Regression (liblinear solver);
    this function does not look at which model won any earlier comparison.
    The search covers the vectorizer's vocabulary size and n-gram range and
    the classifier's regularisation strength ``C``, scored by F1 with 5-fold
    cross-validation.

    Args:
        X_train: Training texts.
        y_train: Training labels.

    Returns:
        The best estimator found by the grid search.
    """
    print("\n🎯 Performing hyperparameter tuning...")

    # Base pipeline whose steps the grid below overrides.
    base_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('lr', LogisticRegression(random_state=42, max_iter=1000, solver='liblinear')),
    ])

    # Keys use the "<step>__<parameter>" form to address pipeline steps.
    search_space = {
        'tfidf__max_features': [3000, 5000, 7000],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'lr__C': [0.1, 1.0, 10.0],
        'lr__penalty': ['l2'],  # only the l2 penalty is searched
    }

    search = GridSearchCV(
        base_pipeline,
        search_space,
        cv=5,
        scoring='f1',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    print(f"✅ Best parameters: {search.best_params_}")
    print(f"✅ Best CV score: {search.best_score_:.4f}")

    return search.best_estimator_

def evaluate_final_model(model, X_test, y_test):
    """Report test-set metrics for a fitted model.

    Prints accuracy, precision, recall and F1, followed by a classification
    report whose two classes are labelled 'Fake' and 'True'.

    Args:
        model: Fitted estimator providing ``predict`` and ``predict_proba``.
        X_test: Test texts.
        y_test: Test labels.

    Returns:
        Tuple ``(y_pred, y_pred_proba)`` of predicted labels and predicted
        class probabilities.
    """
    print("\n📊 Final Model Evaluation on Test Set")
    print("="*50)

    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)

    # Compute every metric before printing any of them.
    metric_rows = (
        ("Accuracy:  ", accuracy_score(y_test, predictions)),
        ("Precision: ", precision_score(y_test, predictions)),
        ("Recall:    ", recall_score(y_test, predictions)),
        ("F1 Score:  ", f1_score(y_test, predictions)),
    )
    for label, value in metric_rows:
        print(f"{label}{value:.4f}")

    print("\n📋 Detailed Classification Report:")
    report = classification_report(y_test, predictions, target_names=['Fake', 'True'])
    print(report)

    return predictions, probabilities

def save_model(model, model_name):
    """Write a trained model to disk with joblib.

    The file name is ``model_name`` with spaces replaced by underscores,
    lower-cased, and suffixed with ``_model.pkl``. It is written relative
    to the current working directory.

    Args:
        model: Object to serialise.
        model_name: Human-readable name used to build the file name.

    Returns:
        The file name the model was written to.
    """
    slug = model_name.replace(" ", "_").lower()
    filename = f'{slug}_model.pkl'
    joblib.dump(model, filename)
    print(f"✅ Model saved as: {filename}")
    return filename

def main():
    """Run the full modeling workflow.

    Steps: load the dataset, split it, cross-validate and fit the candidate
    models, pick the best one, grid-search a Logistic Regression pipeline,
    evaluate both the best candidate and the tuned model on the test set,
    and save both models to disk.

    Returns:
        Tuple ``(tuned_model, X_test, y_test)``.
    """
    print("=" * 60)
    print("🤖 MACHINE LEARNING MODELING")
    print("=" * 60)

    # Load and prepare data
    df = load_data()
    X_train, X_test, y_train, y_test = prepare_features(df)

    # Train and evaluate models
    results = train_and_evaluate_models(X_train, y_train)

    # Select best model
    best_model, best_model_name = select_best_model(results, X_train, y_train)

    # Hyperparameter tuning. hyperparameter_tuning() always tunes a Logistic
    # Regression pipeline, whichever candidate won above, so the tuned model
    # is named after that algorithm rather than after best_model_name.
    tuned_model_name = "Logistic Regression"
    tuned_model = hyperparameter_tuning(X_train, y_train)

    # Evaluate both models (the returned predictions are not needed here)
    print("\n" + "="*50)
    print("Original Best Model:")
    evaluate_final_model(best_model, X_test, y_test)

    print("\n" + "="*50)
    print("Tuned Model:")
    evaluate_final_model(tuned_model, X_test, y_test)

    # Save models
    orig_model_file = save_model(best_model, f"original_{best_model_name}")
    tuned_model_file = save_model(tuned_model, f"tuned_{tuned_model_name}")

    print(f"\n✅ Modeling completed!")
    print(f"   Original model: {orig_model_file}")
    print(f"   Tuned model: {tuned_model_file}")

    return tuned_model, X_test, y_test

if __name__ == "__main__":
    # When run as a script, execute the workflow and bind main()'s return
    # values (tuned model, test features, test labels) to module-level names.
    model, X_test, y_test = main()