# Fake News Detection - Training Pipeline

This notebook contains the complete machine learning pipeline for classifying news articles as 'Real' or 'Fake'. 

### Objectives:
1. **Preprocessing**: Clean the raw text data.
2. **EDA**: Perform basic exploratory data analysis (text-length statistics and the most frequent words).
3. **Model Selection**: Train and evaluate multiple models (Logistic Regression, Decision Tree).
4. **Deployment**: Save the trained pipelines, with Logistic Regression as the primary model, for use in the Streamlit application.

### 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import os
import joblib
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Work around SSL certificate errors when NLTK downloads its data on macOS.
# NOTE(security): this replaces the process-wide default HTTPS context factory
# with the unverified one, so certificate verification is disabled for any
# later HTTPS request in this kernel that relies on that default, not only for
# the NLTK downloads below. Prefer installing proper CA certificates.
import ssl
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose the unverified-context factory;
    # keep the default (verifying) behaviour. Only this specific error is
    # ignored so that interrupts and unrelated failures still surface.
    pass

# Download the NLTK resources used by this notebook: the tokenizer models
# (word_tokenize), the stopword lists, and WordNet (WordNetLemmatizer).
# quiet=True suppresses the downloader's console output.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Shared, module-level helpers reused by preprocess_text for every article.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

### 2. Preprocessing Function

In [None]:
def preprocess_text(text):
    """Turn one raw article into a cleaned, space-separated token string.

    Steps: lowercase the text, drop every character that is not a lowercase
    ASCII letter or whitespace, tokenize, discard stopwords, and lemmatize
    the remaining tokens.

    Args:
        text: The raw article text. Anything that is not a ``str``
            (for example ``None`` or a float NaN) is treated as empty.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when the
        input is not a string.
    """
    # guard: non-string input has nothing to clean
    if not isinstance(text, str):
        return ""

    # lowercase first so the character filter only has to keep a-z
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    # walk the tokens, skipping stopwords and lemmatizing the rest
    kept = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

### 3. Data Loading

In [None]:
# Read the cleaned news CSV (path is relative to the working directory) into
# the DataFrame `df` that the following cells operate on.
print("Loading dataset...")
df = pd.read_csv(filepath_or_buffer='data/news_cleaned.csv')
print("Dataset loaded successfully! Shape: " + str(df.shape))

### 4. Dataset Statistics

In [None]:
# Headline numbers for the raw dataset: row count, rows per label value,
# and the mean length of the 'text' column in characters.
print("=" * 50, "DATASET STATISTICS", "=" * 50, sep="\n")
print(f"Total samples: {df.shape[0]}", end="\n\n")
print(
    "Class distribution:",
    f"  Fake (0): {df[df['label'] == 0].shape[0]} articles",
    f"  Real (1): {df[df['label'] == 1].shape[0]} articles",
    "",  # trailing blank line before the length summary
    sep="\n",
)
print(f"Average article length: {df['text'].str.len().mean():.0f} characters")

### 5. Text Preprocessing

Run the preprocessing function on the entire dataset and save the result to `data/news_preprocessed.csv`. This may take a few minutes, depending on the dataset size.

In [None]:
# Clean every article with preprocess_text and store the result next to the
# raw text in a new 'cleaned_text' column.
print("Starting text preprocessing...")
df['cleaned_text'] = df['text'].map(preprocess_text)
print("Preprocessing done!")

# Write the full frame (including the new column) to disk without the index.
df.to_csv(path_or_buf='data/news_preprocessed.csv', index=False)
print("Preprocessed data saved to data/news_preprocessed.csv")

### 6. Exploratory Data Analysis (EDA)

In [None]:
print("=" * 50, "EXPLORATORY DATA ANALYSIS", "=" * 50, sep="\n")

# Character length of each cleaned article, summarised in one report.
lengths = df['cleaned_text'].str.len()
print(
    "Text length stats (after preprocessing):",
    f"  Mean: {lengths.mean():.0f}",
    f"  Median: {lengths.median():.0f}",
    f"  Min: {lengths.min()}",
    f"  Max: {lengths.max()}",
    "",  # trailing blank line before the word-frequency listing
    sep="\n",
)

# Word frequencies across the whole cleaned corpus: concatenate all articles,
# split on whitespace, and count each token.
all_words = ' '.join(df['cleaned_text']).split()
word_counts = Counter(all_words)
print("Top 20 most frequent words:")
for term, freq in word_counts.most_common(20):
    print(f"  {term}: {freq}")

### 7. Train-Test Split

In [None]:
# Hold out 20% of the articles for testing. The split is stratified on the
# label so the train and test sets keep the same class proportions as the
# full dataset, which keeps the reported metrics comparable across runs and
# models. random_state pins the shuffle so the split is reproducible.
X_text_train, X_text_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

print(f"Training set: {len(X_text_train)} samples")
print(f"Testing set: {len(X_text_test)} samples")

### 8. Model Training & Evaluation

In [None]:
def evaluate_model(name, y_true, y_pred):
    """Print a short evaluation report for one set of predictions.

    The report contains a header with the model name, then accuracy,
    precision, recall and F1 (each to four decimal places), followed by the
    confusion matrix.

    Args:
        name: Label shown in the report header.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model, aligned with ``y_true``.

    Returns:
        None. The report is written to standard output only.
    """
    print(f"\n--- {name} ---")

    # label text (with its alignment padding) paired with the metric function
    scorers = (
        ("Accuracy:  ", accuracy_score),
        ("Precision: ", precision_score),
        ("Recall:    ", recall_score),
        ("F1 Score:  ", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label}{scorer(y_true, y_pred):.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

print("=" * 50)
print("MODEL TRAINING")
print("=" * 50)


def _tfidf_pipeline(classifier):
    """Chain a TF-IDF vectorizer (max 5000 features) with the given classifier."""
    steps = [
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('classifier', classifier),
    ]
    return Pipeline(steps)


# 1. Logistic Regression on TF-IDF features
pipeline_lr = _tfidf_pipeline(LogisticRegression(max_iter=1000))

print("\nTraining Pipeline: TF-IDF → Logistic Regression...")
# fit() returns the fitted pipeline, so prediction can be chained onto it
y_pred_lr = pipeline_lr.fit(X_text_train, y_train).predict(X_text_test)
evaluate_model("Pipeline: Logistic Regression + TF-IDF", y_test, y_pred_lr)

# 2. Decision Tree on TF-IDF features (seeded for reproducible trees)
pipeline_dt = _tfidf_pipeline(DecisionTreeClassifier(random_state=42))

print("\nTraining Pipeline: TF-IDF → Decision Tree...")
y_pred_dt = pipeline_dt.fit(X_text_train, y_train).predict(X_text_test)
evaluate_model("Pipeline: Decision Tree + TF-IDF", y_test, y_pred_dt)

### 9. Saving Models

In [None]:
# Make sure the output directory exists before writing any artifact.
os.makedirs('models', exist_ok=True)

# (object, destination) pairs, written in order:
#   - the Logistic Regression pipeline as the primary deployment model,
#   - its fitted TF-IDF step on its own,
#   - both full pipelines under descriptive names for reference.
_artifacts = (
    (pipeline_lr, 'models/model.pkl'),
    (pipeline_lr.named_steps['tfidf'], 'models/vectorizer.pkl'),
    (pipeline_lr, 'models/pipeline_lr_tfidf.pkl'),
    (pipeline_dt, 'models/pipeline_dt_tfidf.pkl'),
)
for _artifact, _destination in _artifacts:
    joblib.dump(_artifact, _destination)

print("All models saved successfully to models/ directory!")