# Fake News Detection - Model Training Pipeline

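This notebook demonstrates the complete training pipeline for the fake news detection model: loading a labelled CSV (`text` and `label` columns), cleaning the text, splitting into train and test sets, extracting TF-IDF features, training a neural network classifier, and evaluating it on the held-out test set. The fitted vectorizer and the trained model are saved under `../models/`.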


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Add src directory to path
sys.path.append('../src')

# Import our custom classes
from preprocessor import Preprocessor
from feature_extractor import FeatureExtractor
from model_trainer import ModelTrainer


## 1. Load Dataset


In [None]:
# Load the dataset
# Note: Replace 'your_dataset.csv' with your actual dataset filename
dataset_path = '../data/raw/your_dataset.csv'

try:
    df = pd.read_csv(dataset_path)
except FileNotFoundError:
    print(f"Dataset not found at {dataset_path}")
    print("Please download a dataset and place it in the data/raw/ directory")
    print("Expected columns: 'text' and 'label'")
    print("\nExample dataset structure:")
    print("text,label")
    print("This is a fake news article,1")
    print("This is a real news article,0")
    # Re-raise so execution stops here. Every later cell needs `df`; swallowing
    # the error would only surface as a confusing NameError further down.
    raise

# The following cells read df['text'] and df['label'], so fail early with a
# clear message if the file does not have the expected schema.
required_columns = ['text', 'label']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"Dataset at {dataset_path} is missing required column(s): {missing_columns}. "
        f"Found columns: {df.columns.tolist()}"
    )

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst few rows:")
print(df.head())


## 2. Data Preprocessing


In [None]:
# Create the project's text preprocessor (imported from ../src)
preprocessor = Preprocessor()

# Run clean_text over every entry of the 'text' column and store the result
# in a new 'cleaned_text' column; the original 'text' column is left untouched.
print("Preprocessing text data...")
raw_text = df['text']
df['cleaned_text'] = raw_text.apply(preprocessor.clean_text)
print("Preprocessing completed!")

# Show the first row before and after cleaning as a quick sanity check
for heading, column in (("Original", 'text'), ("Cleaned", 'cleaned_text')):
    print(f"\n{heading} text example:")
    print(df[column].iloc[0])


## 3. Train-Test Split


In [None]:
# Inputs are the cleaned texts, targets are the labels
X = df['cleaned_text'].values
y = df['label'].values

# Hold out 20% of the rows for testing. The split is stratified on the labels
# and uses a fixed random_state so it is the same on every run.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each split, then the per-class counts of each split
# (np.bincount requires the labels to be non-negative integers)
split_texts = {"Training": X_train, "Test": X_test}
split_labels = {"Training": y_train, "Test": y_test}

for split_name, texts in split_texts.items():
    print(f"{split_name} set size: {len(texts)}")

for split_name, labels in split_labels.items():
    print(f"{split_name} labels distribution: {np.bincount(labels)}")


## 4. Feature Extraction


In [None]:
# Create the project's feature extractor in TF-IDF mode
feature_extractor = FeatureExtractor(method='tfidf')

# Training split: called with fit=True so the extractor is fitted on
# training data only
print("Generating features for training data...")
X_train_features = feature_extractor.generate_features(X_train, fit=True)

# Test split: called with fit=False so the test data is only transformed
# and never influences the fitted extractor
print("Generating features for test data...")
X_test_features = feature_extractor.generate_features(X_test, fit=False)

# Report the shape of both feature matrices
for split_name, matrix in (("Training", X_train_features), ("Test", X_test_features)):
    print(f"{split_name} features shape: {matrix.shape}")

# Persist the extractor's vectorizer, creating the models directory if needed
vectorizer_path = '../models/tfidf_vectorizer.pkl'
os.makedirs('../models', exist_ok=True)
joblib.dump(feature_extractor.vectorizer, vectorizer_path)
print(f"Vectorizer saved to {vectorizer_path}")


## 5. Model Training


In [None]:
# Build the classifier
# Note: ModelTrainer is not used in this cell. A plain feed-forward network is
# defined directly with Keras so that its input width matches the number of
# columns in the TF-IDF feature matrix.
# NOTE(review): Dense layers expect dense array input. If generate_features
# returns a scipy sparse matrix, it will need converting before fitting
# (e.g. with .toarray()) — TODO confirm against FeatureExtractor.
import random

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Seed the Python, NumPy and TensorFlow random number generators before any
# layer is created, so weight initialisation, dropout masks and batch shuffling
# are repeatable across "Restart & Run All". The value matches the
# random_state used for the train/test split.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Three hidden ReLU layers of decreasing width, each followed by dropout,
# ending in a single sigmoid unit that outputs a probability for label 1.
# The input shape is declared with an explicit Input layer rather than the
# legacy `input_shape=` argument on the first Dense layer.
model = Sequential([
    Input(shape=(X_train_features.shape[1],)),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy pairs with the sigmoid output for two-class labels
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("Model architecture:")
model.summary()


In [None]:
# Fit the network on the training features
print("Training the model...")

# Training hyperparameters: 20 epochs in batches of 32, with 20% of the
# training data held back by Keras for validation, and per-epoch progress output
fit_options = {
    'epochs': 20,
    'batch_size': 32,
    'validation_split': 0.2,
    'verbose': 1,
}
history = model.fit(X_train_features, y_train, **fit_options)

# Write the trained model to disk in HDF5 format
model_path = '../models/fake_news_model.h5'
model.save(model_path)
print(f"Model saved to {model_path}")


## 6. Model Evaluation


In [None]:
# Display names for the two classes, in label order (0, then 1)
class_names = ['Real News', 'Fake News']

# Score the test set, then turn probabilities into hard 0/1 labels:
# anything strictly above 0.5 is predicted as class 1
y_pred_proba = model.predict(X_test_features)
above_threshold = y_pred_proba > 0.5
y_pred = above_threshold.astype(int).flatten()

# Per-class precision / recall / F1
print("Classification Report:")
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

# Raw confusion matrix counts (rows: true label, columns: predicted label)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Same matrix as an annotated heatmap
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
