# Dissertation Notebook
End-to-end pipeline for fake news detection using classical and deep learning models.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Loading

In [None]:
# Read the two source files and tag every row with its class:
# 0 for rows coming from Fake.csv, 1 for rows coming from True.csv.
fake = pd.read_csv('data/Fake.csv.zip').assign(label=0)
true = pd.read_csv('data/True.csv.zip').assign(label=1)

# Stack both frames into one dataset and renumber the rows from 0.
df = pd.concat([fake, true], ignore_index=True)
df.head()

## Exploratory Data Analysis

In [None]:
# Structural overview of the combined frame (df.info() prints its own report).
df.info()

# Class balance and per-column missing-value counts.
# Both are printed explicitly: a bare expression is only echoed when it is the
# last statement of a notebook cell, so the label counts would otherwise be
# computed and silently discarded.
print(df['label'].value_counts())
print(df.isna().sum())

## Text Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK resource used by the preprocessing below:
#   punkt / punkt_tab - tokenizer data for nltk.word_tokenize
#                       (newer NLTK releases look up 'punkt_tab')
#   wordnet           - dictionary behind WordNetLemmatizer
#   stopwords         - corpus read by stopwords.words('english'); without it
#                       that call raises LookupError on a fresh install
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

# A set keeps the per-token stop-word membership test cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one raw article into a space-separated string of lemmas.

    Every character outside a-z/A-Z is replaced by a space, the result is
    lowercased and tokenized with NLTK, stop words and tokens of two
    characters or fewer are dropped, and the remaining tokens are lemmatized.

    Args:
        text: Raw article text. Any value that is not a ``str`` (for example
            the NaN pandas uses for a missing cell, or ``None``) is treated
            as an empty document.

    Returns:
        The surviving lemmas joined by single spaces, or ``''`` when the
        input is not a string or no token passes the filters.
    """
    # re.sub raises TypeError on non-string input; a missing article body
    # should yield an empty document rather than abort the whole .apply().
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = nltk.word_tokenize(letters_only.lower())
    # Filter before lemmatizing so stop words are never sent to the lemmatizer.
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]
    return ' '.join(kept)

# Run the cleaning routine over every article body and store the result
# alongside the raw text in a new column.
raw_articles = df['text']
df['clean_text'] = raw_articles.apply(clean_text)

## Feature Extraction and Classical Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hold out 20% of the cleaned articles for evaluation; the fixed seed makes
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (capped at 5000 terms) feeding a logistic regression.
tfidf_step = ('tfidf', TfidfVectorizer(max_features=5000))
lr_step = ('lr', LogisticRegression(max_iter=1000))
clf_pipeline = Pipeline(steps=[tfidf_step, lr_step])

# Fit on the training split only, then score the held-out split.
clf_pipeline.fit(X_train, y_train)
clf_preds = clf_pipeline.predict(X_test)
report = classification_report(y_test, clf_preds)
print(report)

## Deep Learning Model

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Learn the vocabulary from the training split only, then encode both splits
# as sequences of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad or truncate every article to the same number of token ids.
max_len = 100
X_train_seq = pad_sequences(train_sequences, maxlen=max_len)
X_test_seq = pad_sequences(test_sequences, maxlen=max_len)

# Embedding -> LSTM -> single sigmoid unit for binary classification.
# input_dim is vocabulary size + 1 to leave room for index 0, which the Keras
# Tokenizer never assigns to a word. The deprecated `input_length` argument
# is omitted; the sequence length comes from the padded input itself.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64),
    LSTM(64),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_seq, y_train, epochs=2, validation_split=0.2)

# model.predict returns one probability per row with shape (n, 1); threshold
# at 0.5 and flatten to 1-D so the predictions line up with y_test.
deep_preds = (model.predict(X_test_seq) > 0.5).astype('int32').ravel()
print(classification_report(y_test, deep_preds))

## Export Pipelines

In [None]:
import os

import joblib

# Every artefact is written under models/; create the directory first because
# joblib.dump fails when the target directory is missing.
os.makedirs('models', exist_ok=True)

# Refit the classical pipeline on the full dataset before exporting it.
clf_pipeline.fit(df['clean_text'], df['label'])
joblib.dump(clf_pipeline, 'models/log_reg_pipeline.joblib')

# The LSTM consumes integer sequences produced by `tokenizer`, so the fitted
# vocabulary is saved next to the model; without it new text cannot be
# encoded the way the network was trained.
model.save('models/lstm_model.keras')
with open('models/lstm_tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())
print('Pipelines saved!')