 # Fake news detector

In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import classification_report, accuracy_score

import nltk
from nltk.corpus import stopwords

In [2]:
# Read both source CSVs: the first holds the fake articles, the second the real ones.
fake, real = (pd.read_csv(csv_path) for csv_path in ('./Fake.csv', './True.csv'))

In [3]:
# Binary target column: 1 = fake article, 0 = real article.
# Stored as integers (not the strings "1"/"0") so the column has a numeric
# dtype and needs no string parsing before it is handed to a model.
fake["Type"] = 1
real["Type"] = 0

In [4]:
# Preview the first rows of the fake-news frame, including its "Type" label column.
fake.head()

Unnamed: 0,title,text,subject,date,Type
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [5]:
# Stack both classes into a single frame, then shuffle the rows.
# - ignore_index gives every row a unique label (both CSVs are read with a
#   default index, so a plain concat would repeat labels).
# - A fixed random_state makes the shuffle, and therefore the seeded
#   train/test split performed later, reproducible after a kernel restart.
data = pd.concat([real, fake], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
import re

def clean_text(text, stopword_set=None):
    """Normalize one raw text field for bag-of-words vectorization.

    Steps: lowercase, collapse every run of whitespace to a single space,
    and drop stop words.

    Args:
        text: The raw string. ``None`` and NaN (missing CSV fields) are
            treated as an empty document; other non-string values are
            converted with ``str()``.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the notebook-level ``stop_words`` set, which must
            be defined before the first call that relies on the default.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    # Missing values would otherwise crash on .lower(); NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stopword_set is None:
        stopword_set = stop_words

    # 1. Lowercase.
    # 2. Split on whitespace: str.split() with no argument already discards
    #    leading/trailing whitespace and treats any run of whitespace as one
    #    separator, so no separate regex pass is needed.
    tokens = str(text).lower().split()

    # 3. Remove stop words and re-join with single spaces.
    return ' '.join(token for token in tokens if token not in stopword_set)

In [7]:
# Make sure the NLTK stop-word corpus is available locally, then keep the
# English entries in a set so membership checks in clean_text are fast.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Run clean_text over every textual column that later feeds the model.
data = data.assign(**{
    column: data[column].apply(clean_text)
    for column in ("title", "text", "subject")
})

In [9]:
# Model input: one string per article, built from the cleaned title, body
# text and subject joined by single spaces.
# NOTE(review): "subject" may differ systematically between the two source
# files; if it does, it leaks the label into the features -- confirm before
# trusting the reported scores.
X = data["title"] + " " + data["text"] + " " + data["subject"]
# Target: the "Type" label column.
Y = data['Type']

In [10]:
# Hold out 20% of the articles for testing; the seed keeps the split repeatable
# for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Learn the TF-IDF vocabulary and weights from the training texts only, then
# reuse the fitted vectorizer to encode the test texts (the test set is
# transformed, never fitted on).
# No max_features is set here, so the vocabulary size is whatever the
# training texts produce.
text_model  = TfidfVectorizer()
X_train_tfidf = text_model.fit_transform(X_train)
X_test_tfidf = text_model.transform(X_test)

### Using sklearn's Logistic Regression without Pipelines

In [None]:
# Baseline classifier trained on the precomputed TF-IDF matrix.
# max_iter is raised to match the pipeline variant of this model later in the
# notebook; the default of 100 iterations may stop the solver before it
# converges on a large sparse feature matrix.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

In [None]:
# Score the baseline model on the held-out TF-IDF features: overall accuracy
# followed by the per-class precision/recall/F1 report.
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

### With Pipelines 

In [None]:
# Vectorizer and classifier bundled into one estimator, so raw strings can be
# passed straight to fit/predict. The TF-IDF step keeps unigrams and bigrams,
# capped at 10,000 features.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
    ("clf", LogisticRegression(max_iter=1000)),
])

In [None]:
# Train the whole pipeline on the training strings and predict the held-out
# set in one chain (fit returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy and the per-class metrics.
print("\nFinal evaluation on test set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

### TensorFlow implementation

In [12]:
# Feed-forward binary classifier built layer by layer:
# 128 -> 64 -> 1 units, with L2-regularized hidden layers, dropout after each
# hidden layer, and a sigmoid output.
model_tf = tf.keras.Sequential()
model_tf.add(layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.0003)))
model_tf.add(layers.Dropout(0.3))
model_tf.add(layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.0006)))
model_tf.add(layers.Dropout(0.1))
model_tf.add(layers.Dense(1, activation='sigmoid'))

In [13]:
# Training setup: Adam with a reduced learning rate, binary cross-entropy on
# the sigmoid output, and accuracy tracked as the reported metric.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model_tf.compile(
    optimizer=adam_optimizer,
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [14]:
# Convert TF-IDF to dense float32 arrays for the Keras model.
# The cast happens on the sparse matrix *before* densifying, so no
# intermediate float64 dense copy is allocated (the values are identical to
# densifying first and casting afterwards).
X_train_dense = X_train_tfidf.astype(np.float32).toarray()
X_test_dense = X_test_tfidf.astype(np.float32).toarray()

# Labels as float32 NumPy arrays. np.asarray accepts both the original pandas
# Series and an already-converted array, so re-running this cell does not fail
# the way a second `.values` access on an ndarray would.
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

In [15]:
# Sanity check: show the training labels after their conversion to a float array.
print(y_train)

[1. 0. 0. ... 1. 1. 1.]


In [None]:
# Train on the dense float32 matrix prepared in the conversion cell.
# It is deliberately not rebuilt here: another `.toarray()` call would replace
# it with a float64 copy, and the sparse matrix should not be passed to fit
# when the dense version was prepared for that purpose.
# Up to 200 epochs; the last 20% of the training rows are held out for
# validation, and training stops once val_loss has not improved for 10 epochs.
# restore_best_weights rolls the model back to the best epoch instead of
# keeping the weights from 10 epochs after it.
predictor_tf = model_tf.fit(
    X_train_dense,
    y_train,
    epochs=200,
    validation_split=0.2,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
        )
    ]
)

Epoch 1/100
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 134ms/step - accuracy: 0.9312 - loss: 0.3904 - val_accuracy: 0.9954 - val_loss: 0.1017
Epoch 2/100
[1m121/449[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m40s[0m 124ms/step - accuracy: 0.9985 - loss: 0.0939

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

print("Model training finished.")

# Validation accuracy recorded for the last completed epoch.
print("Final validation accuracy:", predictor_tf.history['val_accuracy'][-1])

# Visual check of the model's performance: training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(predictor_tf.history['loss'], label='Train Loss')
ax.plot(predictor_tf.history['val_loss'], label='Val Loss')
ax.legend()
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training Curve')
plt.show()

In [None]:
# Predict with the trained Keras model on the dense test features.
# (The original call referenced `linear_model` and `test_labels`, neither of
# which is defined anywhere in this notebook.)
predictions_prob = model_tf.predict(X_test_dense)

# Threshold the sigmoid probabilities at 0.5 to get hard 0/1 class labels.
predictions = (predictions_prob > 0.5).astype(int).flatten()

# Evaluate in the same way as the scikit-learn models above; labels are cast
# to int so they share a type with the thresholded predictions.
y_test_int = np.asarray(y_test).astype(float).astype(int)
print("Accuracy:", accuracy_score(y_test_int, predictions))
print(classification_report(y_test_int, predictions))