 # Fake news detector

In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import classification_report, accuracy_score

import nltk
from nltk.corpus import stopwords

In [2]:
fake = pd.read_csv('./Fake.csv')
real = pd.read_csv('./True.csv')

In [3]:
fake["Type"] = "1"
real["Type"] = "0"

In [4]:
fake.head()

Unnamed: 0,title,text,subject,date,Type
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [5]:
data = pd.concat([real, fake])
data = data.sample(frac=1)

In [6]:
import re

def clean_text(text):
    text = text.lower()
    
    # 3. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 4. Remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    return ' '.join(words)

In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data["title"] = data["title"].apply(clean_text)
data["text"] = data["text"].apply(clean_text)
data["subject"] = data["subject"].apply(clean_text)

In [9]:
X = data["title"] + " " + data["text"] + " " + data["subject"]
Y = data['Type']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [11]:
text_model  = TfidfVectorizer()
X_train_tfidf = text_model.fit_transform(X_train)
X_test_tfidf = text_model.transform(X_test)

### Using sklearn's Logistic Regression without Pipelines

In [12]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [13]:
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9927616926503341
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4327
           1       0.99      0.99      0.99      4653

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



### With Pipelines 

In [14]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000))
])

In [15]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("\nFinal evaluation on test set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Final evaluation on test set:
Accuracy: 0.9943207126948775
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4327
           1       1.00      0.99      0.99      4653

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



### Tensorflow implementation

In [16]:
model_tf = tf.keras.Sequential([
    layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.0003)),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.0006)),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid')
])

In [17]:
model_tf.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

In [18]:
# Convert TF-IDF to dense and cast to float32
X_train_dense = X_train_tfidf.toarray().astype(np.float32)
X_test_dense = X_test_tfidf.toarray().astype(np.float32)

# Ensure labels are float32 NumPy arrays
y_train = y_train.astype(np.float32).values
y_test = y_test.astype(np.float32).values

In [19]:
print(y_train)

[1. 1. 0. ... 0. 0. 0.]


In [20]:
X_train_dense = X_train_tfidf.toarray()

predictor_tf = model_tf.fit(
    X_train_tfidf,
    y_train,
    epochs=100,                   
    validation_split=0.2,
    verbose=0, 
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)]
)

KeyboardInterrupt: 

In [None]:
print("Model training finished.")

# Whilst the accuracy againts the test data is ~90%, the score of only 78% does suggest this model is prone to overfitting 
print("Final validation accuracy:", predictor_tf.history['val_accuracy'][-1])

import matplotlib.pyplot as plt

# Visual clarification on models preformance
plt.plot(predictor_tf.history['loss'], label='Train Loss')
plt.plot(predictor_tf.history['val_loss'], label='Val Loss')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Curve')
plt.show()

In [None]:
predictions_prob =  linear_model.predict(test_labels)
predictions = (predictions_prob > 0.5).astype(int).flatten()

In [2]:
tf.debugging.set_log_device_placement(True)

# Try allocating a tensor
with tf.device('/GPU:0'):
    a = tf.constant([[1.0, 2.0]])
    b = tf.constant([[3.0], [4.0]])
    print(tf.matmul(a, b))

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _MklMatMul in device /job:localhost/replica:0/task:0/device:CPU:0
tf.Tensor([[11.]], shape=(1, 1), dtype=float32)


In [1]:
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available: 0


In [7]:
import tensorflow as tf

# List physical devices
gpus = tf.config.list_physical_devices('GPU')
print("GPUs detected:", gpus)

# Check if built with CUDA support
print("Is TensorFlow built with CUDA:", tf.test.is_built_with_cuda())

# Check if a GPU device is available
print("Is GPU available:", tf.test.is_gpu_available())


GPUs detected: []
Is TensorFlow built with CUDA: False
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is GPU available: False
