# Detection of Fake News via Classification MODEL

## 1.Load and check the Dataset

In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [12]:

# Training data
train_data = pd.read_csv(r"Training-dataset.csv",sep=";")
train_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


#### B) EDA

In [13]:
train_data.shape

(24353, 4)

In [14]:
df1 = train_data.copy()

In [15]:
# Checking the NULL Values in Training data
df1.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

## 2. Splitting in X,Y 

In [16]:
# Data splitting
x = df1.drop(['Unnamed: 0','label'],axis=1)
y = df1['label']

## 3. Data Prepration

#### 3.1 Import of Tensorflow libraries

In [17]:
from tensorflow.keras.layers import Embedding,Dense,Dropout,LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

In [18]:
# Vocbulary size
voc_size = 5000

## 3.2 OneHot Representation

In [19]:
messages = x.copy()
messages.reset_index(inplace=True)

In [21]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.7.34-cp312-cp312-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 4.2 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 4.5 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 3.6 MB/s eta 0:00:00
Downloading regex-2025.7.34-cp312-cp312-win_amd64.whl (275 kB)
Downloading click-8.2.1-py3-none-any.whl (102 kB)
Installing collected packages: regex, click, nltk
Successfully installed click-8.2.1 nltk-3.9.1 regex-2025.7.34


In [23]:
import nltk
import re
from nltk.corpus import stopwords

In [24]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## **3.3 Preprocessing of Data**

In [25]:
'''It used for stemming text — a key step in Natural Language Processing (NLP),
that helps simplify words by reducing them to their root/base form.'''
from nltk.stem.porter import PorterStemmer


In [27]:
ps = PorterStemmer()
corpus= []
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z]',' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [28]:
# Check corpus
for i in range(0,10):
    print(i,corpus[i])

0 palestinian switch christma light bethlehem anti trump protest
1 china say trump call taiwan presid chang island statu
2 fail trump organ credit score make laugh
3 zimbabw militari chief china trip normal visit beij say
4 uncourag presid ever receiv courag award proce whine current presid
5 suspect boko haram suicid bomber kill least nigeria offici
6 watch john oliv present gop debat clowntown f ck world sh tshow
7 senat democrat ask trump attorney gener pick recus russia probe
8 trump humili republican latest hissi fit side democrat debt ceil
9 maci get boot loyal custom fire trump


In [31]:
# Onehot Representation of these corpus words
onehot_rep = [one_hot(words,voc_size) for words in corpus]
onehot_rep[0:5]

[[3091, 1565, 1538, 2136, 1449, 2971, 3160, 2045],
 [521, 3275, 3160, 1902, 4478, 1076, 2698, 3969, 2748],
 [2718, 3160, 4244, 7, 267, 4011, 2137],
 [3804, 1794, 3902, 521, 784, 1994, 2410, 4187, 3275],
 [4047, 1076, 3516, 2529, 4214, 1059, 4405, 3257, 204, 1076]]

## 3.3 Embedding Representation

In [32]:
col_length = 100
embedded_rep = pad_sequences(onehot_rep,padding='pre',maxlen=col_length)
print(embedded_rep)

[[   0    0    0 ... 2971 3160 2045]
 [   0    0    0 ... 2698 3969 2748]
 [   0    0    0 ...  267 4011 2137]
 ...
 [   0    0    0 ... 1973 3160  261]
 [   0    0    0 ... 4868 2073 4728]
 [   0    0    0 ... 2851 2685 2147]]


In [33]:
len(embedded_rep)

24353

## 4. MODEL Creation

In [34]:
from tensorflow.keras import regularizers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

In [35]:
embedding_features = 40
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=embedding_features))  
model.add(BatchNormalization())
model.add(LSTM(32,dropout=0.5,recurrent_dropout=0.2))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid',kernel_regularizer=regularizers.l2(0.01)))

model.build(input_shape=(None, col_length))  # ✅ build manually for summary
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [36]:
x_final = np.array(embedded_rep)
y_final =np.array(y)

In [37]:
x_final.shape,y_final.shape

((24353, 100), (24353,))

In [38]:
x_train,x_test,y_train,y_test = train_test_split(x_final,y_final,test_size=0.3,random_state=42)

In [39]:
# Final Training of MODEL
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(x_train,y_train,epochs=10,validation_data=(x_test,y_test),callbacks=[early_stop])

Epoch 1/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 89ms/step - accuracy: 0.6329 - loss: 0.6904 - val_accuracy: 0.8883 - val_loss: 0.2708
Epoch 2/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 99ms/step - accuracy: 0.8987 - loss: 0.2723 - val_accuracy: 0.9054 - val_loss: 0.2421
Epoch 3/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 87ms/step - accuracy: 0.9262 - loss: 0.2081 - val_accuracy: 0.9084 - val_loss: 0.2484
Epoch 4/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 92ms/step - accuracy: 0.9407 - loss: 0.1739 - val_accuracy: 0.9102 - val_loss: 0.2637
Epoch 5/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 86ms/step - accuracy: 0.9456 - loss: 0.1582 - val_accuracy: 0.9049 - val_loss: 0.2884


<keras.src.callbacks.history.History at 0x2044edfec00>

In [40]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}, Loss: {loss:.4f}")


[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9093 - loss: 0.2404
Test Accuracy: 0.9054, Loss: 0.2421


In [41]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype(int)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step
[[2924  358]
 [ 333 3691]]
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3282
           1       0.91      0.92      0.91      4024

    accuracy                           0.91      7306
   macro avg       0.90      0.90      0.90      7306
weighted avg       0.91      0.91      0.91      7306

