In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [3]:
import zipfile

z = zipfile.ZipFile('archive (12).zip')

z.extractall()

In [4]:
train_data = pd.read_csv(r"/content/train (2).csv",sep=";")
train_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [5]:
#EDA
train_data.shape

(24353, 4)

In [6]:
df1 = train_data.copy()

In [7]:
# Checking the NULL Values in Training data
df1.isna().sum()


Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [8]:
# Data splitting
x = df1.drop(['Unnamed: 0','label'],axis=1)
y = df1['label']

In [9]:

from tensorflow.keras.layers import Embedding,Dense,Dropout,LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

In [10]:
# Vocbulary size
voc_size = 5000

In [11]:
messages = x.copy()
messages.reset_index(inplace=True)

In [12]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
#Preprocessing of Data
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus= []
for i in range(0,len(messages)):
    review = re.sub('[^a-zA-Z]',' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [14]:
# Check corpus
for i in range(0,10):
    print(i,corpus[i])

0 palestinian switch christma light bethlehem anti trump protest
1 china say trump call taiwan presid chang island statu
2 fail trump organ credit score make laugh
3 zimbabw militari chief china trip normal visit beij say
4 uncourag presid ever receiv courag award proce whine current presid
5 suspect boko haram suicid bomber kill least nigeria offici
6 watch john oliv present gop debat clowntown f ck world sh tshow
7 senat democrat ask trump attorney gener pick recus russia probe
8 trump humili republican latest hissi fit side democrat debt ceil
9 maci get boot loyal custom fire trump


In [15]:
# Onehot Representation of these corpus words
onehot_rep = [one_hot(words,voc_size) for words in corpus]
onehot_rep[0:5]

[[2630, 4792, 4136, 4000, 4898, 2964, 2752, 2288],
 [4315, 2115, 2752, 2261, 4852, 113, 4690, 4083, 2212],
 [3789, 2752, 4420, 437, 890, 2381, 4791],
 [744, 3980, 1320, 4315, 1436, 1388, 2432, 518, 2115],
 [762, 113, 3755, 4140, 4896, 1280, 4516, 914, 3128, 113]]

In [16]:
col_length = 100
embedded_rep = pad_sequences(onehot_rep,padding='pre',maxlen=col_length)
print(embedded_rep)

[[   0    0    0 ... 2964 2752 2288]
 [   0    0    0 ... 4690 4083 2212]
 [   0    0    0 ...  890 2381 4791]
 ...
 [   0    0    0 ... 3343 2752  732]
 [   0    0    0 ... 1323 2846 2612]
 [   0    0    0 ... 4502 3106 2406]]


In [17]:
len(embedded_rep)

24353

In [18]:
from tensorflow.keras import regularizers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional

In [19]:
embedding_features = 40
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=embedding_features))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(64,dropout=0.5)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid',kernel_regularizer=regularizers.l2(0.01)))

model.build(input_shape=(None, col_length))  # ✅ build manually for summary
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [20]:
x_final = np.array(embedded_rep)
y_final =np.array(y)

In [21]:
x_final.shape,y_final.shape

((24353, 100), (24353,))

In [22]:
x_train,x_test,y_train,y_test = train_test_split(x_final,y_final,test_size=0.3,random_state=42)

In [23]:
# Final Training of MODEL
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(x_train,y_train,epochs=10,validation_data=(x_test,y_test),callbacks=[early_stop])

Epoch 1/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 17ms/step - accuracy: 0.7313 - loss: 0.5604 - val_accuracy: 0.9076 - val_loss: 0.2327
Epoch 2/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - accuracy: 0.9165 - loss: 0.2249 - val_accuracy: 0.9083 - val_loss: 0.2321
Epoch 3/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - accuracy: 0.9311 - loss: 0.1889 - val_accuracy: 0.9105 - val_loss: 0.2441
Epoch 4/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - accuracy: 0.9488 - loss: 0.1516 - val_accuracy: 0.9164 - val_loss: 0.2569
Epoch 5/10
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.9549 - loss: 0.1277 - val_accuracy: 0.9138 - val_loss: 0.2592


<keras.src.callbacks.history.History at 0x7ca016ac1c90>

In [24]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}, Loss: {loss:.4f}")

[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9073 - loss: 0.2350
Test Accuracy: 0.9083, Loss: 0.2321


In [25]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype(int)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step
[[3072  210]
 [ 460 3564]]
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      3282
           1       0.94      0.89      0.91      4024

    accuracy                           0.91      7306
   macro avg       0.91      0.91      0.91      7306
weighted avg       0.91      0.91      0.91      7306



In [26]:

# 🔹 Preprocessing + Prediction Function
def predict_news(text, model, voc_size=5000, col_length=20):
    # Step 1: Clean the text
    review = re.sub('[^a-zA-Z]', ' ', text)   # keep only letters
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    processed_text = ' '.join(review)

    # Step 2: One-hot encode
    onehot_repr = [one_hot(processed_text, voc_size)]

    # Step 3: Pad sequence
    embedded = pad_sequences(onehot_repr, padding='pre', maxlen=col_length)

    # Step 4: Predict
    prediction = model.predict(embedded)[0][0]

    # Step 5: Return result
    if prediction < 0.5:
        return f"Prediction: FAKE ({prediction:.2f})"
    else:
        return f"Prediction: REAL ({prediction:.2f})"


In [27]:
print(predict_news("Breaking news: Government launches new scheme for farmers", model))
print(predict_news("Shocking! Aliens spotted in New York City!!!", model))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 380ms/step
Prediction: FAKE (0.04)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Prediction: FAKE (0.24)


In [28]:
test_cases = [
    "The Prime Minister announced a new healthcare policy today.",   # looks REAL
    "Aliens were spotted walking in Times Square last night!",       # looks FAKE
    "Apple Inc. reported a record profit for the last quarter.",     # looks REAL
    "Scientists confirm that drinking coffee makes you immortal.",   # looks FAKE
    "The stock market closed higher after tech companies rallied.",  # looks REAL
    "A man claims he can talk to dogs using telepathy.",             # looks FAKE
]

for news in test_cases:
    print(news)
    print(predict_news(news, model))
    print("-" * 50)


The Prime Minister announced a new healthcare policy today.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Prediction: REAL (0.89)
--------------------------------------------------
Aliens were spotted walking in Times Square last night!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Prediction: FAKE (0.03)
--------------------------------------------------
Apple Inc. reported a record profit for the last quarter.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Prediction: FAKE (0.13)
--------------------------------------------------
Scientists confirm that drinking coffee makes you immortal.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Prediction: FAKE (0.45)
--------------------------------------------------
The stock market closed higher after tech companies rallied.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Prediction: REAL (0.93)
---------------------