In [5]:
from datasets import load_dataset

ds = load_dataset("Pulk17/Fake-News-Detection-dataset")


In [6]:
print(ds)
print(ds['train'][0])


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label'],
        num_rows: 30000
    })
})
{'Unnamed: 0': 2619, 'title': "Ex-CIA head says Trump remarks on Russia interference 'disgraceful'", 'text': 'Former CIA director John Brennan on Friday criticized as “disgraceful” President Donald Trump’s efforts to play down U.S. intelligence agencies’ assessment that Russia meddled in the 2016 U.S. election. Trump’s administration has been dogged by investigations into allegations of Russian interference in last year’s U.S. presidential election and possible ties with his campaign team. Speaking one day before his first meeting with Russian President Vladimir Putin in Hamburg earlier this month, Trump said he suspected Russian interference in the election but that no one knows for sure. “These types of comments are just disgraceful ... and the person who said them should be ashamed of himself,” said Brennan, CIA chief under former Presid

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Convert HuggingFace dataset to pandas
df = pd.DataFrame(ds['train'])

# Split manually
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(train_df.shape, test_df.shape)


(24000, 6) (6000, 6)


In [9]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['clean_text'])

X_train = tokenizer.texts_to_sequences(train_df['clean_text'])
X_test = tokenizer.texts_to_sequences(test_df['clean_text'])

X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')

y_train = train_df['label'].values
y_test = test_df['label'].values


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [12]:
history = model.fit(X_train, y_train,
                    epochs=5,
                    batch_size=64,
                    validation_data=(X_test, y_test))


Epoch 1/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 549ms/step - accuracy: 0.7094 - loss: 0.5508 - val_accuracy: 0.8987 - val_loss: 0.3033
Epoch 2/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 547ms/step - accuracy: 0.9131 - loss: 0.2827 - val_accuracy: 0.9493 - val_loss: 0.1848
Epoch 3/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 553ms/step - accuracy: 0.9333 - loss: 0.2259 - val_accuracy: 0.9467 - val_loss: 0.1949
Epoch 4/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 542ms/step - accuracy: 0.9495 - loss: 0.1918 - val_accuracy: 0.9515 - val_loss: 0.1838
Epoch 5/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 548ms/step - accuracy: 0.9368 - loss: 0.2135 - val_accuracy: 0.9335 - val_loss: 0.2039


In [13]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = (model.predict(X_test) > 0.5).astype("int32")

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 131ms/step
Accuracy: 0.9335
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      3137
           1       0.90      0.97      0.93      2863

    accuracy                           0.93      6000
   macro avg       0.93      0.94      0.93      6000
weighted avg       0.94      0.93      0.93      6000



In [14]:
def predict_fake_news(text):
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    pred = model.predict(pad)[0][0]
    return "Fake News" if pred > 0.5 else "Real News"

print(predict_fake_news("Breaking: NASA confirms water on Mars!"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Fake News
