In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import words

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("saurabhshahane/fake-news-classification")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/saurabhshahane/fake-news-classification?dataset_version_number=77...


100%|██████████| 92.1M/92.1M [00:03<00:00, 25.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/saurabhshahane/fake-news-classification/versions/77


In [None]:
!kaggle datasets download -d saurabhshahane/fake-news-classification

Dataset URL: https://www.kaggle.com/datasets/saurabhshahane/fake-news-classification
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading fake-news-classification.zip to /content
 77% 71.0M/92.1M [00:00<00:00, 254MB/s]
100% 92.1M/92.1M [00:00<00:00, 251MB/s]


In [None]:
!unzip fake-news-classification.zip

Archive:  fake-news-classification.zip
  inflating: WELFake_Dataset.csv     


In [None]:
df = pd.read_csv ("/content/WELFake_Dataset.csv")


In [None]:
df.shape

(72134, 4)

In [None]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1


In [None]:
train_df = df[["text", "label"]]

In [None]:
train_df.head()

Unnamed: 0,text,label
0,No comment is expected from Barack Obama Membe...,1
1,Did they post their votes for Hillary already?,1
2,"Now, most of the demonstrators gathered last ...",1
3,A dozen politically active pastors came here f...,0
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1



# Preprocessing:


In [None]:
train_df.isna().sum()

Unnamed: 0,0
text,39
label,0


In [None]:
train_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.dropna(inplace=True)


In [None]:
train_df.isna().sum()

Unnamed: 0,0
text,0
label,0


In [None]:
train_df["Text_arr"] = train_df["text"].apply(word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["Text_arr"] = train_df["text"].apply(word_tokenize)


In [None]:
nltk.download('words')
english_words = set(words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
train_df["Text_arr"] = train_df["Text_arr"].apply(lambda x: np.array([word for word in x if word in english_words]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["Text_arr"] = train_df["Text_arr"].apply(lambda x: np.array([word for word in x if word in english_words]))


In [None]:
train_df.head(3)

Unnamed: 0,text,label,Text_arr,text_wostop
0,No comment is expected from Barack Obama Membe...,1,"[No, comment, is, from, of, the, or, and, for,...",N c e n f r f h e r n f r h ...
1,Did they post their votes for Hillary already?,1,"[they, post, their, for, Hillary, already]",h e p h e r f r H l l r l r e
2,"Now, most of the demonstrators gathered last ...",1,"[most, of, the, last, night, were, their, cons...",f h e l n g h w e r e h e r c n ...


In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
train_df["text_wostop"] = train_df["Text_arr"].apply(lambda x: ' '.join([word for word in x if word.casefold() not in stop_words]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["text_wostop"] = train_df["Text_arr"].apply(lambda x: ' '.join([word for word in x if word.casefold() not in stop_words]))


In [None]:
train_df.head(3)

Unnamed: 0,text,label,Text_arr,text_wostop
0,No comment is expected from Barack Obama Membe...,1,"[No, comment, is, from, of, the, or, and, for,...",comment hanging white people radio show Tuesda...
1,Did they post their votes for Hillary already?,1,"[they, post, their, for, Hillary, already]",post Hillary already
2,"Now, most of the demonstrators gathered last ...",1,"[most, of, the, last, night, were, their, cons...",last night constitutional right peaceful prote...


In [None]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [None]:
train_df["text_wostop"] = train_df["text_wostop"].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["text_wostop"] = train_df["text_wostop"].apply(clean_text)


In [None]:
train_df.head(3)

Unnamed: 0,text,label,Text_arr,text_wostop
0,No comment is expected from Barack Obama Membe...,1,"[No, comment, is, from, of, the, or, and, for,...",comment hanging white people radio show tuesda...
1,Did they post their votes for Hillary already?,1,"[they, post, their, for, Hillary, already]",post hillary already
2,"Now, most of the demonstrators gathered last ...",1,"[most, of, the, last, night, were, their, cons...",last night constitutional right peaceful prote...


In [None]:
stemmer = nltk.SnowballStemmer("english")
def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [None]:
train_df["text_wostop"] = train_df["text_wostop"].apply(stemm_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["text_wostop"] = train_df["text_wostop"].apply(stemm_text)


In [None]:
x = train_df['text_wostop']
y = train_df['label']

print(len(x))

72095


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))

54071 54071
18024 18024


# Building Embeddings:


In [None]:
!pip install tensorflow transformers




In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, TFBertForSequenceClassification
from tensorflow.keras.layers import LSTM, Dense, Dropout,InputLayer, Lambda

In [None]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
x_train_tokenized = tokenizer(x_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")
x_test_tokenized = tokenizer(x_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors="tf")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

#LSTM model

In [None]:
model = tf.keras.Sequential([
    InputLayer(input_shape=(x_train_tokenized['input_ids'].shape[1],)),
    Lambda(lambda x: bert_model(x)[0]),
    LSTM(128, activation="tanh", return_sequences=  True),
    Dropout(0.3),
    LSTM(128, activation="tanh"),
    Dense(1, activation='sigmoid')
])



In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
model.fit(
    {'input_ids': x_train_tokenized['input_ids'], 'attention_mask': x_train_tokenized['attention_mask']},
    y_train,
    epochs=3,
    batch_size=8,
    validation_data=(
        {'input_ids': x_test_tokenized['input_ids'], 'attention_mask': x_test_tokenized['attention_mask']},
        y_test
    )
)


Epoch 1/3
[1m6759/6759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m851s[0m 123ms/step - accuracy: 0.7916 - loss: 0.4469 - val_accuracy: 0.8658 - val_loss: 0.3195
Epoch 2/3
[1m6759/6759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m852s[0m 122ms/step - accuracy: 0.8757 - loss: 0.2973 - val_accuracy: 0.8804 - val_loss: 0.2881
Epoch 3/3
[1m6759/6759[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m862s[0m 122ms/step - accuracy: 0.8959 - loss: 0.2554 - val_accuracy: 0.8879 - val_loss: 0.2853


<keras.src.callbacks.history.History at 0x788999486620>

In [None]:
model.summary()

In [None]:
loss, accuracy = model.evaluate(
    {'input_ids': x_test_tokenized['input_ids'], 'attention_mask': x_test_tokenized['attention_mask']},
    y_test
)

[1m564/564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 293ms/step - accuracy: 0.8800 - loss: 0.2859


In [None]:
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Test Loss: 0.285564124584198
Test Accuracy: 0.8811584711074829
