In [39]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the datasets
true_df = pd.read_csv('True.csv')
fake_df = pd.read_csv('Fake.csv')

In [3]:
# Add a column to label the data
true_df['label'] = 0
fake_df['label'] = 1

In [4]:
# Combine the datasets
combined_df = pd.concat([true_df, fake_df], ignore_index=True)

# Shuffle the combined dataset (optional)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

# Save the combined dataset to a new CSV file
combined_df.to_csv('Combined.csv', index=False)

In [5]:
combined_df.shape

(44898, 5)

In [6]:
combined_df.head

<bound method NDFrame.head of                                                    title  \
0      Windows 10 is Stealing Your Bandwidth (You Mig...   
1      Kenya president: elections will go ahead despi...   
2       Cher Does NOT Hold Back On Twitter As She Cal...   
3      U.S.-backed campaign against IS in eastern Syr...   
4      Puerto Rico oversight board orders furloughs, ...   
...                                                  ...   
44893  US Senate Majority Leader: 'Be careful' implem...   
44894  Incoming New Zealand government to review cent...   
44895   Watch Megyn Kelly Prove She Has No Idea How T...   
44896  New Jersey's Murphy echoes Sanders in Democrat...   
44897  Vocal, powerful critic of Wall Street rules le...   

                                                    text       subject  \
0      21st Century Wire says We ve heard a lot of no...       US_News   
1      NAIROBI (Reuters) - Kenyan President Uhuru Ken...     worldnews   
2      When it comes to the

In [7]:
combined_df.describe

<bound method NDFrame.describe of                                                    title  \
0      Windows 10 is Stealing Your Bandwidth (You Mig...   
1      Kenya president: elections will go ahead despi...   
2       Cher Does NOT Hold Back On Twitter As She Cal...   
3      U.S.-backed campaign against IS in eastern Syr...   
4      Puerto Rico oversight board orders furloughs, ...   
...                                                  ...   
44893  US Senate Majority Leader: 'Be careful' implem...   
44894  Incoming New Zealand government to review cent...   
44895   Watch Megyn Kelly Prove She Has No Idea How T...   
44896  New Jersey's Murphy echoes Sanders in Democrat...   
44897  Vocal, powerful critic of Wall Street rules le...   

                                                    text       subject  \
0      21st Century Wire says We ve heard a lot of no...       US_News   
1      NAIROBI (Reuters) - Kenyan President Uhuru Ken...     worldnews   
2      When it comes to

Data Preprocessing
1) Lower Case
2) Removing links
3) Removing next lines (\n)
4) Words containing numbers
5) Extra spaces
6) Special characters
7) Removal of stop words
8) Stemming
9) Lemmatization

In [8]:
#converting column "text" to lower case

combined_df['text'] = combined_df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,Windows 10 is Stealing Your Bandwidth (You Mig...,21st century wire says we ve heard a lot of no...,US_News,"April 7, 2016",1
1,Kenya president: elections will go ahead despi...,nairobi (reuters) - kenyan president uhuru ken...,worldnews,"October 10, 2017",0
2,Cher Does NOT Hold Back On Twitter As She Cal...,when it comes to the safety of our drinking wa...,News,"January 7, 2016",1
3,U.S.-backed campaign against IS in eastern Syr...,beirut (reuters) - a u.s.-backed campaign agai...,worldnews,"October 18, 2017",0
4,"Puerto Rico oversight board orders furloughs, ...",(reuters) - puerto rico’s federally appointed ...,politicsNews,"August 4, 2017",0


In [9]:
#Converting "title" column to lower case

combined_df['title'] = combined_df['title'].apply(lambda x: " ".join(x.lower() for x in x.split()))
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,windows 10 is stealing your bandwidth (you mig...,21st century wire says we ve heard a lot of no...,US_News,"April 7, 2016",1
1,kenya president: elections will go ahead despi...,nairobi (reuters) - kenyan president uhuru ken...,worldnews,"October 10, 2017",0
2,cher does not hold back on twitter as she call...,when it comes to the safety of our drinking wa...,News,"January 7, 2016",1
3,u.s.-backed campaign against is in eastern syr...,beirut (reuters) - a u.s.-backed campaign agai...,worldnews,"October 18, 2017",0
4,"puerto rico oversight board orders furloughs, ...",(reuters) - puerto rico’s federally appointed ...,politicsNews,"August 4, 2017",0


In [10]:
# Remove links from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,windows 10 is stealing your bandwidth (you mig...,21st century wire says we ve heard a lot of no...,US_News,"April 7, 2016",1
1,kenya president: elections will go ahead despi...,nairobi (reuters) - kenyan president uhuru ken...,worldnews,"October 10, 2017",0
2,cher does not hold back on twitter as she call...,when it comes to the safety of our drinking wa...,News,"January 7, 2016",1
3,u.s.-backed campaign against is in eastern syr...,beirut (reuters) - a u.s.-backed campaign agai...,worldnews,"October 18, 2017",0
4,"puerto rico oversight board orders furloughs, ...",(reuters) - puerto rico’s federally appointed ...,politicsNews,"August 4, 2017",0


In [11]:
# Remove newlines from text and title

combined_df['text'] = combined_df['text'].str.replace('\n', ' ')

combined_df['title'] = combined_df['title'].str.replace('\n', ' ')

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,windows 10 is stealing your bandwidth (you mig...,21st century wire says we ve heard a lot of no...,US_News,"April 7, 2016",1
1,kenya president: elections will go ahead despi...,nairobi (reuters) - kenyan president uhuru ken...,worldnews,"October 10, 2017",0
2,cher does not hold back on twitter as she call...,when it comes to the safety of our drinking wa...,News,"January 7, 2016",1
3,u.s.-backed campaign against is in eastern syr...,beirut (reuters) - a u.s.-backed campaign agai...,worldnews,"October 18, 2017",0
4,"puerto rico oversight board orders furloughs, ...",(reuters) - puerto rico’s federally appointed ...,politicsNews,"August 4, 2017",0


In [12]:
# Remove words containing numbers from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,windows is stealing your bandwidth (you might...,century wire says we ve heard a lot of not so...,US_News,"April 7, 2016",1
1,kenya president: elections will go ahead despi...,nairobi (reuters) - kenyan president uhuru ken...,worldnews,"October 10, 2017",0
2,cher does not hold back on twitter as she call...,when it comes to the safety of our drinking wa...,News,"January 7, 2016",1
3,u.s.-backed campaign against is in eastern syr...,beirut (reuters) - a u.s.-backed campaign agai...,worldnews,"October 18, 2017",0
4,"puerto rico oversight board orders furloughs, ...",(reuters) - puerto rico’s federally appointed ...,politicsNews,"August 4, 2017",0


In [13]:
# Remove extra spaces from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,windows is stealing your bandwidth (you might ...,century wire says we ve heard a lot of not so ...,US_News,"April 7, 2016",1
1,kenya president: elections will go ahead despi...,nairobi (reuters) - kenyan president uhuru ken...,worldnews,"October 10, 2017",0
2,cher does not hold back on twitter as she call...,when it comes to the safety of our drinking wa...,News,"January 7, 2016",1
3,u.s.-backed campaign against is in eastern syr...,beirut (reuters) - a u.s.-backed campaign agai...,worldnews,"October 18, 2017",0
4,"puerto rico oversight board orders furloughs, ...",(reuters) - puerto rico’s federally appointed ...,politicsNews,"August 4, 2017",0


In [14]:
# Remove special characters from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,windows is stealing your bandwidth you might w...,century wire says we ve heard a lot of not so ...,US_News,"April 7, 2016",1
1,kenya president elections will go ahead despit...,nairobi reuters kenyan president uhuru kenyat...,worldnews,"October 10, 2017",0
2,cher does not hold back on twitter as she call...,when it comes to the safety of our drinking wa...,News,"January 7, 2016",1
3,usbacked campaign against is in eastern syria ...,beirut reuters a usbacked campaign against is...,worldnews,"October 18, 2017",0
4,puerto rico oversight board orders furloughs g...,reuters puerto ricos federally appointed fina...,politicsNews,"August 4, 2017",0


In [15]:
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [16]:
# Remove stop words from text
filtered_texts = []
for text in combined_df['text']:
    filtered_text = ' '.join([word for word in text.split() if word not in stop_words])
    filtered_texts.append(filtered_text)
combined_df['text'] = filtered_texts

In [17]:
filtered_titles = []
for title in combined_df['title']:
    filtered_title = ' '.join([word for word in title.split() if word not in stop_words])
    filtered_titles.append(filtered_title)
combined_df['title'] = filtered_titles
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,windows stealing bandwidth might want delete,century wire says heard lot nice things micros...,US_News,"April 7, 2016",1
1,kenya president elections go ahead despite opp...,nairobi reuters kenyan president uhuru kenyatt...,worldnews,"October 10, 2017",0
2,cher hold back twitter calls death michigan go...,comes safety drinking water comes taps left tr...,News,"January 7, 2016",1
3,usbacked campaign eastern syria speed sdf militia,beirut reuters usbacked campaign islamic state...,worldnews,"October 18, 2017",0
4,puerto rico oversight board orders furloughs g...,reuters puerto ricos federally appointed finan...,politicsNews,"August 4, 2017",0


In [18]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [19]:
# Lemmatize text
lemmatizer = WordNetLemmatizer()

lemmatized_texts = []
for text in combined_df['text']:
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    lemmatized_texts.append(lemmatized_text)
combined_df['text'] = lemmatized_texts

In [20]:
# Lemmatize title
lemmatized_titles = []
for title in combined_df['title']:
    lemmatized_title = ' '.join([lemmatizer.lemmatize(word) for word in title.split()])
    lemmatized_titles.append(lemmatized_title)
combined_df['title'] = lemmatized_titles

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,window stealing bandwidth might want delete,century wire say heard lot nice thing microsof...,US_News,"April 7, 2016",1
1,kenya president election go ahead despite oppo...,nairobi reuters kenyan president uhuru kenyatt...,worldnews,"October 10, 2017",0
2,cher hold back twitter call death michigan gov...,come safety drinking water come tap left trust...,News,"January 7, 2016",1
3,usbacked campaign eastern syria speed sdf militia,beirut reuters usbacked campaign islamic state...,worldnews,"October 18, 2017",0
4,puerto rico oversight board order furlough gov...,reuters puerto rico federally appointed financ...,politicsNews,"August 4, 2017",0


Model Building
1) Machine Learning model
   - Random Forest
2) Deep Learning model
   - LSTM
3) BERT Based model
   - Transformer

In [21]:
# Combine the title, text, and subject columns into a single column
combined_df['combined_text'] = combined_df['title'] + ' ' + combined_df['text'] + ' ' + combined_df['subject']

In [22]:
# Text preprocessing and TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X = vectorizer.fit_transform(combined_df['combined_text'])
y = combined_df['label']

In [23]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [25]:
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 0.9948775055679288


In [26]:
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4202
           1       1.00      0.99      1.00      4778

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



LSTM Model

In [29]:
# Tokenize the text
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(combined_df['combined_text'])
X = tokenizer.texts_to_sequences(combined_df['combined_text'])


In [30]:
# Pad the sequences
max_len = 100
X = pad_sequences(X, maxlen=max_len)

In [31]:
y = combined_df['label'].values

In [32]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
#LSTM Model 
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [35]:
# Train the model
model.fit(X_train, y_train,
          batch_size=32,
          epochs=5,
          validation_data=(X_test, y_test),
          verbose=2)

Epoch 1/5
1123/1123 - 144s - loss: 0.0240 - accuracy: 0.9937 - val_loss: 8.6710e-05 - val_accuracy: 1.0000 - 144s/epoch - 128ms/step
Epoch 2/5
1123/1123 - 138s - loss: 3.1778e-05 - accuracy: 1.0000 - val_loss: 4.0546e-05 - val_accuracy: 1.0000 - 138s/epoch - 123ms/step
Epoch 3/5
1123/1123 - 139s - loss: 1.0492e-05 - accuracy: 1.0000 - val_loss: 1.8659e-05 - val_accuracy: 1.0000 - 139s/epoch - 124ms/step
Epoch 4/5
1123/1123 - 131s - loss: 5.4758e-06 - accuracy: 1.0000 - val_loss: 1.8357e-05 - val_accuracy: 1.0000 - 131s/epoch - 117ms/step
Epoch 5/5
1123/1123 - 132s - loss: 2.4780e-06 - accuracy: 1.0000 - val_loss: 6.9744e-06 - val_accuracy: 1.0000 - 132s/epoch - 117ms/step


<keras.src.callbacks.History at 0x272a2299850>

In [36]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")



In [37]:
print("LSTM Model Accuracy:", accuracy_score(y_test, y_pred))

LSTM Model Accuracy: 1.0


In [38]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4202
           1       1.00      1.00      1.00      4778

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

