In [38]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout

In [2]:
# Load the datasets
true_df = pd.read_csv('True.csv')
fake_df = pd.read_csv('Fake.csv')

In [3]:
# Add a column to label the data
true_df['label'] = 0
fake_df['label'] = 1

In [4]:
# Combine the datasets
combined_df = pd.concat([true_df, fake_df], ignore_index=True)

# Shuffle the combined dataset (optional)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

# Save the combined dataset to a new CSV file
combined_df.to_csv('Combined.csv', index=False)

In [5]:
combined_df.shape

(44898, 5)

In [6]:
combined_df.head

<bound method NDFrame.head of                                                    title  \
0      Grandmas, grandpas from travel ban states now ...   
1      LexisNexis withdrew two products from Chinese ...   
2      White House budget chief expects delay in hitt...   
3      Indian Prime Minister Modi to visit Washington...   
4      LIST OF 3 COMPANIES WHO Caved To Leftists…PULL...   
...                                                  ...   
44893   Confused Old Man Forgets He’s At Arlington Ce...   
44894  LORD’S PRAYER AD BANNED…One Month Later Muslim...   
44895   Trump Royally F*cks Veterans By Considering S...   
44896  Britain and EU fail to agree Brexit deal on Mo...   
44897  WATCH JUDGE TELL DISRESPECTFUL GANGBANGER Taxp...   

                                                    text          subject  \
0      WASHINGTON (Reuters) - Grandparents of U.S. ci...     politicsNews   
1      LONDON (Reuters) - LexisNexis, a provider of l...        worldnews   
2      WASHINGTON 

In [7]:
combined_df.describe

<bound method NDFrame.describe of                                                    title  \
0      Grandmas, grandpas from travel ban states now ...   
1      LexisNexis withdrew two products from Chinese ...   
2      White House budget chief expects delay in hitt...   
3      Indian Prime Minister Modi to visit Washington...   
4      LIST OF 3 COMPANIES WHO Caved To Leftists…PULL...   
...                                                  ...   
44893   Confused Old Man Forgets He’s At Arlington Ce...   
44894  LORD’S PRAYER AD BANNED…One Month Later Muslim...   
44895   Trump Royally F*cks Veterans By Considering S...   
44896  Britain and EU fail to agree Brexit deal on Mo...   
44897  WATCH JUDGE TELL DISRESPECTFUL GANGBANGER Taxp...   

                                                    text          subject  \
0      WASHINGTON (Reuters) - Grandparents of U.S. ci...     politicsNews   
1      LONDON (Reuters) - LexisNexis, a provider of l...        worldnews   
2      WASHING

Data Preprocessing
1) Lower Case
2) Removing links
3) Removing next lines (\n)
4) Words containing numbers
5) Extra spaces
6) Special characters
7) Removal of stop words
8) Stemming
9) Lemmatization

In [8]:
#converting column "text" to lower case

combined_df['text'] = combined_df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"Grandmas, grandpas from travel ban states now ...",washington (reuters) - grandparents of u.s. ci...,politicsNews,"July 17, 2017",0
1,LexisNexis withdrew two products from Chinese ...,"london (reuters) - lexisnexis, a provider of l...",worldnews,"August 22, 2017",0
2,White House budget chief expects delay in hitt...,washington (reuters) - white house budget chie...,politicsNews,"February 28, 2017",0
3,Indian Prime Minister Modi to visit Washington...,washington (reuters) - u.s. president donald t...,politicsNews,"March 28, 2017",0
4,LIST OF 3 COMPANIES WHO Caved To Leftists…PULL...,"companies including cars.com, peloton, and lee...",left-news,"May 25, 2017",1


In [9]:
#Converting "title" column to lower case

combined_df['title'] = combined_df['title'].apply(lambda x: " ".join(x.lower() for x in x.split()))
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"grandmas, grandpas from travel ban states now ...",washington (reuters) - grandparents of u.s. ci...,politicsNews,"July 17, 2017",0
1,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...",worldnews,"August 22, 2017",0
2,white house budget chief expects delay in hitt...,washington (reuters) - white house budget chie...,politicsNews,"February 28, 2017",0
3,indian prime minister modi to visit washington...,washington (reuters) - u.s. president donald t...,politicsNews,"March 28, 2017",0
4,list of 3 companies who caved to leftists…pull...,"companies including cars.com, peloton, and lee...",left-news,"May 25, 2017",1


In [10]:
# Remove links from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"grandmas, grandpas from travel ban states now ...",washington (reuters) - grandparents of u.s. ci...,politicsNews,"July 17, 2017",0
1,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...",worldnews,"August 22, 2017",0
2,white house budget chief expects delay in hitt...,washington (reuters) - white house budget chie...,politicsNews,"February 28, 2017",0
3,indian prime minister modi to visit washington...,washington (reuters) - u.s. president donald t...,politicsNews,"March 28, 2017",0
4,list of 3 companies who caved to leftists…pull...,"companies including cars.com, peloton, and lee...",left-news,"May 25, 2017",1


In [11]:
# Remove newlines from text and title

combined_df['text'] = combined_df['text'].str.replace('\n', ' ')

combined_df['title'] = combined_df['title'].str.replace('\n', ' ')

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"grandmas, grandpas from travel ban states now ...",washington (reuters) - grandparents of u.s. ci...,politicsNews,"July 17, 2017",0
1,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...",worldnews,"August 22, 2017",0
2,white house budget chief expects delay in hitt...,washington (reuters) - white house budget chie...,politicsNews,"February 28, 2017",0
3,indian prime minister modi to visit washington...,washington (reuters) - u.s. president donald t...,politicsNews,"March 28, 2017",0
4,list of 3 companies who caved to leftists…pull...,"companies including cars.com, peloton, and lee...",left-news,"May 25, 2017",1


In [12]:
# Remove words containing numbers from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"grandmas, grandpas from travel ban states now ...",washington (reuters) - grandparents of u.s. ci...,politicsNews,"July 17, 2017",0
1,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...",worldnews,"August 22, 2017",0
2,white house budget chief expects delay in hitt...,washington (reuters) - white house budget chie...,politicsNews,"February 28, 2017",0
3,indian prime minister modi to visit washington...,washington (reuters) - u.s. president donald t...,politicsNews,"March 28, 2017",0
4,list of companies who caved to leftists…pulle...,"companies including cars.com, peloton, and lee...",left-news,"May 25, 2017",1


In [13]:
# Remove extra spaces from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"grandmas, grandpas from travel ban states now ...",washington (reuters) - grandparents of u.s. ci...,politicsNews,"July 17, 2017",0
1,lexisnexis withdrew two products from chinese ...,"london (reuters) - lexisnexis, a provider of l...",worldnews,"August 22, 2017",0
2,white house budget chief expects delay in hitt...,washington (reuters) - white house budget chie...,politicsNews,"February 28, 2017",0
3,indian prime minister modi to visit washington...,washington (reuters) - u.s. president donald t...,politicsNews,"March 28, 2017",0
4,list of companies who caved to leftists…pulled...,"companies including cars.com, peloton, and lee...",left-news,"May 25, 2017",1


In [14]:
# Remove special characters from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,grandmas grandpas from travel ban states now w...,washington reuters grandparents of us citizen...,politicsNews,"July 17, 2017",0
1,lexisnexis withdrew two products from chinese ...,london reuters lexisnexis a provider of legal...,worldnews,"August 22, 2017",0
2,white house budget chief expects delay in hitt...,washington reuters white house budget chief m...,politicsNews,"February 28, 2017",0
3,indian prime minister modi to visit washington...,washington reuters us president donald trump ...,politicsNews,"March 28, 2017",0
4,list of companies who caved to leftistspulled ...,companies including carscom peloton and leesa ...,left-news,"May 25, 2017",1


In [15]:
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [16]:
# Remove stop words from text
filtered_texts = []
for text in combined_df['text']:
    filtered_text = ' '.join([word for word in text.split() if word not in stop_words])
    filtered_texts.append(filtered_text)
combined_df['text'] = filtered_texts

In [17]:
filtered_titles = []
for title in combined_df['title']:
    filtered_title = ' '.join([word for word in title.split() if word not in stop_words])
    filtered_titles.append(filtered_title)
combined_df['title'] = filtered_titles
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,grandmas grandpas travel ban states welcome us...,washington reuters grandparents us citizens si...,politicsNews,"July 17, 2017",0
1,lexisnexis withdrew two products chinese market,london reuters lexisnexis provider legal regul...,worldnews,"August 22, 2017",0
2,white house budget chief expects delay hitting...,washington reuters white house budget chief mi...,politicsNews,"February 28, 2017",0
3,indian prime minister modi visit washington ye...,washington reuters us president donald trump s...,politicsNews,"March 28, 2017",0
4,list companies caved leftistspulled ads hannit...,companies including carscom peloton leesa slee...,left-news,"May 25, 2017",1


In [18]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [19]:
# Lemmatize text
lemmatizer = WordNetLemmatizer()

lemmatized_texts = []
for text in combined_df['text']:
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    lemmatized_texts.append(lemmatized_text)
combined_df['text'] = lemmatized_texts

In [20]:
# Lemmatize title
lemmatized_titles = []
for title in combined_df['title']:
    lemmatized_title = ' '.join([lemmatizer.lemmatize(word) for word in title.split()])
    lemmatized_titles.append(lemmatized_title)
combined_df['title'] = lemmatized_titles

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,grandma grandpa travel ban state welcome u cable,washington reuters grandparent u citizen six m...,politicsNews,"July 17, 2017",0
1,lexisnexis withdrew two product chinese market,london reuters lexisnexis provider legal regul...,worldnews,"August 22, 2017",0
2,white house budget chief expects delay hitting...,washington reuters white house budget chief mi...,politicsNews,"February 28, 2017",0
3,indian prime minister modi visit washington ye...,washington reuters u president donald trump sp...,politicsNews,"March 28, 2017",0
4,list company caved leftistspulled ad hannity s...,company including carscom peloton leesa sleep ...,left-news,"May 25, 2017",1


Model Building
1) Machine Learning model
   - Random Forest
2) Deep Learning model
   - LSTM
3) Convolutional Neural Network model

In [21]:
# Combine the title, text, and subject columns into a single column
combined_df['combined_text'] = combined_df['title'] + ' ' + combined_df['text'] + ' ' + combined_df['subject']

In [22]:
# Text preprocessing and TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X = vectorizer.fit_transform(combined_df['combined_text'])
y = combined_df['label']

In [48]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [60]:
# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [62]:
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 0.9835189309576837


In [63]:
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      4314
           1       1.00      0.97      0.98      4666

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



LSTM Model

In [40]:
# Tokenize the text
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(combined_df['combined_text'])
X = tokenizer.texts_to_sequences(combined_df['combined_text'])

In [41]:
# Pad the sequences
max_len = 100
X = pad_sequences(X, maxlen=max_len)

In [42]:
y = combined_df['label'].values

In [64]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [65]:
#LSTM Model 
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [66]:
# Train the model
model.fit(X_train, y_train,
          batch_size=32,
          epochs=5,
          validation_data=(X_test, y_test),
          verbose=2)

Epoch 1/5
1123/1123 - 115s - loss: 0.0229 - accuracy: 0.9970 - val_loss: 8.0527e-05 - val_accuracy: 1.0000 - 115s/epoch - 102ms/step
Epoch 2/5
1123/1123 - 119s - loss: 3.1247e-05 - accuracy: 1.0000 - val_loss: 1.4239e-05 - val_accuracy: 1.0000 - 119s/epoch - 106ms/step
Epoch 3/5
1123/1123 - 119s - loss: 9.6868e-06 - accuracy: 1.0000 - val_loss: 6.5079e-06 - val_accuracy: 1.0000 - 119s/epoch - 106ms/step
Epoch 4/5
1123/1123 - 116s - loss: 4.3523e-06 - accuracy: 1.0000 - val_loss: 3.5974e-06 - val_accuracy: 1.0000 - 116s/epoch - 103ms/step
Epoch 5/5
1123/1123 - 146s - loss: 2.4249e-06 - accuracy: 1.0000 - val_loss: 2.0984e-06 - val_accuracy: 1.0000 - 146s/epoch - 130ms/step


<keras.src.callbacks.History at 0x1a8dd6f1d90>

In [67]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")



In [68]:
print("LSTM Model Accuracy:", accuracy_score(y_test, y_pred))

LSTM Model Accuracy: 1.0


In [69]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4314
           1       1.00      1.00      1.00      4666

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



Convolutional Neural Network Model

In [52]:
# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(combined_df['label'])

In [54]:
# CNN Model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [55]:
# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test), verbose=2)

Epoch 1/5
1123/1123 - 39s - loss: 0.0491 - accuracy: 0.9727 - val_loss: 4.3163e-05 - val_accuracy: 1.0000 - 39s/epoch - 35ms/step
Epoch 2/5
1123/1123 - 25s - loss: 7.3830e-06 - accuracy: 1.0000 - val_loss: 1.0454e-06 - val_accuracy: 1.0000 - 25s/epoch - 23ms/step
Epoch 3/5
1123/1123 - 26s - loss: 1.6799e-06 - accuracy: 1.0000 - val_loss: 5.2580e-07 - val_accuracy: 1.0000 - 26s/epoch - 23ms/step
Epoch 4/5
1123/1123 - 37s - loss: 9.2485e-07 - accuracy: 1.0000 - val_loss: 2.6016e-07 - val_accuracy: 1.0000 - 37s/epoch - 33ms/step
Epoch 5/5
1123/1123 - 26s - loss: 6.9820e-07 - accuracy: 1.0000 - val_loss: 1.2794e-07 - val_accuracy: 1.0000 - 26s/epoch - 24ms/step


<keras.src.callbacks.History at 0x1a8dd316c90>

In [56]:
# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
#y_pred



In [57]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 1.0000


In [59]:
# Print classification report
target_names = ['True', 'Fake']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

        True       1.00      1.00      1.00      4314
        Fake       1.00      1.00      1.00      4666

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

