In [29]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
true_df = pd.read_csv('/content/drive/MyDrive/Spring_board2/True.csv')

fake_df = pd.read_csv('/content/drive/MyDrive/Spring_board2/Fake.csv')

In [9]:
# Add a column to label the data
true_df['label'] = 0
fake_df['label'] = 1

In [10]:
# Combine the datasets
combined_df = pd.concat([true_df, fake_df], ignore_index=True)


# Save the combined dataset to a new CSV file
combined_df.to_csv('/content/drive/MyDrive/Spring_board2/Combined.csv', index=False)

In [11]:
combined_df.shape


(44898, 5)

In [12]:
combined_df.head

In [13]:
combined_df.describe

In [14]:
#converting column "text" to lower case

combined_df['text'] = combined_df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
combined_df.head()

#Converting "title" column to lower case

combined_df['title'] = combined_df['title'].apply(lambda x: " ".join(x.lower() for x in x.split()))
combined_df.head()

# Remove links from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"as u.s. budget fight looms, republicans flip t...",washington (reuters) - the head of a conservat...,politicsNews,"December 31, 2017",0
1,u.s. military to accept transgender recruits o...,washington (reuters) - transgender people will...,politicsNews,"December 29, 2017",0
2,senior u.s. republican senator: 'let mr. muell...,washington (reuters) - the special counsel inv...,politicsNews,"December 31, 2017",0
3,fbi russia probe helped by australian diplomat...,washington (reuters) - trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,trump wants postal service to charge 'much mor...,seattle/washington (reuters) - president donal...,politicsNews,"December 29, 2017",0


In [15]:
# Remove newlines from text and title

combined_df['text'] = combined_df['text'].str.replace('\n', ' ')

combined_df['title'] = combined_df['title'].str.replace('\n', ' ')

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"as u.s. budget fight looms, republicans flip t...",washington (reuters) - the head of a conservat...,politicsNews,"December 31, 2017",0
1,u.s. military to accept transgender recruits o...,washington (reuters) - transgender people will...,politicsNews,"December 29, 2017",0
2,senior u.s. republican senator: 'let mr. muell...,washington (reuters) - the special counsel inv...,politicsNews,"December 31, 2017",0
3,fbi russia probe helped by australian diplomat...,washington (reuters) - trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,trump wants postal service to charge 'much mor...,seattle/washington (reuters) - president donal...,politicsNews,"December 29, 2017",0


In [16]:
# Remove words containing numbers from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"as u.s. budget fight looms, republicans flip t...",washington (reuters) - the head of a conservat...,politicsNews,"December 31, 2017",0
1,u.s. military to accept transgender recruits o...,washington (reuters) - transgender people will...,politicsNews,"December 29, 2017",0
2,senior u.s. republican senator: 'let mr. muell...,washington (reuters) - the special counsel inv...,politicsNews,"December 31, 2017",0
3,fbi russia probe helped by australian diplomat...,washington (reuters) - trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,trump wants postal service to charge 'much mor...,seattle/washington (reuters) - president donal...,politicsNews,"December 29, 2017",0


In [17]:
# Remove extra spaces from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,"as u.s. budget fight looms, republicans flip t...",washington (reuters) - the head of a conservat...,politicsNews,"December 31, 2017",0
1,u.s. military to accept transgender recruits o...,washington (reuters) - transgender people will...,politicsNews,"December 29, 2017",0
2,senior u.s. republican senator: 'let mr. muell...,washington (reuters) - the special counsel inv...,politicsNews,"December 31, 2017",0
3,fbi russia probe helped by australian diplomat...,washington (reuters) - trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,trump wants postal service to charge 'much mor...,seattle/washington (reuters) - president donal...,politicsNews,"December 29, 2017",0


In [18]:
# Remove special characters from text and title

combined_df['text'] = combined_df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

combined_df['title'] = combined_df['title'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

combined_df.head()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Remove stop words from text
filtered_texts = []
for text in combined_df['text']:
    filtered_text = ' '.join([word for word in text.split() if word not in stop_words])
    filtered_texts.append(filtered_text)
combined_df['text'] = filtered_texts
filtered_titles = []
for title in combined_df['title']:
    filtered_title = ' '.join([word for word in title.split() if word not in stop_words])
    filtered_titles.append(filtered_title)
combined_df['title'] = filtered_titles
combined_df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,title,text,subject,date,label
0,us budget fight looms republicans flip fiscal ...,washington reuters head conservative republica...,politicsNews,"December 31, 2017",0
1,us military accept transgender recruits monday...,washington reuters transgender people allowed ...,politicsNews,"December 29, 2017",0
2,senior us republican senator let mr mueller job,washington reuters special counsel investigati...,politicsNews,"December 31, 2017",0
3,fbi russia probe helped australian diplomat ti...,washington reuters trump campaign adviser geor...,politicsNews,"December 30, 2017",0
4,trump wants postal service charge much amazon ...,seattlewashington reuters president donald tru...,politicsNews,"December 29, 2017",0


In [19]:
nltk.download('wordnet')
nltk.download('omw-1.4')
# Lemmatize text
lemmatizer = WordNetLemmatizer()

lemmatized_texts = []
for text in combined_df['text']:
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    lemmatized_texts.append(lemmatized_text)
combined_df['text'] = lemmatized_texts
# Lemmatize title
lemmatized_titles = []
for title in combined_df['title']:
    lemmatized_title = ' '.join([lemmatizer.lemmatize(word) for word in title.split()])
    lemmatized_titles.append(lemmatized_title)
combined_df['title'] = lemmatized_titles

combined_df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,title,text,subject,date,label
0,u budget fight loom republican flip fiscal script,washington reuters head conservative republica...,politicsNews,"December 31, 2017",0
1,u military accept transgender recruit monday p...,washington reuters transgender people allowed ...,politicsNews,"December 29, 2017",0
2,senior u republican senator let mr mueller job,washington reuters special counsel investigati...,politicsNews,"December 31, 2017",0
3,fbi russia probe helped australian diplomat ti...,washington reuters trump campaign adviser geor...,politicsNews,"December 30, 2017",0
4,trump want postal service charge much amazon s...,seattlewashington reuters president donald tru...,politicsNews,"December 29, 2017",0


In [20]:
# Combine the title, text, and subject columns into a single column
combined_df['combined_text'] = combined_df['title'] + ' ' + combined_df['text'] + ' ' + combined_df['subject']

In [21]:
# Text preprocessing and TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X = vectorizer.fit_transform(combined_df['combined_text'])
y = combined_df['label']

In [22]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [23]:

from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Evaluate the model on the test data
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9902004454342984


In [25]:
import numpy as np
# Create a tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_df['combined_text'])

# Get the word index
word_index = tokenizer.word_index

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(combined_df['combined_text'])

# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=100)

# Create a model
model = Sequential()
model.add(Embedding(len(word_index) + 1, 128, input_length=100))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(padded_sequences, y, epochs=5, batch_size=128, validation_split=0.2)

# Evaluate the model
y_pred = model.predict(padded_sequences)
y_pred = np.round(y_pred)
accuracy = accuracy_score(y, y_pred)
print("Accuracy:", accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.9948104592632189
