In [1]:
# Importing Libraries
import pandas as pd
import numpy as np

True_news = pd.read_csv("Real.csv")
Fake_news = pd.read_csv("Fake.csv")
True_news['label'] = 0
Fake_news['label'] = 1

dataset1 = True_news[['text', 'label']]
dataset2 = Fake_news[['text', 'label']]

dataset = pd.concat([dataset1, dataset2])
dataset.shape
dataset.isnull().sum() # To check is dataset has Null Values

dataset['label'].value_counts()
dataset = dataset.sample(frac = 1) # Shuffling all the values


In [2]:
# Computer cant understand TEXT data so we use NLP

# NLP Process
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Download the required NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')  # Download the stopwords resource

# re is regex (for cleaning)
ps = WordNetLemmatizer()
stopwords = stopwords.words('english')




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean_row(row):
    row = row.lower()
    row = re.sub('[^a-zA-Z]', ' ', row)
    token = row.split()

    news = [ps.lemmatize(word) for word in token if word not in stopwords]
    cleaned_news = ' '.join(news)
    return cleaned_news


dataset['text'] = dataset['text'].apply(lambda x: clean_row(x))
print(dataset['text'])

vectorizer = TfidfVectorizer(max_features=50000, lowercase=False, ngram_range=(1, 2))  # Corrected argument

X = dataset.iloc[:35000, 0]
y = dataset.iloc[:35000, 1]

from sklearn.model_selection import train_test_split
train_data, test_data, train_label, test_label = train_test_split(X, y, test_size=0.2, random_state=0)


4069     reuters u commerce secretary wilbur ross expec...
11124    san juan washington reuters democratic u senat...
17873    rome reuters italian government called tuesday...
8957     working service industry really test eventuall...
15570    marine veteran ran constitutional sheriff new ...
                               ...                        
6585     old navy posted completely innocent photo inte...
11099    washington reuters u democratic presidential c...
6838     age old question haunting american people seve...
4603     washington reuters native american tribe monta...
17355                                                     
Name: text, Length: 44898, dtype: object


In [4]:
# Vectorize the trained data
vec_train_data = vectorizer.fit_transform(train_data)
vec_train_data = vec_train_data.toarray()
vec_test_data = vectorizer.fit_transform(test_data)
vec_test_data = vec_test_data.toarray()
vec_train_data.shape, vec_test_data.shape

((28000, 50000), (7000, 50000))

In [5]:
# We use NLP rather than One Hot Encoder because we have so many Rows
train_data = pd.DataFrame(vec_train_data, columns = vectorizer.get_feature_names_out())
testing_data = pd.DataFrame(vec_test_data, columns = vectorizer.get_feature_names_out())

train_data

Unnamed: 0,aa,aapi,aapl,aaron,aaron bernstein,aaron burr,aarp,ab,ababa,aback,...,zor eastern,zor province,zucker,zuckerberg,zuckerberg said,zuma,zuma said,zurich,zurich reuters,zweiman
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.412776,0.0,0.0
27996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
27997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
27998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [6]:
# Using Models (Modelling)

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_data, train_label)
y_pred = clf.predict(testing_data)

test_label


Unnamed: 0,label
2415,0
7124,0
2427,1
8363,1
18066,1
...,...
14724,1
12137,1
6699,0
14307,0


In [7]:
y_pred

array([0, 1, 1, ..., 1, 0, 0])

In [12]:
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

accuracy_score(test_label, y_pred)
y_pred_train = clf.predict(train_data)
accuracy_score(train_label, y_pred_train)

txt = 'news headline any'
news = clean_row(txt)
news
pred = clf.predict(vectorizer.transform([news]).toarray())
pred

txt = input("Enter the News Headline - ")
news = clean_row(str(txt))
pred = clf.predict(vectorizer.transform([news]).toarray())
print("\n")
if pred == 0:
 print("Wrong")
else:
 print("Correct")



Enter the News Headline - "After an awful campaign filled with hateful rhetoric, American voters elected a man to lead the most powerful country on earth even after he was accused of raping a 13-year-old. The year was 2016 and the accused was an alleged billionaire, former reality show star and an admitted sexual predator. Still, even after the revelations, conservatives saw nothing wrong with Donald Trump s behavior. The plaintiff described the horrifying incident in which Trump and his friend Jeffrey Epstein allegedly raped a child. A lawsuit was filed which claims that threats were made in order for the victim to keep her mouth shut about what had just happened.Both Defendants let Plaintiff know that each was a very wealthy, powerful man and indicated that they had the power, ability, and means to carry out their threats. Indeed, Defendant Trump stated that Plaintiff shouldn t ever say anything if she didn t want to disappear like Maria, a 12-year-old female that was forced to be in