<a href="https://colab.research.google.com/github/Saksham2874/Ml_Systems/blob/main/Fake_News_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import numpy as np
import pandas as pd
import re # for searching words in a text or para
# nltk ~ natural language toolkit
from nltk.corpus import stopwords # for removing unwanted words like the,is,etc
from nltk.stem.porter import PorterStemmer # used to stem words ~ gives root/base word of any particular word
from sklearn.feature_extraction.text import TfidfVectorizer # (Tfidf ~ Term Frequency–Inverse Document Frequency) transforms text into a numerical representation that shows how important a word is in a document relative to the entire dataset.
from sklearn.model_selection import train_test_split # split dataset into train and test data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from datasets import load_dataset

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# words that 'stopwords' removes from the dataset
print(stopwords.words('english'))
# these words dont have any value to our dataset, so during stemming procedure we'll remove these stopwords from the dataset

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Datacollection and Pre-processing

In [28]:
splits = {'train': 'train.tsv', 'validation': 'validation.tsv', 'test': 'test.tsv'}
try:
    news_dataset = pd.read_csv("hf://datasets/ErfanMoosaviMonazzah/fake-news-detection-dataset-English/" + splits["train"], sep="\t")
except Exception as e:
    print(f"Error loading the dataset: {e}")
    print("Please ensure the dataset is accessible and the file path is correct.")

In [29]:
news_dataset.shape

(30000, 6)

In [30]:
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,2619,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1
1,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0
2,876,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1
3,19963,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0
4,10783,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0


In [31]:
news_dataset.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [32]:
# merging title and subject
news_dataset['content'] = news_dataset['title'] + ' ' + news_dataset['subject']

In [34]:
print(news_dataset['content'])

0        Ex-CIA head says Trump remarks on Russia inter...
1        YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...
2        Federal Reserve governor Powell's policy views...
3        SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...
4        NANCY PELOSI ARROGANTLY DISMISSES Questions on...
                               ...                        
29995    U.S. aerospace industry urges Trump to help Ex...
29996    Highlights: Hong Kong leader Carrie Lam delive...
29997    Obama Literally LAUGHS At Claims That Brexit M...
29998    Syrian army takes full control of Deir al-Zor ...
29999    U.S., Israel sign $38 billion military aid pac...
Name: content, Length: 30000, dtype: object


In [35]:
# seperating the data and label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [36]:
print(X)
print(Y)

       Unnamed: 0                                              title  \
0            2619  Ex-CIA head says Trump remarks on Russia inter...   
1           16043  YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...   
2             876  Federal Reserve governor Powell's policy views...   
3           19963  SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...   
4           10783  NANCY PELOSI ARROGANTLY DISMISSES Questions on...   
...           ...                                                ...   
29995        6880  U.S. aerospace industry urges Trump to help Ex...   
29996       17818  Highlights: Hong Kong leader Carrie Lam delive...   
29997        5689  Obama Literally LAUGHS At Claims That Brexit M...   
29998       15805  Syrian army takes full control of Deir al-Zor ...   
29999        8143  U.S., Israel sign $38 billion military aid pac...   

                                                    text          subject  \
0      Former CIA director John Brennan on Friday cri...  

Stemming Procedure

In [37]:
# actor,actress,acting ~ act
port_stem = PorterStemmer()

In [38]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content


In [39]:
 news_dataset['content'] = news_dataset['content'].apply(stemming)

In [40]:
print(news_dataset['content'])

0        ex cia head say trump remark russia interfer d...
1        believ punish hispan store owner swindl tax pa...
2        feder reserv governor powel polici view word p...
3        scoundrel hillari support start trumpleak camp...
4        nanci pelosi arrogantli dismiss question crook...
                               ...                        
29995    u aerospac industri urg trump help ex im bank ...
29996    highlight hong kong leader carri lam deliv mai...
29997    obama liter laugh claim brexit mean trump win ...
29998    syrian armi take full control deir al zor isla...
29999    u israel sign billion militari aid packag poli...
Name: content, Length: 30000, dtype: object


In [41]:
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [42]:
print(X)

['ex cia head say trump remark russia interfer disgrac politicsnew'
 'believ punish hispan store owner swindl tax payer latest food stamp scam govern news'
 'feder reserv governor powel polici view word politicsnew' ...
 'obama liter laugh claim brexit mean trump win elect video news'
 'syrian armi take full control deir al zor islam state observatori worldnew'
 'u israel sign billion militari aid packag politicsnew']


In [43]:
print(Y)

[1 0 1 ... 0 1 1]


In [46]:
X.shape
Y.shape

(30000,)

In [47]:
# converting textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [48]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 309267 stored elements and shape (30000, 11843)>
  Coords	Values
  (0, 1866)	0.36722313241697907
  (0, 2944)	0.4413058719824344
  (0, 3550)	0.3255159284614211
  (0, 4754)	0.3267213608925313
  (0, 5327)	0.4248428972703751
  (0, 7938)	0.1312138747094723
  (0, 8620)	0.38422875309693744
  (0, 8970)	0.25091956354802575
  (0, 9103)	0.18775290011789209
  (0, 10777)	0.11889806208899584
  (1, 916)	0.24431727943836326
  (1, 3974)	0.26766145283163745
  (1, 4421)	0.15531497583796633
  (1, 4877)	0.28474704111640375
  (1, 5855)	0.24631019736663337
  (1, 7053)	0.0771399684351476
  (1, 7489)	0.2574213130641217
  (1, 7651)	0.3447996968901494
  (1, 8241)	0.2671543745637323
  (1, 9111)	0.2917270121630078
  (1, 9913)	0.29909174875516575
  (1, 10027)	0.2907816838980644
  (1, 10246)	0.37987883339610334
  (1, 10362)	0.19208603186890474
  (2, 3764)	0.3445416808214119
  :	:
  (29997, 6478)	0.42412721239722606
  (29997, 7053)	0.11872478359550279
  (2

In [49]:
# splitting the dataset to training and test data
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

Training the model

In [53]:
model = LogisticRegression()

In [54]:
model.fit(X_train, Y_train)

In [55]:
# accuracy score
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)


In [56]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9999166666666667


In [57]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [58]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  1.0


Predictive System

In [64]:
X_new = X_test[111]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[1]
The news is Fake
