In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Fetch the NLTK stop-word corpus (the recorded run reports it was already up to date).
# NOTE(review): this import sits outside the notebook's main import cell — consider moving it there.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the English stop-word list; `stemming` later filters tokens against this same list.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
# Load the news dataset from a CSV file (relative path) into a pandas DataFrame.
news_dataset = pd.read_csv('fake_news_cleaned_data.csv')

In [5]:
# Preview the first rows to see which columns were loaded.
news_dataset.head()

Unnamed: 0,title,text,date,source,author,category,label,content,clean_content
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real,Foreign Democrat final. more tax development b...,foreign democrat final tax development store a...
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake,To offer down resource great point. probably g...,offer resource great point probably guess west...
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake,Himself church myself carry. them identify for...,church carry identify forward present success ...
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake,You unit its should. phone which item yard Rep...,unit phone item yard republican safe police id...
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake,Billion believe employee summer how. wonder my...,billion believe employee summer wonder fact di...


In [12]:

# Re-read the CSV and encode the label column in one chain:
# whitespace/case are normalised first, then 'real' -> 0 and 'fake' -> 1.
news_dataset = pd.read_csv('fake_news_cleaned_data.csv').assign(
    label=lambda frame: (
        frame['label']
        .astype(str)
        .str.strip()
        .str.lower()
        .map({'real': 0, 'fake': 1})
    )
)

# Any value outside the mapping becomes NaN, which fails this membership check.
assert news_dataset['label'].isin([0, 1]).all(), "Unexpected labels found; expected 'real' or 'fake'"


In [18]:
# Keep only the columns shown in the recorded output below (title, author,
# label, text) and expose the pre-cleaned body ('clean_content') as 'text'.
# The raw CSV already has its own 'text' column, so renaming without dropping
# it first would leave two columns called 'text' on a fresh Restart & Run All.
# The guard makes the cell a no-op when it is re-run after the rename.
if 'clean_content' in news_dataset.columns:
    news_dataset = (
        news_dataset[['title', 'author', 'label', 'clean_content']]
        .rename(columns={'clean_content': 'text'})
    )

In [19]:
# Preview the frame again after the column rename.
news_dataset.head()

Unnamed: 0,title,author,label,text
0,Foreign Democrat final.,Paula George,0,foreign democrat final tax development store a...
1,To offer down resource great point.,Joseph Hill,1,offer resource great point probably guess west...
2,Himself church myself carry.,Julia Robinson,1,church carry identify forward present success ...
3,You unit its should.,Mr. David Foster DDS,1,unit phone item yard republican safe police id...
4,Billion believe employee summer how.,Austin Walker,1,billion believe employee summer wonder fact di...


In [20]:
# Per-column count of missing values.
news_dataset.isna().sum()

title        0
author    1000
label        0
text         0
dtype: int64

In [21]:
# Replace missing values with empty strings (the recorded null count shows them
# only in `author`) so that concatenating author and title does not produce NaN.
news_dataset = news_dataset.fillna('')

In [22]:
# Build the model input text: "<author> <title>", joined by a single space.
news_dataset['content'] = news_dataset['author'].add(' ').add(news_dataset['title'])

In [23]:
# Target is the encoded label; features are every remaining column.
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [24]:
# Inspect the feature frame and the label series (plain-text, truncated by pandas).
print(X)
print(Y)

                                       title                author  \
0                    Foreign Democrat final.          Paula George   
1        To offer down resource great point.           Joseph Hill   
2               Himself church myself carry.        Julia Robinson   
3                       You unit its should.  Mr. David Foster DDS   
4       Billion believe employee summer how.         Austin Walker   
...                                      ...                   ...   
19995                      House party born.            Gary Miles   
19996  Though nation people maybe price box.         Maria Mcbride   
19997        Yet exist with experience unit.      Kristen Franklin   
19998               School wide itself item.            David Wise   
19999         Offer chair cover senior born.        James Peterson   

                                                    text  \
0      foreign democrat final tax development store a...   
1      offer resource great point proba

In [25]:
# Shared Porter stemmer instance; `stemming` calls its .stem() on every kept token.
port_stem = PorterStemmer()

In [26]:
# Build the stop-word lookup once. Previously `stopwords.words('english')` was
# evaluated inside the comprehension, i.e. once per token, and membership was
# tested against a list (linear scan). A frozenset gives the same answers.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text string into space-separated Porter stems.

    Steps: replace every non-ASCII-letter character with a space, lower-case,
    split on whitespace, drop English stop words, stem what remains with the
    module-level `port_stem`, and re-join with single spaces.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ('' if nothing survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(token) for token in tokens if token not in ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [27]:
# Stem every row of the combined author/title text.
# NOTE(review): the column is overwritten in place, so re-running this cell
# feeds already-stemmed text through `stemming` again.
news_dataset['content'] = news_dataset['content'].map(stemming)

In [28]:
# Pull the model inputs out of the frame as arrays:
# X is the stemmed text, Y is the encoded label.
X = news_dataset.loc[:, 'content'].values
Y = news_dataset.loc[:, 'label'].values

In [29]:
# Inspect the stemmed text array and the label array.
print(X)
print(Y)

['paula georg foreign democrat final'
 'joseph hill offer resourc great point' 'julia robinson church carri' ...
 'kristen franklin yet exist experi unit' 'david wise school wide item'
 'jame peterson offer chair cover senior born']
[0 1 1 ... 0 1 1]


In [30]:
# Number of labels (one per row of the dataset).
Y.shape

(20000,)

In [31]:
# converting the textual data to numerical data (TF-IDF features)
# The corpus is read from the DataFrame rather than from `X`: this cell rebinds
# `X` to the sparse matrix, so fitting on `X` would fail if the cell were re-run.
# On a top-to-bottom run both hold the same stemmed text.
# NOTE(review): the vocabulary and IDF weights are learned from all rows,
# including those that later land in the test split. Fit on the training rows
# only if a leakage-free evaluation is required.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_dataset['content'].values)

In [32]:
# Inspect the TF-IDF matrix (prints its sparse representation).
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 136889 stored elements and shape (20000, 2320)>
  Coords	Values
  (0, 553)	0.39378500362567437
  (0, 729)	0.38563113779991653
  (0, 760)	0.4303325215423282
  (0, 820)	0.4702667357326091
  (0, 1607)	0.5384098017728571
  (1, 855)	0.4075140492253222
  (1, 945)	0.4349807003583625
  (1, 1106)	0.39395595700897607
  (1, 1544)	0.4059231913258755
  (1, 1650)	0.40644929622403225
  (1, 1761)	0.3994347584771229
  (2, 340)	0.4619493340010572
  (2, 402)	0.46849498889312446
  (2, 1116)	0.5803950203356232
  (2, 1802)	0.4798508920150325
  (3, 529)	0.3893432059572742
  (3, 535)	0.46879786583185773
  (3, 765)	0.5331103685464007
  (3, 1474)	0.36176526198760245
  (3, 2168)	0.4621251552704844
  (4, 128)	0.4431492340687908
  (4, 182)	0.39333180511748317
  (4, 209)	0.39858312187055117
  (4, 655)	0.41121899358509134
  (4, 2054)	0.3943487167650092
  :	:
  (19996, 1316)	0.38290410625708515
  (19996, 1345)	0.3396980161532583
  (19996, 1349)	0.446572338

In [33]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [34]:
# Fit a logistic-regression classifier (default settings) on the training split.
model = LogisticRegression()
model.fit(X_train, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [35]:
# accuracy score on the training data
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first and the predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [36]:
# Report the accuracy measured on the training split.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.6490625


In [37]:
# accuracy score on the test data
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first and the predictions second.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [38]:
# Report the accuracy measured on the held-out test split.
# NOTE(review): the recorded run shows about 0.65 on train but about 0.49 on
# test, i.e. close to 0.5 and well below the training score — worth
# investigating before trusting the single-sample demo that follows.
print('Accuracy score of the test data : ', test_data_accuracy)



Accuracy score of the test data :  0.4945


In [39]:
# Classify one held-out sample (row 3 of the test matrix) and print the verdict.
X_new = X_test[3]
prediction = model.predict(X_new)
print(prediction)

# Label encoding: 0 means real; anything else is reported as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[0]
The news is Real


In [40]:
# Ground-truth label of the same test sample (index 3), for comparison with the prediction.
print(Y_test[3])

0
