## Adding stopwords


In [1]:
from nltk.corpus import stopwords

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Getting the data

In [8]:
import pandas as pd
data = pd.read_csv('/content/FakeNewsNet.csv')
data.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [9]:
#checking how many null value are there
data.isnull().sum()

Unnamed: 0,0
title,0
news_url,330
source_domain,330
tweet_num,0
real,0


In [10]:
data.shape

(23196, 5)

In [11]:
#filling the null values with blankspaces
data = data.fillna("")

In [12]:
#making 1 value for textual data
data["content"] = data['title'] + data['source_domain']
data.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real,content
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1,Kandi Burruss Explodes Over Rape Accusation on...
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1,People's Choice Awards 2018: The best red carp...
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1,Sophia Bush Sends Sweet Birthday Message to 'O...
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1,Colombian singer Maluma sparks rumours of inap...
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1,Gossip Girl 10 Years Later: How Upper East Sid...


## Stemming the data
Stem is the process of converting words with same meaning into one

In [13]:
from nltk.stem.porter import PorterStemmer
port_stem = PorterStemmer()
import re

In [14]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [15]:
data['content'] = data['content'].apply(stemming)

In [24]:
x = data['content'].values
y = data['real'].values

## Converting the text into vector form

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer()
vector.fit(x)
x = vector.transform(x)

In [30]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 229969 stored elements and shape (23196, 21873)>
  Coords	Values
  (0, 101)	0.2560447024621976
  (0, 1075)	0.3038219190961781
  (0, 2659)	0.3770079843998082
  (0, 3838)	0.049450906201026955
  (0, 6471)	0.354692046772145
  (0, 9063)	0.2573386437226611
  (0, 10159)	0.3770079843998082
  (0, 15461)	0.2972956472514076
  (0, 15537)	0.22547839257851046
  (0, 16020)	0.2669073046379546
  (0, 19705)	0.3196115848790708
  (0, 20720)	0.22532645570900353
  (1, 1187)	0.2999922938709716
  (1, 1779)	0.30177156263264854
  (1, 2970)	0.36313707108189985
  (1, 3461)	0.37773997199945114
  (1, 3838)	0.0724946217871283
  (1, 11275)	0.4860091322682343
  (1, 14278)	0.28543182293885067
  (1, 15639)	0.334944629545033
  (1, 19647)	0.3305494690976824
  (2, 1939)	0.20182479556112762
  (2, 2420)	0.3715465468941152
  (2, 2664)	0.33214118532372394
  (2, 2667)	0.2678738086883315
  :	:
  (23194, 6397)	0.244143083727504
  (23194, 6794)	0.26716022801708195
  (23

##Splitting the data


In [38]:
from sklearn.model_selection import train_test_split
x_test, x_train, y_test, y_train = train_test_split(x, y, train_size=0.2, stratify=y, random_state=2)
print(x.shape, x_test.shape, x_train.shape)

(23196, 21873) (4639, 21873) (18557, 21873)


##Making the model Logistic Regression

In [39]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)

In [40]:
predict = model.predict(x_test)

##Checking the accuracy of the model

In [41]:
from sklearn.metrics import accuracy_score
accurate = accuracy_score(predict, y_test)
print(f"The accuracy is {accurate :.2f}")

The accuracy is 0.84


##Predicting with real time data

In [50]:
k = 180
input = x_test[k]
pre = model.predict(input)
if pre[0]== '1':
  print("The news is real")
else:
  print("The news is fake")
print(y_test[k])

The news is fake
0
