In [1]:
import pandas as pd
import numpy as np
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing the stopwords of the english 
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
df=pd.read_csv('FakeNewsNet.csv')

In [5]:
df.shape

(23196, 5)

In [6]:
df.sample(10)

Unnamed: 0,title,news_url,source_domain,tweet_num,real
2257,"Angelina Jolie, 'Focused on Healing Her Family...",people.com/movies/angelina-jolie-splits-lawyer...,people.com,57,0
8056,"Coachella 2018 lineup: Beyoncé, Eminem and The...",https://www.usatoday.com/story/life/music/2018...,www.usatoday.com,678,1
2757,Did Demi Lovato Throw Shade at Kathy Griffin?,https://www.usmagazine.com/celebrity-news/news...,www.usmagazine.com,114,1
11342,Ben Affleck treats cast and crew of 500 to Haw...,www.dailymail.co.uk/tvshowbiz/article-5645343/...,www.dailymail.co.uk,11,0
652,Big shoes to fill! AnnaSophia Robb swaps her h...,https://www.dailymail.co.uk/tvshowbiz/article-...,www.dailymail.co.uk,65,1
16617,Esteban Loaiza,https://en.wikipedia.org/wiki/Esteban_Loaiza,en.wikipedia.org,17,1
5536,Jill Zarin Net Worth,https://www.celebritynetworth.com/richest-cele...,www.celebritynetworth.com,1,1
17745,"You Are My Friend News, Casting, Release Date",https://www.townandcountrymag.com/leisure/arts...,www.townandcountrymag.com,76,1
10881,BUSTED: Russian Mansions Obama Seized Were Mea...,dailyfeed.news/russian-mansions-obama-seized-w...,dailyfeed.news,42,0
20069,Justin Bieber and Selena Gomez Go to the Airpo...,https://frostsnow.com/justin-bieber-and-selena...,frostsnow.com,113,1


In [7]:
df.describe()

Unnamed: 0,tweet_num,real
count,23196.0,23196.0
mean,88.956803,0.751897
std,488.694592,0.431921
min,0.0,0.0
25%,11.0,1.0
50%,37.0,1.0
75%,65.0,1.0
max,29060.0,1.0


In [8]:
df.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [9]:
# fill the null values with the empty string by making the new dataframe
df_1=df.fillna(' ')

In [10]:
df_1.isnull().sum()

title            0
news_url         0
source_domain    0
tweet_num        0
real             0
dtype: int64

In [11]:
# merging the two cols title and source_domain as content
df_1['content']=df_1['source_domain']+" "+df_1['title']

In [12]:
df_1.sample(5)

Unnamed: 0,title,news_url,source_domain,tweet_num,real,content
8826,Amal Clooney Wears a Lingerie-Inspired Top Lik...,https://www.kartiavelino.com/2018/04/amal-cloo...,www.kartiavelino.com,31,1,www.kartiavelino.com Amal Clooney Wears a Ling...
17129,Lena Headey Recalls How Harvey Weinstein Made ...,https://www.huffingtonpost.com/entry/game-of-t...,www.huffingtonpost.com,23,1,www.huffingtonpost.com Lena Headey Recalls How...
3288,Nick Viall Regrets Telling Ashley Iaconetti Sh...,https://www.usmagazine.com/celebrity-news/news...,www.usmagazine.com,62,1,www.usmagazine.com Nick Viall Regrets Telling ...
14634,Zac Efron searches for love on Room Raiders,https://ew.com/tv/2017/04/24/zac-efron-room-ra...,ew.com,59,1,ew.com Zac Efron searches for love on Room Rai...
11045,Pregnant Khloe Kardashian Shows Off Bare Baby ...,https://www.usmagazine.com/celebrity-moms/news...,www.usmagazine.com,13,1,www.usmagazine.com Pregnant Khloe Kardashian S...


In [13]:
df_2=df_1.drop(columns=['source_domain','title'])

In [14]:
df_2.head()

Unnamed: 0,news_url,tweet_num,real,content
0,http://toofab.com/2017/05/08/real-housewives-a...,42,1,toofab.com Kandi Burruss Explodes Over Rape Ac...
1,https://www.today.com/style/see-people-s-choic...,0,1,www.today.com People's Choice Awards 2018: The...
2,https://www.etonline.com/news/220806_sophia_bu...,63,1,www.etonline.com Sophia Bush Sends Sweet Birth...
3,https://www.dailymail.co.uk/news/article-33655...,20,1,www.dailymail.co.uk Colombian singer Maluma sp...
4,https://www.zerchoo.com/entertainment/gossip-g...,38,1,www.zerchoo.com Gossip Girl 10 Years Later: Ho...


In [15]:
X=df_2.drop(columns='real')
y=df_2.iloc[:,[2]]

stemming : stemming is processs of reducing word to its root word 
example : actor , actress , acting --> act

In [16]:
port_stem=PorterStemmer()

In [17]:
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content) # replace words other then the alphabet with space 
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content


In [18]:
df_2['content']=df_2['content'].apply(stemming)

In [19]:
print(df_2['content'])

0        toofab com kandi burruss explod rape accus rea...
1        www today com peopl choic award best red carpe...
2        www etonlin com sophia bush send sweet birthda...
3        www dailymail co uk colombian singer maluma sp...
4        www zerchoo com gossip girl year later upper e...
                               ...                        
23191    www express co uk pippa middleton wed case mis...
23192    hollywoodlif com zayn malik gigi hadid shock s...
23193    www justjar com jessica chastain recal moment ...
23194    www intouchweekli com tristan thompson feel du...
23195    www billboard com kelli clarkson perform medle...
Name: content, Length: 23196, dtype: object


In [36]:
X1=df_2['content'].values
y1=df_2['real'].values

In [37]:
X1,y1


(array(['toofab com kandi burruss explod rape accus real housew atlanta reunion video',
        'www today com peopl choic award best red carpet look',
        'www etonlin com sophia bush send sweet birthday messag one tree hill co star hilari burton breyton eva',
        ...,
        'www justjar com jessica chastain recal moment mother boyfriend slap kick genit',
        'www intouchweekli com tristan thompson feel dump khlo kardashian refus let move la home exclus',
        'www billboard com kelli clarkson perform medley kendrick lamar humbl hit billboard music award'],
       dtype=object),
 array([1, 1, 1, ..., 1, 0, 1]))

In [38]:
# converting the textual data into numerical so the machine can understand
vectorizer=TfidfVectorizer()
X1=vectorizer.fit_transform(X1)
print(X1)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 246417 stored elements and shape (23196, 14435)>
  Coords	Values
  (0, 12980)	0.2946734912504806
  (0, 2460)	0.0508218665886508
  (0, 6647)	0.38749399933436923
  (0, 1707)	0.38749399933436923
  (0, 4193)	0.35506164978733906
  (0, 10223)	0.302158398468634
  (0, 59)	0.2597097788538541
  (0, 10276)	0.23128301802698967
  (0, 5891)	0.26124620267491605
  (0, 651)	0.30737565657585053
  (0, 10569)	0.2543303205934995
  (0, 13681)	0.21108934405813046
  (1, 2460)	0.08159478582438673
  (1, 14274)	0.10189043319535994
  (1, 12946)	0.3681704416327513
  (1, 9476)	0.2369741457556781
  (1, 2219)	0.42251489448694257
  (1, 729)	0.3083771474680274
  (1, 1101)	0.3384429489202671
  (1, 10347)	0.37702288594702094
  (1, 1916)	0.3875096512688879
  (1, 7432)	0.3328445485941059
  (2, 2460)	0.04356960323673351
  (2, 14274)	0.05440697837598742
  (2, 4091)	0.16595541576226283
  :	:
  (23194, 3695)	0.31679451408881903
  (23194, 4159)	0.24664304708295876
  

***train -test - split***

In [39]:
X_train,X_test,y_train,y_test=train_test_split(X1,y1,test_size=0.2,stratify=y,random_state=42)

In [40]:
model=LogisticRegression()

In [41]:
model.fit(X_train,y_train)

In [42]:
y_pred=model.predict(X_test)
accuracy_score(y_pred,y_test)

0.8620689655172413

**predictive system**

In [48]:
X_new=X_test[0]
prediction=model.predict(X_new)
print(prediction)
if(prediction[0]==0):
    print('real')
else:
    print('fake')    


[1]
fake


In [49]:
print(y_test[0])

1
