# Importing Libraries And DataSet

In [26]:
import numpy as np #provides mathematical arrays and support for arrays
import pandas as pd #used for data pre-processing
import re #support for regular expression (searching)
from nltk.corpus import stopwords # the,for,in,of -> stopwords (common english words)
from nltk.stem.porter import PorterStemmer #reducing words to their word stem or root form (played,playing == play)
from sklearn.feature_extraction.text import TfidfVectorizer #used for converting a collection of raw documents into a matrix(play = [0.45878644]) 
from sklearn.model_selection import train_test_split #used to split arrays into subsets
from sklearn.linear_model import LogisticRegression #algorithm used for modeling binary outcomes
from sklearn.metrics import accuracy_score #used to compute the accuracy classification score
import streamlit as st # Used for creating web applications with simple Python scripts.

In [9]:
# Load the WELFake dataset into a DataFrame. The path is relative, so the CSV
# is expected to be in the notebook's working directory.
news_df = pd.read_csv('WELFake_Dataset.csv')

In [10]:
# Preview the first rows to confirm the columns loaded as expected
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


# About The DataSet
- Unnamed: 0: serial number of the news article (this is the dataset's id column; it has no column named `id`)
- title: the title of the news article
- text: the text of the article; may be incomplete
- label: marks whether the news article is real or fake:
  - 1: fake news
  - 0: real news

# Preprocessing the DataSet

In [11]:
# Count the missing (NaN) values in each column
news_df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [14]:
# Replace every missing value with a single space so the string processing
# applied to `title` further down never receives NaN.
news_df = news_df.fillna(' ')

In [15]:
# Re-check after filling: every column should now report 0 missing values
news_df.isna().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [19]:
# Add a `content` column initialised from `title`; the preprocessing below
# rewrites `content` and leaves the original `title` column untouched.
# Note that only the title is used here -- the `text` column is not included.
news_df['content'] = news_df['title']

In [20]:
# View the dataset: a new `content` column (a copy of `title`) has been added
news_df

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,1,,Did they post their votes for Hillary already?,1,
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...
...,...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,Russians steal research on Trump in hack of U....
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,WATCH: Giuliani Demands That Democrats Apolog...
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,Migrants Refuse To Leave Train At Refugee Camp...
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,Trump tussle gives unpopular Mexican leader mu...


# Stemming:

Stemming is the process of reducing a word to its root form by stripping suffixes.

Example: hanged, hanging, hangs → hang (irregular forms such as "hung" are left unchanged by a Porter stemmer)

### Steps:
1. Keep only the letters a-z and A-Z, replacing every other character (such as `/`, `,`, `.`) with a space
2. Convert all words to lower case
3. Split the text into words
4. Remove stopwords
5. Stem each remaining word

In [18]:
ps = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the per-word filter produced a fresh list for every single word and searched
# it linearly; a set built once gives the same result with constant-time lookups.
_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    Steps: replace every non-letter character with a space, lower-case the
    text, split it on whitespace, drop English stop words, and reduce each
    remaining word to its Porter stem.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed words joined by single spaces; an empty string when no
        words survive the filtering.
    """
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in _STOP_WORDS)

In [28]:
# Run the stemming pipeline over every row of `content`.
# NOTE(review): this overwrites `content` in place, so re-running this cell
# without first re-running the cell that creates `content` from `title`
# would stem text that has already been stemmed.
news_df['content'] = news_df['content'].apply(stemming)

In [31]:
# Inspect the column to confirm the stemming was applied
news_df['content']

0        law enforc high alert follow threat cop white ...
1                                                         
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rai hindu use stori christian con...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131           migrant refu leav train refug camp hungari
72132    trump tussl give unpopular mexican leader much...
72133            goldman sach endor hillari clinton presid
Name: content, Length: 72134, dtype: object

# Separating the content and label of the dataset

In [37]:
# Feature input: the stemmed `content` strings as an array
x = news_df['content'].values
# Target: the `label` column as an array (per the dataset notes above: 1 = fake, 0 = real)
y = news_df['label'].values

In [38]:
# Sanity check: x should hold the stemmed strings
print(x)

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'
 ''
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'
 ... 'migrant refu leav train refug camp hungari'
 'trump tussl give unpopular mexican leader much need shot arm'
 'goldman sach endor hillari clinton presid']


# Vectorizing the textual data

In [39]:
# Convert the stemmed strings into a sparse TF-IDF document-term matrix.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split performed later, so the vocabulary and IDF weights are
# learned from the test documents as well -- consider fitting on the training
# split only.
# NOTE(review): `x` is overwritten with the matrix, so this cell cannot be
# re-run on its own; re-run the cell that builds `x` from `content` first.
vector = TfidfVectorizer()
# fit_transform learns the vocabulary/IDF and transforms in a single pass;
# it is equivalent to calling fit(x) followed by transform(x).
x = vector.fit_transform(x)

In [40]:
# Sanity check: x should now print as sparse (row, column) -> weight entries
print(x)

  (0, 18925)	0.19134939529376566
  (0, 18470)	0.1297506867782943
  (0, 17194)	0.2542650376115143
  (0, 17091)	0.24871262252022117
  (0, 9594)	0.22829788917209384
  (0, 7796)	0.26746434949988324
  (0, 6647)	0.48553136502134386
  (0, 6343)	0.28932771754845743
  (0, 5446)	0.31820565801047196
  (0, 3639)	0.24871262252022117
  (0, 1783)	0.33473541566384035
  (0, 402)	0.3190180925014663
  (2, 18470)	0.13443733492985524
  (2, 17862)	0.35962437110547785
  (2, 16284)	0.1999703023632961
  (2, 14935)	0.1609967301122813
  (2, 14434)	0.3580030298678158
  (2, 13446)	0.22687620695463123
  (2, 12612)	0.27904818164471595
  (2, 11892)	0.16878852994653004
  (2, 11748)	0.2231406266784195
  (2, 7928)	0.2692285294185893
  (2, 6797)	0.2652283770602196
  (2, 2895)	0.3639616996972358
  (2, 2651)	0.30809679188606154
  :	:
  (72130, 1749)	0.4930570872695346
  (72130, 755)	0.3981687060056149
  (72131, 17447)	0.3977823726603013
  (72131, 14032)	0.3211557126073331
  (72131, 14030)	0.35376168075989517
  (72131, 1082

# Splitting the data for training and testing the model

In [41]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the label
# proportions the same in both splits, and the fixed `random_state` makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify=y, random_state=2)

# Training the Model: Logistic Regression

In [47]:
# Logistic-regression classifier with scikit-learn's default hyperparameters
model = LogisticRegression()
# Fit on the training split only; as the last expression of the cell, the
# fitted estimator is also displayed.
model.fit(x_train,y_train)