### Importing the Dependencies

In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [55]:
# printing the stopwords in English
stop_words = stopwords.words("english")
print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

### Data Pre-Processing

In [15]:
#loading the dataset into pandas dataframe
news_dataset = pd.read_csv("WELFake_Dataset.csv")

In [17]:
news_dataset.shape

(72134, 4)

In [19]:
# print first 5 rows in the data frame
news_dataset.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [25]:
news_dataset.rename(columns = {"Unnamed: 0": "id"}, inplace = True)

In [27]:
news_dataset.head(5)

Unnamed: 0,id,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [33]:
# counding the number of missing values in the dataset
news_dataset.isnull().sum()

id         0
title    558
text      39
label      0
dtype: int64

In [35]:
# replacing the null values into empty string
news_dataset = news_dataset.fillna("")

In [37]:
news_dataset.isnull().sum()

id       0
title    0
text     0
label    0
dtype: int64

In [41]:
# seperating the data and label
X = news_dataset.drop(columns = "label", axis = 1)
Y = news_dataset["label"]

In [45]:
print(X)
print(Y)

          id                                              title  \
0          0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1          1                                                      
2          2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3          3  Bobby Jindal, raised Hindu, uses story of Chri...   
4          4  SATAN 2: Russia unvelis an image of its terrif...   
...      ...                                                ...   
72129  72129  Russians steal research on Trump in hack of U....   
72130  72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131  72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132  72132  Trump tussle gives unpopular Mexican leader mu...   
72133  72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  
0      No comment is expected from Barack Obama Membe...  
1         Did they post their votes for Hillary already?  
2       Now, most 

### Stemming
#### Stemming is the process to reducing a word to it's root word
##### Example: Actor, Acting, Actress --> Act (prefix and suffix will be removed and main root word will be captured)

In [53]:
port_stem = PorterStemmer()

In [59]:
def stemming(content):
    stemmed_content = re.sub("[^a-zA-Z]", " ", content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stop_words]
    stemmed_content = " ".join(stemmed_content)
    return stemmed_content

In [61]:
news_dataset["title"] = news_dataset["title"].apply(stemming)

In [67]:
news_dataset.tail(5)

Unnamed: 0,id,title,text,label
72129,72129,russian steal research trump hack u democrat p...,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,72130,watch giuliani demand democrat apolog trump ra...,"You know, because in fantasyland Republicans n...",1
72131,72131,migrant refus leav train refuge camp hungari,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,72132,trump tussl give unpopular mexican leader much...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0
72133,72133,goldman sach endors hillari clinton presid,Goldman Sachs Endorses Hillary Clinton For Pre...,1


In [69]:
print(news_dataset["title"])

0        law enforc high alert follow threat cop white ...
1                                                         
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131         migrant refus leav train refuge camp hungari
72132    trump tussl give unpopular mexican leader much...
72133           goldman sach endors hillari clinton presid
Name: title, Length: 72134, dtype: object


In [81]:
# seperating the data and label
X = news_dataset["title"].values # gives you a NumPy array of the titles
Y = news_dataset["label"].values # gives you a NumPy array of the titles

In [83]:
print(X)

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'
 ''
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'
 ... 'migrant refus leav train refuge camp hungari'
 'trump tussl give unpopular mexican leader much need shot arm'
 'goldman sach endors hillari clinton presid']


In [85]:
print(Y)

[1 1 1 ... 0 0 1]


In [87]:
Y.shape

(72134,)

In [89]:
# converting the textual data into numerical data
# Term frequency Inverse document frequency (TFIDF) is a statistical formula to convert text documents into vectors based on the relevancy of the word.
vectorizer = TfidfVectorizer()

vectorizer.fit(X)
X = vectorizer.transform(X)

In [91]:
print(X)

  (0, 407)	0.3190180925014663
  (0, 1802)	0.33473541566384035
  (0, 3679)	0.24871262252022117
  (0, 5509)	0.31820565801047196
  (0, 6425)	0.28932771754845743
  (0, 6730)	0.48553136502134386
  (0, 7887)	0.26746434949988324
  (0, 9699)	0.22829788917209384
  (0, 17260)	0.24871262252022117
  (0, 17363)	0.2542650376115143
  (0, 18648)	0.1297506867782943
  (0, 19106)	0.19134939529376566
  (2, 1049)	0.28404017886581956
  (2, 2673)	0.30809679188606154
  (2, 2919)	0.3639616996972358
  (2, 6880)	0.2652283770602196
  (2, 8020)	0.2692285294185893
  (2, 11864)	0.2231406266784195
  (2, 12011)	0.16878852994653004
  (2, 12744)	0.27904818164471595
  (2, 13591)	0.22687620695463123
  (2, 14591)	0.3580030298678158
  (2, 15094)	0.1609967301122813
  (2, 16446)	0.1999703023632961
  (2, 18034)	0.35962437110547785
  :	:
  (72130, 17778)	0.13227219506940732
  (72130, 18936)	0.25302499393443006
  (72131, 2566)	0.3967249021272091
  (72131, 8206)	0.46269177743112333
  (72131, 9752)	0.3384827653769501
  (72131, 109

### spliting the dataset into taining and test data

In [105]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

### Training the model: Logistic Regression

![image.png](attachment:6e2495b9-83bc-48bd-ba3a-1202781401cc.png)

In [112]:
model = LogisticRegression()

In [114]:
model.fit(X_train, Y_train)

### Evaluation
#### Accuracy Score

In [119]:
# accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print("Accuracy score of the training data:", training_data_accuracy)

Accuracy score of the training data: 0.9193858630668723


In [121]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print("Accuracy score of the test data:", test_data_accuracy)

Accuracy score of the test data: 0.900603035974215


### Making a Predictive System

In [198]:
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

if prediction[0] == 0:
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


In [200]:
print(Y_test[3])

0
