In [2]:
import numpy as np
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Read the raw news dataset from 'Dataset.csv' (path is relative to the working directory).
news_df = pd.read_csv(filepath_or_buffer='Dataset.csv')

In [4]:
# Preview the first five rows to sanity-check the load.
news_df.head(n=5)

Unnamed: 0,id,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### About the Dataset:

id: unique id for a news article                                     
title: the title of a news article                                                                     
text: the text of the article; could be incomplete                                        
label: a label that marks whether the news article is real or fake:                              
    1: fake news                                                 
    0: real news                                        

# 1 Preprocessing 

In [5]:
# Count missing values per column before any cleaning.
news_df.isna().sum()

id         0
title    558
text      39
label      0
dtype: int64

In [6]:
# (rows, columns) of the loaded frame.
news_df.shape

(72134, 4)

In [7]:
# Replace every missing value with a single-space string.
news_df = news_df.fillna(value=' ')

In [8]:
# Re-count missing values to confirm the fill; every column should now report 0.
news_df.isna().sum()

id       0
title    0
text     0
label    0
dtype: int64

In [9]:
# The model input column 'content' is a copy of the headline only;
# the article 'text' column is not included.
news_df = news_df.assign(content=news_df['title'])

In [10]:
# Display the frame to confirm the new 'content' column is present.
news_df

Unnamed: 0,id,title,text,label,content
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...
1,1,,Did they post their votes for Hillary already?,1,
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri..."
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...
...,...,...,...,...,...
72129,72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,Russians steal research on Trump in hack of U....
72130,72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,WATCH: Giuliani Demands That Democrats Apolog...
72131,72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,Migrants Refuse To Leave Train At Refugee Camp...
72132,72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,Trump tussle gives unpopular Mexican leader mu...


# separating the data & label

In [11]:
# Features: every column except the target.
X = news_df.drop(columns='label')
# Target column (1 = fake news, 0 = real news).
y = news_df['label']

In [12]:
# Plain-text dump of the feature frame built above.
print(X)

          id                                              title  \
0          0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1          1                                                      
2          2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3          3  Bobby Jindal, raised Hindu, uses story of Chri...   
4          4  SATAN 2: Russia unvelis an image of its terrif...   
...      ...                                                ...   
72129  72129  Russians steal research on Trump in hack of U....   
72130  72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131  72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132  72132  Trump tussle gives unpopular Mexican leader mu...   
72133  72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  \
0      No comment is expected from Barack Obama Membe...   
1         Did they post their votes for Hillary already?   
2       Now, mo

# Stemming:

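Stemming is the process of reducing a word to its root form (its stem) by stripping suffixes.

example: hanged, hanging, hangs → hang (an irregular form such as "hung" is left unchanged, because the Porter stemmer only applies suffix-stripping rules)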

# Steps:
replacing non-letter characters with spaces  
lower case                 
splitting                             
removing stopwords                              
stemming                                   
joining the stems back into a single string  

In [13]:
ps = PorterStemmer()

# Built once, up front. Previously stopwords.words('english') was evaluated
# inside the comprehension's filter, i.e. the word list was rebuilt and scanned
# linearly for every single token. A frozenset gives constant-time membership
# tests. (If the NLTK stopwords corpus is not installed, the resulting
# LookupError now surfaces here instead of on the first call to stemming().)
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Anything that is not an ASCII letter is treated as a separator.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise a piece of text into a space-joined string of word stems.

    Steps, in order: replace every non-ASCII-letter character with a space,
    lower-case the text, split on whitespace, drop English stopwords, and
    Porter-stem the remaining words.

    Parameters
    ----------
    content : str
        Raw text, e.g. a news headline.

    Returns
    -------
    str
        The stems separated by single spaces; an empty string when no word
        survives the filtering.
    """
    words = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(
        ps.stem(word) for word in words if word not in _ENGLISH_STOPWORDS
    )

In [None]:
# Replace each raw headline in 'content' with its stemmed, stopword-free form.
news_df['content'] = news_df['content'].map(stemming)

In [None]:
# Inspect the 'content' column after stemming.
news_df['content']

# separating the data and label


In [None]:
# Pull the model inputs (stemmed text) and targets (labels) out of the frame;
# this re-binds the X and y names used earlier in the notebook.
X, y = news_df['content'].values, news_df['label'].values

# converting the textual data to numerical data

In [None]:
# Learn the TF-IDF vocabulary and weights from the text and encode it in one step.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so test-set vocabulary/IDF statistics leak into the
# features. Consider splitting first and fitting on the training part only.
vector = TfidfVectorizer()
X = vector.fit_transform(X)

In [None]:
# Show the TF-IDF encoded features produced by the vectorizer.
print(X)

# Splitting the dataset to training & test data

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [None]:
# Shape of the training portion of the features.
X_train.shape

# Training the Model: Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

In [None]:
# on training set
train_y_pred = model.predict(X_train)
print(accuracy_score(train_y_pred,Y_train))

In [None]:
# on testing set
testing_y_pred = model.predict(X_test)
print(accuracy_score(testing_y_pred,Y_test))

# Detection System

In [None]:
# Take one held-out sample (index 10 of the test features) and classify it.
input_data = X_test[10]
prediction = model.predict(input_data)

In [None]:
# A predicted label of 0 is reported as real; anything else is reported as fake.
print('The News Is Real' if prediction[0] == 0 else 'The News is Fake')

In [None]:
# Stemmed headline of the row labelled 2 in the full frame.
# NOTE(review): this is not the sample classified above (that was X_test[10]).
news_df.loc[2, 'content']