#### Problem Statement :

In [1]:
# We need to detect whether a news is real or fake, and since this is a binary classification problem we'll use Logistic regression

#### Workflow :

In [2]:
# -------------------------------
#           WORK FLOW
# -------------------------------
#
# 1. News Data
#    - Input dataset containing news.
#
# 2. Data Pre-processing
#    - Clean and prepare the data for model training.
#
# 3. Train-Test Split
#    - Split the dataset into training and testing sets.
#
# 4. Logistic Regression Model
#    - Build and train a Logistic Regression model on the training data.
#
# 5. Trained Logistic Regression Model
#    - Evaluate the model on the test data and use it for predictions.
#
# -------------------------------


#### Importing dependencies/Libraries :

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\RAJVIR
[nltk_data]     THAKUR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import pandas as pd
import numpy as np
import re  # Reguar expression library
from nltk.corpus import stopwords  # stopwords are the words that doesn't add much value to context of para eg.. a,the,an etc
from nltk.stem.porter import PorterStemmer  # Gives us the root word after stemming
from sklearn.feature_extraction.text import TfidfVectorizer  # Convert text to feature vectors(Numerical numbers) because computer understands only no.s
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

#### Data collection and pre processing :

In [6]:
news_dataset = pd.read_csv('Merged_True_Fake.csv')

In [7]:
news_dataset.head()

Unnamed: 0,title,text,subject,date,T_or_F
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",T
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",T
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",T
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",T
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",T


In [8]:
news_dataset.shape

(22448, 5)

In [9]:
news_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
T_or_F     0
dtype: int64

In [10]:
# Creating a new column by name content and combining title and subject and using it to train the model
# This is because training the model using text column which is very big will cost us a lot on memory and time

news_dataset['content'] = news_dataset['subject'] + ' ' + news_dataset['title']

In [11]:
news_dataset.head()

Unnamed: 0,title,text,subject,date,T_or_F,content
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",T,"politicsNews As U.S. budget fight looms, Repub..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",T,politicsNews U.S. military to accept transgend...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",T,politicsNews Senior U.S. Republican senator: '...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",T,politicsNews FBI Russia probe helped by Austra...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",T,politicsNews Trump wants Postal Service to cha...


In [12]:
print(news_dataset.loc[(0, ['content'])])

content    politicsNews As U.S. budget fight looms, Repub...
Name: 0, dtype: object


#### Breaking dataset into data and labels :

In [13]:
X = news_dataset.drop('T_or_F', axis = 1)
Y = news_dataset['T_or_F']

print(X, '\n\n-------------', Y)

                                                   title  \
0      As U.S. budget fight looms, Republicans flip t...   
1      U.S. military to accept transgender recruits o...   
2      Senior U.S. Republican senator: 'Let Mr. Muell...   
3      FBI Russia probe helped by Australian diplomat...   
4      Trump wants Postal Service to charge 'much mor...   
...                                                  ...   
22443  HOW LOS ANGELES COUNTY Helps Illegal Aliens Ge...   
22444  RADICAL DIRECTOR OF SIERRA CLUB: Abortion is t...   
22445  MAXINE WATERS TELLS THE “GREATEST DESIRE” FOR ...   
22446  MMA FIGHTER JAKE SHIELDS Embarrasses Cowards I...   
22447  LOVIN’ IT! ARIZONA DUNKIN’ DONUTS Stirs It Up ...   

                                                    text       subject  \
0      WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1      WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2      WASHINGTON (Reuters) - The special counsel inv... 

#### Stemming :

Stemming is reducing a word to its root word, this is done to save space, memory because more the memory more time and processing it would take for
training

eg.. :
actor, actress, acting -> act

In [14]:
port_stem = PorterStemmer()

In [15]:
def stemming(content):
    stemmed_content = re.sub('^a-zA-Z', ' ', content)
    # This will repace everything which is not a lowercase and uppercase letter with a space

    stemmed_content = stemmed_content.lower()
    # This will convert the stemmed_content to lowercase

    stemmed_content = stemmed_content.split()
    # This will split the string into list of individual words separated by space

    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    # This will make a new list of words after removing all the stopwords

    stemmed_content = ' '.join(stemmed_content)
    # This joins the words in the list back into a single string, with each word separated by a space

    return stemmed_content

In [16]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [17]:
print(news_dataset['content'])

0        politicsnew u.s. budget fight looms, republica...
1        politicsnew u.s. militari accept transgend rec...
2        politicsnew senior u.s. republican senator: 'l...
3        politicsnew fbi russia probe help australian d...
4        politicsnew trump want postal servic charg 'mu...
                               ...                        
22443    polit lo angel counti help illeg alien get fre...
22444    polit radic director sierra club: abort key “s...
22445    polit maxin water tell “greatest desire” nasti...
22446    polit mma fighter jake shield embarrass coward...
22447    polit lovin’ it! arizona dunkin’ donut stir tr...
Name: content, Length: 22448, dtype: object


In [18]:
X = news_dataset['content'].values
Y = news_dataset['T_or_F'].values

print(X, '\n\n-------------\n', Y)

['politicsnew u.s. budget fight looms, republican flip fiscal script'
 'politicsnew u.s. militari accept transgend recruit monday: pentagon'
 "politicsnew senior u.s. republican senator: 'let mr. mueller job'" ...
 'polit maxin water tell “greatest desire” nasti dems: “to lead trump right impeachment” [video]'
 'polit mma fighter jake shield embarrass coward mask violent 20-on-1 beat trump support [video]: “i berkeley watch man get beat mob polic help…i person jump help”'
 'polit lovin’ it! arizona dunkin’ donut stir trump wall donut [video]'] 

-------------
 ['T' 'T' 'T' ... 'F' 'F' 'F']


#### Vectorization :

In [19]:
# Converting textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 226316 stored elements and shape (22448, 12831)>
  Coords	Values
  (0, 1729)	0.29596258270092796
  (0, 4445)	0.28966371761402937
  (0, 4506)	0.4154502363517113
  (0, 4560)	0.39991061993161703
  (0, 6885)	0.4872334851225801
  (0, 8672)	0.09010672891075222
  (0, 9496)	0.19022335426531167
  (0, 10064)	0.46289638299796143
  (1, 310)	0.39820389928370425
  (1, 7361)	0.3520254451744846
  (1, 7509)	0.4525468371889061
  (1, 8444)	0.37744486205238253
  (1, 8672)	0.09933257988899553
  (1, 9306)	0.4744089500874905
  (1, 11659)	0.36785549262586315
  (2, 6276)	0.3164792671724377
  (2, 6707)	0.3540352852541786
  (2, 7613)	0.5037432586280913
  (2, 7622)	0.36550806505317945
  (2, 8672)	0.09805792162783061
  (2, 9496)	0.20700903239763688
  (2, 10143)	0.4395362561365352
  (2, 10147)	0.37614659119301364
  (3, 935)	0.41252426834934613
  (3, 3394)	0.34317531712182
  :	:
  (22446, 6332)	0.22656693110672232
  (22446, 7037)	0.15334989982992703
  (22

#### Breaking our data into Training and Testing data :

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

In [21]:
print(X.shape, X_train.shape, X_test.shape)

(22448, 12831) (17958, 12831) (4490, 12831)


#### Training the model :

In [22]:
model = LogisticRegression()

In [23]:
model.fit(X_train, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


#### Evaluation :

In [24]:
# Calculating the accuracy of training data :
training_data_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(training_data_prediction, Y_train)

print("Accuracy for training data is : ", training_data_accuracy)

Accuracy for training data is :  0.9999443145116382


In [25]:
# Calculating the accuracy of testing data :
testing_data_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(testing_data_prediction, Y_test)

print("Accuracy for testing data is : ", testing_data_accuracy)

Accuracy for testing data is :  0.999554565701559


#### Making a predictive system :

In [30]:
input = X_test[0]

prediction = model.predict(input)

if( prediction[0] == 'F' ):
    print("The news is fake")
else :    
    print("The news is real")

The news is fake


## End