# End-to-end project on LSTM :
## ===========        Fake news classification  ===========
* We have downloaded the dataset from kaggle.
* Dataset consists of 2 different csv files.Each having 10000  rows.
* Each file having 4 columns i.e 'Title' , "Subject" , " text" and  "Date".

In [10]:
# import pandas and numpy libraries
import pandas as pd
import numpy as np
import warnings


In [11]:
# Data Ingestion
df1 = pd.read_csv('real news.xls')
df2 = pd.read_csv("fake news.xls")

In [12]:
df1.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [13]:
df2.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [14]:
# Checking for missing values
print(df1.isnull().sum(),'\n')
print(df2.isnull().sum())

title      0
text       0
subject    0
date       0
dtype: int64 

title      0
text       0
subject    0
date       0
dtype: int64


In [15]:
# first of all we need to merge both of them.
## but we need to add a seperate column for indicating whether it belongs to real or fake news

df1['fake/real'] = [1 for i in df1.index]        # df1 corressponds to true news
df2['fake/real'] = [0 for i in df1.index]        # df2 corressponds to fake news

In [16]:
# Now we can concat together
df = pd.concat([df1,df2],axis = 0,ignore_index=True)

In [17]:
# Now we are essentially shuffling the dataset
df = df.sample(frac=1)
df.reset_index(inplace=True,drop=True)

In [18]:
df.head()

Unnamed: 0,title,text,subject,date,fake/real
0,"Thanks To Capitalism, It Just Got A Lot Harde...",Just like with every presidential inauguration...,News,"December 12, 2016",0
1,CNN’s Don Lemon: Trump Shouldn’t Get Apology ...,If Donald Trump really wants an apology from E...,News,"September 18, 2017",0
2,"On His Way Out The Door, Harry Reid Gives Big...",Assuming Hillary Clinton wins Tuesday s electi...,News,"November 6, 2016",0
3,"Meeting Israel's Netanyahu, Trump backs away f...",WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,"February 15, 2017",1
4,Colbert’s Rundown Of The ‘Harriet Tubman On T...,Stephen Colbert hit the nail right on the head...,News,"April 21, 2016",0


In [76]:
# looking if our above operation works well
print(df['fake/real'] .nunique())
print(df['title'] .nunique())

2
19889


# Data pre-processing

In [19]:
 # step 1. Converting text into int or vector format
         # removing punctuations and stopwords using nltk lib.
         # stemming or lemmatization

In [20]:
# declaring dependent and independent variables here only, as it will be easier to work on it then
X = df['title']                     # Although we can include text column also but for now we use title column 
y = df['fake/real']                  

In [21]:
# importing nltk and other req. libraries for preprocessing
import nltk
from nltk.corpus import stopwords

##### for regex
import re                              

from nltk.stem import PorterStemmer        ###### for stemming words
p_stem = PorterStemmer()

######stop_words in english
stop_words = stopwords.words('english')

In [22]:
# step1.
def text_preprocessing(sent):
    
    new_sent = re.sub('[^A-Za-z]'," ",sent).strip()
    new_sent.lower()
    words = new_sent.split()
    
    process_words = [p_stem.stem(word) for word in words if word not in stop_words]
    processed_sentence = " ".join(process_words)
    return processed_sentence
### applying it on X (independent variable)    
X = df['title'].apply(text_preprocessing)   
X.head()

0    thank to capit it just got a lot harder for tr...
1    cnn don lemon trump shouldn get apolog from es...
2    on hi way out the door harri reid give big f c...
3    meet israel netanyahu trump back away commit p...
4    colbert rundown of the harriet tubman on the n...
Name: title, dtype: object

In [23]:
# step 2. Tokenization using One hot encoding 
# step 3. padding as per the requirement
from tensorflow.keras.preprocessing.text import one_hot
vocab_size = 5000
X_onehot_repr = [one_hot(sentence,n=vocab_size) for sentence in X]

In [24]:
# padding the one hot rep.
from tensorflow.keras.preprocessing.sequence import pad_sequences
embedded_docs = pad_sequences(X_onehot_repr, padding = 'post')

In [25]:
print(len(embedded_docs[0]))
print(len(embedded_docs[7]))

36
36


In [26]:
# we now prepare our final feature set and target set
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [27]:
X_final.shape, y_final.shape

((20000, 36), (20000,))

## Model training

In [28]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.35, random_state = 42)


In [29]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13000, 36), (7000, 36), (13000,), (7000,))

In [30]:
# importing req. libraries from TensorFlow
from tensorflow import keras
from keras.layers import LSTM,Embedding,Dense
from keras.models import Sequential

In [34]:
# Model Architecture
model = Sequential()
model.add(Embedding(input_dim = vocab_size,output_dim=50,input_length=36))
model.add(LSTM(100))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 36, 50)            250000    
                                                                 
 lstm_1 (LSTM)               (None, 100)               60400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 310,501
Trainable params: 310,501
Non-trainable params: 0
_________________________________________________________________


In [35]:
# model compile
model.compile(metrics = ['acc'], optimizer = 'adam', loss = 'binary_crossentropy')

In [36]:
# model fitting
model.fit(X_train, y_train, epochs = 5, validation_data = (X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1d596306500>

In [37]:
# now we will predict for our test data , Remember it will give probablities only.
y_pred = model.predict(X_test)




In [59]:
y_pred[0]

array([0.00016573], dtype=float32)

In [77]:
# Converting probablities into meaningful result so that we can compare with the actual values.
predictions = []
for i in y_pred:
    if i <0.5 :
        predictions.append(0)
    else:
        predictions.append(1)
        
        
y_predicted = np.array(predictions)              # converting this into an array

### Model Evaluation

In [71]:
# now we can find out accuracy and other evaluation metrices
from sklearn.metrics import classification_report,accuracy_score
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3481
           1       0.99      0.98      0.98      3519

    accuracy                           0.98      7000
   macro avg       0.98      0.98      0.98      7000
weighted avg       0.98      0.98      0.98      7000

