In [4]:
pip install nltk



**Importing Importing Libraries**

In [None]:
# Importing
import pandas as pd
import numpy as np

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot  # converting to one hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense  # since it is a classification problem need layers

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


**Reading Data**

In [4]:
df = pd.read_csv("/content/drive/MyDrive/data/train.csv")

In [5]:
df.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


**Feature Engineering And Data Processing**

In [6]:
df = df.dropna()

X = df.drop('label', axis=1)
y = df['label']

print(X.shape, y.shape)

voc_size = 5000

# One Hot Representation for title
message = X.copy()
message.reset_index(inplace=True)  # resetting because we've dropped the nan values


(18285, 4) (18285,)


In [9]:
nltk.download('stopwords')


ps = PorterStemmer()
corpus = []
for i in range(0, len(message)):
    review = re.sub('[^a-zA-Z]', ' ', message['title'][i])     # sub = substituting
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
# corpus

one_hot_repr = [one_hot(words, voc_size) for words in corpus]
# one_hot_repr


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[[4364, 134, 1678, 1250, 159, 629],
 [2950],
 [4044, 1219, 3171],
 [523, 858, 887],
 [162, 1747, 4095, 162],
 [2757, 2636, 2136, 4486],
 [1678, 1250, 1219],
 [3376, 162, 463, 4486, 1219],
 [1250, 162, 2753, 4095, 1250],
 [4486],
 [523, 1219],
 [1489, 1112, 463],
 [1330, 3423, 793, 2950, 463],
 [523, 3376, 463, 1219],
 [1293, 1219],
 [4095, 4037, 1219],
 [523, 2950],
 [628, 3376],
 [162],
 [26],
 [523, 2950, 1219],
 [463, 1219, 3376, 4095],
 [4095, 1219],
 [4095, 2763, 4616],
 [4486, 162, 1219],
 [353, 3376, 1219],
 [4095, 3171, 504],
 [858, 4030, 463, 1219],
 [1707, 3928, 793, 1219],
 [162, 162, 1219],
 [4095, 1250, 2213, 1219],
 [1330, 4044, 4364, 793, 523, 1219, 159],
 [789, 1250],
 [1250, 2950],
 [4095, 1567, 26, 3376, 162, 1591, 1219],
 [4486],
 [4486, 1219, 4486, 4590, 804, 1678],
 [1195, 463, 589, 1219],
 [4095, 3785, 1250],
 [1219, 4486, 4037, 1219],
 [2950, 463, 1250],
 [1929, 1219],
 [2950, 1219, 1250, 3634, 2713],
 [1567],
 [2314, 3493, 4486, 1195],
 [1219, 3634, 2314],
 [409

**Pre - Padding to make equal length of sentence**

In [10]:
# padding
sent_length = 20
embedded_docs = pad_sequences(one_hot_repr, padding='pre', maxlen=sent_length)
embedded_docs

array([[   0,    0,    0, ..., 1250,  159,  629],
       [   0,    0,    0, ...,    0,    0, 2950],
       [   0,    0,    0, ..., 4044, 1219, 3171],
       ...,
       [   0,    0,    0, ..., 1195, 1250, 1219],
       [   0,    0,    0, ...,    0, 4095,  523],
       [   0,    0,    0, ...,    0, 2763, 1219]], dtype=int32)

In [11]:
X_final = np.array(embedded_docs)
y_final = np.array(y)
X_final.shape, y_final.shape

((18285, 20), (18285,))

**Splitting Data into Train and Test dataset**

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

**Creating Model Without dropout layer**

In [14]:
# Creating Model
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


**Model Training**

In [15]:
# Model training
model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f950aabe690>

In [16]:
# Performance  measurement
y_pred = model.predict_classes(X_test)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))




[[2441  978]
 [ 523 2093]]
0.7512841756420878


**Adding dropout Layer to check difference with and without dropout layer**

In [17]:
# hyperparameter tuning
# Adding dropout
from tensorflow.keras.layers import Dropout

embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None
