# Importing Libraries

In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

Tensorflow Version used

In [2]:
tf.__version__

'2.8.0'

# Loading Dataset

In [3]:
df = pd.read_csv("train.csv")

Top 5 rows of Dataset

In [79]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


Dataset shape

In [4]:
df.shape

(20800, 5)

Count of Null values in Dataset

In [5]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

Droping Null values

In [6]:
df = df.dropna()

Get Independent Feature

In [7]:
x = df.drop('label', axis = 1)

In [8]:
x.shape

(18285, 4)

Get Dependent Features

In [10]:
y = df['label']

In [11]:
y.shape

(18285,)

Define Vocabulary Size

In [12]:
voc_size = 5000

# Data Preprocessing

In [13]:
messages = x.copy()

In [14]:
messages.reset_index(inplace=True)

Removing Stopping Words

In [15]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
  review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [16]:
corpus[0]

'hous dem aid even see comey letter jason chaffetz tweet'

OneHot Representation of Dataset

In [17]:
onehot_repr = [one_hot(words, voc_size) for words in corpus]
onehot_repr[0]

[2222, 360, 4580, 2998, 2378, 646, 395, 4117, 310, 4078]

Embedding Representation

In [18]:
sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding= "pre", maxlen = sent_length)
print(embedded_docs)

[[   0    0    0 ... 4117  310 4078]
 [   0    0    0 ... 3531 4956  210]
 [   0    0    0 ...  345 3722  823]
 ...
 [   0    0    0 ... 3429 3322 4160]
 [   0    0    0 ... 3284 2778 3120]
 [   0    0    0 ... 3307 4590 1167]]


In [19]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 2222,
        360, 4580, 2998, 2378,  646,  395, 4117,  310, 4078])

In [20]:
len(embedded_docs)

18285

# Create Model

In [21]:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length = sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation = "sigmoid"))
model.compile(loss = 'binary_crossentropy', optimizer = "adam", metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
len(embedded_docs), y.shape

(18285, (18285,))

In [23]:
x_final = np.array(embedded_docs)
y_final = np.array(y)

In [24]:
x_final.shape, y_final.shape

((18285, 20), (18285,))

Train Test Split

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size = 0.33, random_state = 42)

# Model Training

In [26]:
model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x24910762fd0>

# Performance Metrics and Accuracy

In [27]:
y_pred = (model.predict(x_test) > 0.5).astype("int32")

In [28]:
confusion_matrix(y_test, y_pred)

array([[3114,  305],
       [ 237, 2379]], dtype=int64)

In [29]:
accuracy_score(y_test, y_pred)

0.9101905550952776

# Create Model with Dropout Layer

In [30]:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length = sent_length))
model.add(Dropout(.3))
model.add(LSTM(100))
model.add(Dropout(.3))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [31]:
model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2491dcc9bb0>

In [32]:
y_pred = (model.predict(x_test) > 0.5).astype("int32")

# Performance Metrix and Accuracy

In [33]:
confusion_matrix(y_test, y_pred)

array([[3136,  283],
       [ 246, 2370]], dtype=int64)

In [34]:
accuracy_score(y_test, y_pred)

0.9123446561723281