# • DOMAIN: Digital content and entertainment industry
# • CONTEXT: The objective of this project is to build a text classification model that analyses the customer's sentiments based on their reviews in the IMDB database. The model uses a complex deep learning model to build an embedding layer followed by a classification algorithm to analyse the sentiment of the customers.
# • DATA DESCRIPTION: The Dataset of 50,000 movie reviews from IMDB, labelled by sentiment (positive/negative). Reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers). For convenience, the words are indexed by their frequency in the dataset, meaning the for that has index 1 is the most frequent word. Use the first 20 words from each review to speed up training, using a max vocabulary size of 10,000. As a convention, "0" does not stand for a specific word, but instead is used to encode any unknown word.
# • PROJECT OBJECTIVE: To Build a sequential NLP classifier which can use input text parameters to determine the customer sentiments.

# Steps and tasks:
# 1. Import and analyse the data set.

In [3]:
# Import all the necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

from keras.datasets import imdb
from keras.models import Sequential
from keras import layers,models
from keras.layers import Dense, Input
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,Flatten

import tensorflow as tf

from wordcloud import WordCloud,STOPWORDS
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata

from textblob import TextBlob
from textblob import Word

import warnings
warnings.filterwarnings('ignore')

In [58]:
# Import and load IMDB data set

from keras.datasets import imdb

(X_TRAIN, Y_TRAIN), (X_TEST, Y_TEST) = imdb.load_data(num_words = 10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [59]:
print('SHAPE OF X_TRAIN: ',X_TRAIN.shape)

print('SHAPE OF Y_TRAIN: ',Y_TRAIN.shape)

print('SHAPE OF X_TEST: ',X_TEST.shape)

print('SHAPE OF Y_TEST: ',Y_TEST.shape)

SHAPE OF X_TRAIN:  (25000,)
SHAPE OF Y_TRAIN:  (25000,)
SHAPE OF X_TEST:  (25000,)
SHAPE OF Y_TEST:  (25000,)


In [60]:
FEATURES = np.concatenate((X_TRAIN, X_TEST), axis = 0)

LABELS = np.concatenate((Y_TRAIN, Y_TEST), axis = 0)

In [61]:
print("COUNT OF UNIQUE WORDS:", len(np.unique(np.hstack(FEATURES))))

COUNT OF UNIQUE WORDS: 9998


In [62]:
print("LABELS:", np.unique(LABELS))

LABELS: [0 1]


# 2. Perform relevant sequence adding on the data.

In [63]:
def VECTORIZE(SEQUENCES, DIMENSION = 10000):
    RESULTS = np.zeros((len(SEQUENCES), DIMENSION))
    for i, sequence in enumerate(SEQUENCES):
        RESULTS[i, sequence] = 1
    return RESULTS

In [64]:
FEATURES_COPY = FEATURES.copy()
LABELS_COPY = LABELS.copy()

In [65]:
len(FEATURES[100])

158

# 3. Perform following data analysis:

    • Print shape of features and labels
    • Print value of any one feature and it's label

In [66]:
print('SHAPE OF FEATURES: ',FEATURES.shape)

print('\n\nSHAPE OF LABELS: ',LABELS.shape)

SHAPE OF FEATURES:  (50000,)


SHAPE OF LABELS:  (50000,)


In [67]:
print('FEATURE VALUE IN DATA[100]:\n',FEATURES[100])

FEATURE VALUE IN DATA[100]:
 [1, 13, 244, 6, 87, 337, 7, 628, 2219, 5, 28, 285, 15, 240, 93, 23, 288, 549, 18, 1455, 673, 4, 241, 534, 3635, 8448, 20, 38, 54, 13, 258, 46, 44, 14, 13, 1241, 7258, 12, 5, 5, 51, 9, 14, 45, 6, 762, 7, 2, 1309, 328, 5, 428, 2473, 15, 26, 1292, 5, 3939, 6728, 5, 1960, 279, 13, 92, 124, 803, 52, 21, 279, 14, 9, 43, 6, 762, 7, 595, 15, 16, 2, 23, 4, 1071, 467, 4, 403, 7, 628, 2219, 8, 97, 6, 171, 3596, 99, 387, 72, 97, 12, 788, 15, 13, 161, 459, 44, 4, 3939, 1101, 173, 21, 69, 8, 401, 2, 4, 481, 88, 61, 4731, 238, 28, 32, 11, 32, 14, 9, 6, 545, 1332, 766, 5, 203, 73, 28, 43, 77, 317, 11, 4, 2, 953, 270, 17, 6, 3616, 13, 545, 386, 25, 92, 1142, 129, 278, 23, 14, 241, 46, 7, 158]


In [68]:
print('LABEL VALUE IN LABELS[100]: ',LABELS[100])

LABEL VALUE IN LABELS[100]:  0


# # 4. Decode the feature value to get original sentence

In [69]:
WORD_INDEX = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [70]:
REVERSE_WORD_MAP = dict(map(reversed, WORD_INDEX.items()))

def SEQUENCE_TO_TEXT(INDEX):
    WORDS = [REVERSE_WORD_MAP.get(letter) for letter in INDEX]
    return(WORDS)

In [71]:
VALUE_OF_FEATURE = SEQUENCE_TO_TEXT(FEATURES[100])

print(VALUE_OF_FEATURE)

['the', 'was', 'rather', 'is', 'him', 'completely', 'br', 'english', 'send', 'to', 'one', 'dvd', 'for', 'kind', 'way', 'are', 'year', 'type', 'but', 'tired', 'talent', 'of', 'am', 'stories', 'slightest', 'coop', 'on', 'her', 'no', 'was', 'although', 'some', 'has', 'as', 'was', 'garbage', "che's", 'that', 'to', 'to', 'when', 'it', 'as', 'if', 'is', 'herself', 'br', 'and', 'door', 'simply', 'to', 'picture', '25', 'for', 'he', 'silent', 'to', 'holy', 'dramatically', 'to', 'bigger', 'reason', 'was', 'then', 'does', 'sorry', 'very', 'not', 'reason', 'as', 'it', 'out', 'is', 'herself', 'br', "film's", 'for', 'with', 'and', 'are', 'of', 'tension', '4', 'of', 'human', 'br', 'english', 'send', 'in', 'could', 'is', 'again', 'outrageous', 'movies', 'episode', 'we', 'could', 'that', 'elements', 'for', 'was', 'nothing', 'laugh', 'has', 'of', 'holy', 'laughing', 'lot', 'not', 'me', 'in', 'perfect', 'and', 'of', 'totally', 'most', 'only', 'dreary', '2', 'one', 'an', 'this', 'an', 'as', 'it', 'is', 'f

In [72]:
WORD_IDX = imdb.get_word_index()

WORDS_DICT = dict([(value,key) for (key,value) in WORD_IDX.items()])

In [73]:
print(" ".join([WORDS_DICT.get(n-3, '#') for n in FEATURES[100]]))

# i am a great fan of david lynch and have everything that he's made on dvd except for hotel room the 2 hour twin peaks movie so when i found out about this i immediately grabbed it and and what is this it's a bunch of # drawn black and white cartoons that are loud and foul mouthed and unfunny maybe i don't know what's good but maybe this is just a bunch of crap that was # on the public under the name of david lynch to make a few bucks too let me make it clear that i didn't care about the foul language part but had to keep # the sound because my neighbors might have all in all this is a highly disappointing release and may well have just been left in the # box set as a curiosity i highly recommend you don't spend your money on this 2 out of 10


In [74]:
FEATURES = VECTORIZE(FEATURES)
LABELS = np.array(LABELS).astype("float32")

# 5. Design, train, tune and test a sequential model.

In [75]:
X_TRAIN, X_T, Y_TRAIN, Y_T = train_test_split(FEATURES, LABELS, test_size = 0.5, random_state = 100)

X_TEST, X_VAL, Y_TEST, Y_VAL = train_test_split(X_T, Y_T, test_size = 0.5, random_state = 100)

In [76]:
X_TEST.shape

(12500, 10000)

In [77]:
SEQ_MODEL = models.Sequential()

SEQ_MODEL.add(layers.Dense(50, activation = "relu", input_shape=(10000, )))

SEQ_MODEL.add(layers.Dropout(0.3, noise_shape=None, seed=None))
SEQ_MODEL.add(layers.Dense(50, activation = "relu"))
SEQ_MODEL.add(layers.Dropout(0.2, noise_shape=None, seed=None))
SEQ_MODEL.add(layers.Dense(50, activation = "relu"))

SEQ_MODEL.add(layers.Dense(1, activation = "sigmoid"))
SEQ_MODEL.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                500050    
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Total params: 505,201
Trainable params: 505,201
Non-trainable params: 0
________________________________________________

In [78]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [79]:
SEQ_MODEL.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

In [80]:
MODEL_RESULTS = SEQ_MODEL.fit(X_TRAIN, Y_TRAIN, epochs = 100, batch_size = 50, 
                              validation_data = (X_VAL, Y_VAL), callbacks = [callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


In [81]:
ACCURACY = SEQ_MODEL.evaluate(X_TEST, Y_TEST,batch_size = 500)

print("MODEL_ACCURACY: ", ACCURACY)

MODEL_ACCURACY:  [0.8100147843360901, 0.8791199922561646]


In [82]:
PREDICTIONS = SEQ_MODEL.predict(X_TEST)

In [83]:

print('VALIDATION SET ACCURACY: ',np.mean(MODEL_RESULTS.history["val_accuracy"]))



print('\nTRAINING SET ACCURACY: ',np.mean(MODEL_RESULTS.history["accuracy"]))

VALIDATION SET ACCURACY:  0.8806541246526381

TRAINING SET ACCURACY:  0.9836482300477869


    * OUR MODEL IS ALREADY RUNNING AT MEAN ACCURACY OF 97.9% FOR TRAINING DATA AND 
      88% OF MEAN ACCURACY FOR VALIDATION DATA SET WITH BASE PARAMETERS

# 6. Use the designed model to print the prediction on any one sample.

In [84]:
PREDICTIONS = np.round(PREDICTIONS, 0)

In [85]:
PREDICTIONS = PREDICTIONS.ravel()

In [86]:
PREDICTIONS.shape

(12500,)

In [87]:
PREDICTIONS = PREDICTIONS.astype('int64')

In [88]:
PREDICTIONS.ravel()

PREDICTIONS

array([1, 1, 0, ..., 1, 0, 0])

In [89]:
PREDICTIONS[100]

1

In [90]:
print(classification_report(Y_TEST, PREDICTIONS, target_names=['POSITIVE SENTIMENTS','NEGATIVE SENTIMENTS']))

                     precision    recall  f1-score   support

POSITIVE SENTIMENTS       0.90      0.86      0.88      6263
NEGATIVE SENTIMENTS       0.86      0.90      0.88      6237

           accuracy                           0.88     12500
          macro avg       0.88      0.88      0.88     12500
       weighted avg       0.88      0.88      0.88     12500



In [91]:
SEQUENCE_TO_TEXT(X_TEST[100])

[None,
 'the',
 'the',
 None,
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 None,
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 None,
 'the',
 'the',
 'the',
 'the',
 None,
 'the',
 None,
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 'the',
 None,
 None,
 'the',
 None,
 None,
 None,
 None,
 'the',
 None,
 None,
 None,
 None,
 'the',
 'the',
 None,
 'the',
 None,
 None,
 None,
 None,
 'the',
 None,
 None,
 'the',
 'the',
 'the',
 'the',
 None,
 'the',
 None,
 None,
 None,
 None,
 'the',
 'the',
 None,
 None,
 'the',
 None,
 None,
 None,
 None,
 None,
 None,
 'the',
 'the',
 None,
 None,
 None,
 'the',
 'the',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'the',
 'the',
 None,
 None,
 None,
 None,
 None,
 None,
 'the',
 None,
 None,
 'the',
 None,
 None,
 'the',
 'the',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'the',
 None,
 None,


In [92]:
Y_TEST[100]

1.0

In [93]:
PREDICTIONS[100]

1

    * OUR MODEL HAS CORRECTLY PREDICTED THE SENTIMENT FOR 1OOTH TEST SAMPLE WHICH IS POSITIVE.

In [96]:
SEQ_MODEL.compile(optimizer = tf.keras.optimizers.Adam(lr = 0.0045), loss = "binary_crossentropy", metrics = ["accuracy"])

MODEL_RESULTS1 = SEQ_MODEL.fit(X_TRAIN, Y_TRAIN, epochs = 100, batch_size = 50, 
                              validation_data = (X_VAL, Y_VAL), callbacks = [callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


In [97]:
ACCURACY = round((SEQ_MODEL.evaluate(X_TEST, Y_TEST)[1])*100,2)

print('ACCURACY ON TESTING DATA: ',ACCURACY)

ACCURACY ON TESTING DATA:  87.81


In [98]:
PREDICTIONS = SEQ_MODEL.predict(X_TEST)

PREDICTIONS = np.round(PREDICTIONS, 0)

PREDICTIONS = PREDICTIONS.ravel()

PREDICTIONS = PREDICTIONS.astype('int64')

PREDICTIONS.ravel()

PREDICTIONS

array([1, 1, 0, ..., 1, 0, 0])

In [99]:
Y_TEST[56]

0.0

In [100]:
PREDICTIONS[56]

0

# • DOMAIN: Social media analytics
# • CONTEXT: Past studies in Sarcasm Detection mostly make use of Twitter datasets collected using hashtag based supervision but such datasets are noisy in terms of labels and language. Furthermore, many tweets are replies to other tweets and detecting sarcasm in these requires the availability of contextual tweets.In this hands-on project, the goal is to build a model to detect whether a sentence is sarcastic or not, using Bidirectional LSTMs.
# • DATA DESCRIPTION:
# The dataset is collected from two news websites, theonion.com and huffingtonpost.com.
# This new dataset has the following advantages over the existing Twitter datasets:
# Since news headlines are written by professionals in a formal manner, there are no spelling mistakes and informal usage. This reduces the sparsity and also increases the chance of finding pre-trained embeddings.
# Furthermore, since the sole purpose of TheOnion is to publish sarcastic news, we get high-quality labels with much less noise as compared to Twitter datasets.
# Unlike tweets that reply to other tweets, the news headlines obtained are self-contained. This would help us in teasing apart the real sarcastic elements
# Content: Each record consists of three attributes:
# is_sarcastic: 1 if the record is sarcastic otherwise 0
# headline: the headline of the news article
# article_link: link to the original news article. Useful in collecting supplementary data
# Reference: https://github.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection
# • PROJECT OBJECTIVE: Build a sequential NLP classifier which can use input text parameters to determine the customer sentiments.

# Steps and tasks:

# 1. Read and explore the data

In [4]:
import json

DATA = pd.read_json("../input/sarcasm-headlines-dataset/Sarcasm_Headlines_Dataset.json",lines = True)

In [5]:
DATA.columns

Index(['is_sarcastic', 'headline', 'article_link'], dtype='object')

In [6]:
DATA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [7]:
DATA.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [8]:
DATA.tail()

Unnamed: 0,is_sarcastic,headline,article_link
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...
28618,1,dad clarifies this not a food stop,https://www.theonion.com/dad-clarifies-this-no...


In [9]:
DATA.shape

(28619, 3)

In [10]:
import nltk
nltk.download('omw-1.4')

# CONVERT ALL THE UPPERCASE TEXT TO LOWER CASE
DATA["headline"] = DATA["headline"].apply(lambda x: " ".join(x.lower() for x in x.split()))

# DELETE ALL THE PUNCTUATION MARKS FROM THE TEXT
DATA["headline"] = DATA["headline"].str.replace('[^\w\s]','')

# DELETE NUMBERS
DATA["headline"] = DATA["headline"].str.replace('\d','')

# DELETE STOP WORDS
STOPWORDS = stopwords.words("english")
DATA["headline"] = DATA["headline"].apply(lambda x: " ".join(x for x in x.split() if x not in STOPWORDS))

# LEMMATIZATION OF THE TEXT DATA
DATA["headline"] = DATA["headline"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [11]:
DATA.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientist unveil doomsday cloc...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep totally nail congress falling short ge...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat veggie deliciously different recipe,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar getting work,https://local.theonion.com/inclement-weather-p...
4,1,mother come pretty close using word streaming ...,https://www.theonion.com/mother-comes-pretty-c...


# 2. Retain relevant columns

In [12]:
# LINK TO ARTICLES IS NOT IMPORTANT FOR ANALYSYS OF THE DATA

DATA = DATA.drop(columns = "article_link")

In [13]:
DATA.columns

Index(['is_sarcastic', 'headline'], dtype='object')

In [14]:
DATA.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientist unveil doomsday cloc...
1,0,dem rep totally nail congress falling short ge...
2,0,eat veggie deliciously different recipe
3,1,inclement weather prevents liar getting work
4,1,mother come pretty close using word streaming ...


In [15]:
DATA.tail()

Unnamed: 0,is_sarcastic,headline
28614,1,jew celebrate rosh hashasha something
28615,1,internal affair investigator disappointed cons...
28616,0,beautiful acceptance speech week came queer ko...
28617,1,mar probe destroyed orbiting spielberggates sp...
28618,1,dad clarifies food stop


# 3. Get length of each sentence

In [16]:
DATA['SENTENCE_LENGTH'] = DATA['headline'].apply(lambda x: len(x))

In [17]:
DATA.head()

Unnamed: 0,is_sarcastic,headline,SENTENCE_LENGTH
0,1,thirtysomething scientist unveil doomsday cloc...,57
1,0,dem rep totally nail congress falling short ge...,66
2,0,eat veggie deliciously different recipe,39
3,1,inclement weather prevents liar getting work,44
4,1,mother come pretty close using word streaming ...,55


In [18]:
DATA.tail()

Unnamed: 0,is_sarcastic,headline,SENTENCE_LENGTH
28614,1,jew celebrate rosh hashasha something,37
28615,1,internal affair investigator disappointed cons...,70
28616,0,beautiful acceptance speech week came queer ko...,50
28617,1,mar probe destroyed orbiting spielberggates sp...,56
28618,1,dad clarifies food stop,23


# 4. Define parameters

In [19]:
MAX_FEATURES = 10000
MAX_LEN = 25
EMBEDDING_SIZE = 200

# 5. Get indices for words

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer

TOKENIZER = Tokenizer()

TOKENIZER.fit_on_texts(DATA['headline'])

In [21]:
INDEX_DICT = TOKENIZER.word_index

# 6. Create features and labels

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

SEQUENCE = TOKENIZER.texts_to_sequences(DATA['headline'])

PAD_SEQ = pad_sequences(sequences = SEQUENCE, maxlen = MAX_LEN)

    * I AM SPLITTING THE DATA IN BELOW RATIO:
    
        * 60% OF DATA KEPT FOR TRAINING THE MODEL
        * 20% OF THE DATA KEPT FOR VALIDATING THE MODEL
        * 20% OF THE DATA FOR TESTING THE MODEL

In [26]:
FEATURES = DATA['headline']
LABELS = DATA['is_sarcastic']

In [27]:
X_TRAIN, X_T, Y_TRAIN, Y_T = train_test_split(PAD_SEQ, LABELS, test_size = 0.4, random_state = 100)

X_VAL, X_TEST, Y_VAL, Y_TEST = train_test_split(X_T, Y_T, test_size = 0.5, random_state = 100)

In [28]:
print('SHAPE OF X_TRAIN: ',X_TRAIN.shape)

print('\nSHAPE OF Y_TRAIN: ',Y_TRAIN.shape)

print('\nSHAPE OF X_TEST: ',X_TEST.shape)

print('\nSHAPE OF Y_TEST: ',Y_TEST.shape)

print('\nSHAPE OF X_VAL: ',X_VAL.shape)

print('\nSHAPE OF Y_VAL: ',Y_VAL.shape)

SHAPE OF X_TRAIN:  (17171, 25)

SHAPE OF Y_TRAIN:  (17171,)

SHAPE OF X_TEST:  (5724, 25)

SHAPE OF Y_TEST:  (5724,)

SHAPE OF X_VAL:  (5724, 25)

SHAPE OF Y_VAL:  (5724,)


# 7. Get vocabulary size

In [29]:
SIZE_OF_VOCAB = len(INDEX_DICT) + 1

SIZE_OF_VOCAB

25083

In [30]:
TOKENIZER.word_index

{'trump': 1,
 'new': 2,
 'man': 3,
 'woman': 4,
 'say': 5,
 'report': 6,
 'get': 7,
 'u': 8,
 'day': 9,
 'one': 10,
 'year': 11,
 'time': 12,
 'make': 13,
 'american': 14,
 'area': 15,
 'donald': 16,
 'like': 17,
 'life': 18,
 'first': 19,
 'people': 20,
 'nation': 21,
 'way': 22,
 'house': 23,
 'world': 24,
 'thing': 25,
 'show': 26,
 'white': 27,
 'still': 28,
 'find': 29,
 'obama': 30,
 'back': 31,
 'take': 32,
 'family': 33,
 'clinton': 34,
 'child': 35,
 'want': 36,
 'state': 37,
 'could': 38,
 'know': 39,
 'school': 40,
 'right': 41,
 'call': 42,
 'mom': 43,
 'study': 44,
 'need': 45,
 'black': 46,
 'gop': 47,
 'would': 48,
 'kid': 49,
 'go': 50,
 'bill': 51,
 'president': 52,
 'friend': 53,
 'look': 54,
 'yearold': 55,
 'love': 56,
 'plan': 57,
 'watch': 58,
 'really': 59,
 'death': 60,
 'home': 61,
 'parent': 62,
 'america': 63,
 'good': 64,
 'police': 65,
 'cant': 66,
 'best': 67,
 'video': 68,
 'star': 69,
 'going': 70,
 'work': 71,
 'war': 72,
 'student': 73,
 'last': 74,
 '

# 8. Create a weight matrix using GloVe embeddings

In [31]:
NUM_OF_WORDS = len(INDEX_DICT) + 1

NUM_OF_WORDS

25083

In [32]:
GLOVE_FILE = '../input/glove-6b/glove.6B.100d.txt'

EMBEDDINGS_INDEX = {}

FILE = open(GLOVE_FILE)

for i in FILE:
    VALUES = i.split()
    WORDS = VALUES[0]
    CO_EFFS = np.asarray(VALUES[1], dtype = 'float32')
    EMBEDDINGS_INDEX[WORDS] = CO_EFFS
    
FILE.close()

In [33]:
EMBEDDINGS_FILE = '../input/glove-6b/glove.6B.200d.txt'

EMBEDDINGS = {}

for i in open(EMBEDDINGS_FILE):
    WORDS = i.split(" ")[0]
    EMBEDS = i.split(" ")[1:]
    EMBEDS = np.asarray(EMBEDS, dtype = 'float32')
    EMBEDDINGS[WORDS] = EMBEDS
    
EMBEDDING_MATRIX = np.zeros((NUM_OF_WORDS,200))

for WORD, i in TOKENIZER.word_index.items():
    EMBEDDINGS_VECTOR = EMBEDDINGS.get(WORDS)
    if EMBEDDINGS_VECTOR is not None:
        EMBEDDING_MATRIX[i] = EMBEDDINGS_VECTOR

# 9. Define and compile a Bidirectional LSTM model.

In [34]:
LSTM_MODEL = Sequential()
LSTM_MODEL.add(Embedding(NUM_OF_WORDS, EMBEDDING_SIZE, weights=[EMBEDDING_MATRIX], input_length=MAX_LEN))
LSTM_MODEL.add(Bidirectional(LSTM(units=128 ,dropout = 0.5,return_sequences=True)))
LSTM_MODEL.add(Flatten())
LSTM_MODEL.add(Dense(1, activation='sigmoid'))
LSTM_MODEL.summary()

2022-10-09 06:03:59.328333: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 200)           5016600   
_________________________________________________________________
bidirectional (Bidirectional (None, 25, 256)           336896    
_________________________________________________________________
flatten (Flatten)            (None, 6400)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 6401      
Total params: 5,359,897
Trainable params: 5,359,897
Non-trainable params: 0
_________________________________________________________________


In [35]:
LSTM_MODEL.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

In [36]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [37]:
Y_TRAIN = np.array(Y_TRAIN)

Y_VAL = np.array(Y_VAL)

Y_TEST = np.array(Y_TEST)

In [38]:
X_TRAIN = np.array(X_TRAIN)

X_VAL = np.array(X_VAL)

X_TEST = np.array(X_TEST)

# 10. Fit the model and check the validation accuracy

In [39]:
LSTM_HISTORY = LSTM_MODEL.fit(X_TRAIN, Y_TRAIN, validation_data = (X_VAL, Y_VAL), 
                              epochs = 10, callbacks = [callback])

2022-10-09 06:04:18.328661: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:
print('MEAN VALIDATION DATA ACCURACY: ',round((np.mean(LSTM_HISTORY.history['val_accuracy'])*100),2))

MEAN VALIDATION DATA ACCURACY:  78.4


In [41]:
print('MEAN TRAINING DATA ACCURACY: ',round((np.mean(LSTM_HISTORY.history['accuracy'])*100),2))

MEAN TRAINING DATA ACCURACY:  93.94


In [42]:
ACCURACY = round((LSTM_MODEL.evaluate(X_TEST, Y_TEST)[1])*100,2)

ACCURACY



77.5

In [43]:
print('ACCURACY ON TESTING DATA: ',ACCURACY)

ACCURACY ON TESTING DATA:  77.5


In [44]:
PREDICTIONS = LSTM_MODEL.predict(X_TEST)

In [45]:
PREDICTIONS = np.round(PREDICTIONS, 0)

PREDICTIONS = PREDICTIONS.ravel()

PREDICTIONS = PREDICTIONS.astype('int64')

PREDICTIONS.ravel()

PREDICTIONS

array([1, 1, 0, ..., 1, 1, 1])

In [46]:
Y_TEST[56]

0

In [47]:
PREDICTIONS[56]

0

In [48]:
# TRY WITH LR RATE = 0.01

LSTM_MODEL.compile(optimizer = tf.keras.optimizers.Adam(lr = 0.01), 
                   loss = "binary_crossentropy", metrics = ["accuracy"])

In [49]:
LSTM_HISTORY1 = LSTM_MODEL.fit(X_TRAIN, Y_TRAIN, validation_data = (X_VAL, Y_VAL), 
                              epochs = 10, callbacks = [callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [50]:
print('MEAN VALIDATION DATA ACCURACY: ',round((np.mean(LSTM_HISTORY1.history['val_accuracy'])*100),2))

print('\nMEAN TRAINING DATA ACCURACY: ',round((np.mean(LSTM_HISTORY1.history['accuracy'])*100),2))

MEAN VALIDATION DATA ACCURACY:  76.37

MEAN TRAINING DATA ACCURACY:  97.89


In [51]:
# TRY WITH LR RATE = 0.0045

LSTM_MODEL.compile(optimizer = tf.keras.optimizers.Adam(lr = 0.0045), 
                   loss = "binary_crossentropy", metrics = ["accuracy"])

In [52]:
LSTM_HISTORY2 = LSTM_MODEL.fit(X_TRAIN, Y_TRAIN, validation_data = (X_VAL, Y_VAL), 
                              epochs = 10, callbacks = [callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [53]:
print('MEAN VALIDATION DATA ACCURACY: ',round((np.mean(LSTM_HISTORY2.history['val_accuracy'])*100),2))

print('\nMEAN TRAINING DATA ACCURACY: ',round((np.mean(LSTM_HISTORY2.history['accuracy'])*100),2))

MEAN VALIDATION DATA ACCURACY:  76.46

MEAN TRAINING DATA ACCURACY:  99.88


In [54]:
ACCURACY = round((LSTM_MODEL.evaluate(X_TEST, Y_TEST)[1])*100,2)

print('ACCURACY ON TESTING DATA: ',ACCURACY)

ACCURACY ON TESTING DATA:  76.43


In [55]:
PREDICTIONS = LSTM_MODEL.predict(X_TEST)

PREDICTIONS = np.round(PREDICTIONS, 0)

PREDICTIONS = PREDICTIONS.ravel()

PREDICTIONS = PREDICTIONS.astype('int64')

PREDICTIONS.ravel()

PREDICTIONS

array([1, 1, 1, ..., 1, 0, 1])

In [56]:
Y_TEST[56]

0

In [57]:
PREDICTIONS[56]

0