<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/NLP%20with%20Tensorflow/Text%20Classification%20using%20Glove%20Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram 

In [1]:
from google.colab import files

files.upload()


! mkdir ~/.kaggle


! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
# Download the IMDB sentiment-analysis dataset archive with the kaggle CLI.
!kaggle datasets download -d columbine/imdb-dataset-sentiment-analysis-in-csv-format

In [None]:
!unzip /content/imdb-dataset-sentiment-analysis-in-csv-format.zip

### Importing Dependencies

In [51]:
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
from datetime import datetime
import unicodedata
import nltk
#nltk.download("all")
from functools import lru_cache
from nltk.corpus import stopwords
import re
import string

In [52]:
# Load the training split; the preview below shows a `text` and a `label` column.
df = pd.read_csv("/content/Train.csv")

In [53]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [54]:
# Count rows per label to check the class balance.
df["label"].value_counts()

0    20019
1    19981
Name: label, dtype: int64

### Cleaning the dataset

In [55]:
@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset (built once, O(1) lookups)."""
    return frozenset(stopwords.words("english"))


@lru_cache(maxsize=None)
def _lemmatizer():
    """Return a single shared WordNetLemmatizer instead of building one per call."""
    return nltk.stem.wordnet.WordNetLemmatizer()


# One ASCII punctuation character. re.escape keeps characters such as ']', '\\',
# '^' and '-' literal inside the character class.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


@lru_cache(maxsize=128)
def clean_text(text):
    """Normalise one document for tokenisation.

    Steps, in order:
      1. NFKD-normalise and drop every character that cannot be encoded as ASCII.
      2. Replace ASCII punctuation with spaces.
      3. Drop English stop words (compared case-insensitively; the case of the
         remaining words is preserved).
      4. Lemmatise each remaining word with WordNet.
      5. Collapse whitespace runs to single spaces.

    Args:
        text: The raw document. Must be a ``str`` (and hashable, because results
            are memoised by ``lru_cache``).

    Returns:
        The cleaned document as a single space-separated string.
    """
    stop_words = _english_stopwords()
    lem = _lemmatizer()

    # Normalise BEFORE stripping punctuation: NFKD can expand compatibility
    # characters into ASCII punctuation (e.g. the ellipsis character becomes
    # "..."), which would otherwise survive the punctuation pass.
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf8")

    text = _PUNCTUATION_RE.sub(" ", text)

    # str.split() already discards empty strings and surrounding whitespace.
    words = [word for word in text.split() if word.lower() not in stop_words]

    text = " ".join(lem.lemmatize(word) for word in words)

    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces

    return text

In [56]:
# Clean every training document; this overwrites the raw `text` column in place.
df["text"] = df["text"].apply(clean_text)

### Taking X train and Y train

In [57]:
# Features: the cleaned `text` column. Targets: the `label` column.
X_train, y_train = df["text"].values, df["label"].values

In [58]:
# Display the label array.
y_train

array([0, 0, 0, ..., 0, 1, 1])

### Tokenization

In [59]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and reused when sizing
# the embedding matrix.
vocab_size = 20000

In [60]:
# Word-level tokenizer that keeps the original casing; "<oov>" is the
# out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token="<oov>",
    lower=False,
    char_level=False,
)

# Build the vocabulary from the cleaned training texts only.
tokenizer.fit_on_texts(X_train)

# Replace each text with its sequence of integer word ids.
X_train = tokenizer.texts_to_sequences(X_train)

### Padding

In [61]:
# Fixed sequence length; also passed to the Embedding layer as `input_length`.
max_len = 256

# Pad at the end ('post') up to max_len. `truncating` is not set, so the
# pad_sequences default applies to sequences longer than max_len.
X_train = pad_sequences(X_train, maxlen = max_len, padding = 'post')

### Loading Pre-Trained GloVe Word Embeddings

In [32]:
! wget https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip


2021-11-25 08:47:36 (57.7 MB/s) - ‘glove.840B.300d.zip’ saved [2176768976/2176768976]



In [33]:
!unzip /content/glove.840B.300d.zip




In [62]:
# Each line of the GloVe text file is "<token> <v1> ... <v300>", separated by
# single spaces. The token itself may contain spaces, so the vector is taken
# from the right-hand side of the line.
glove_dim = 300
embeddings_index = {}

# `with` guarantees the file is closed even if parsing raises.
with open("/content/glove.840B.300d.txt", encoding = "utf8") as f:
    for line in f:
        # Split off exactly the last `glove_dim` fields; everything before them
        # is the token, kept verbatim (internal spaces included) so distinct
        # tokens are not merged onto one key.
        parts = line.rstrip("\n").rsplit(" ", glove_dim)
        if len(parts) != glove_dim + 1:
            # Malformed or truncated line: skip it rather than store a bad vector.
            continue
        word = parts[0]
        coefs = np.asarray(parts[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 2195893 word vectors.


### Creating Word Embedding Matrix

In [63]:
word_index = tokenizer.word_index

# A Tokenizer created with num_words=vocab_size only emits ids strictly below
# vocab_size (id 0 is never assigned to a word and is what pad_sequences pads
# with). The matrix therefore needs rows 0 .. num_words - 1 and nothing more.
num_words = min(vocab_size, len(word_index) + 1)

emb_dim = 300
embedding_matrix = np.zeros((num_words, emb_dim))


for word, i in word_index.items():
    if i >= num_words:
        # This id is never produced by texts_to_sequences; no row exists for it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # we found the word - add that word's vector to the matrix.
        embedding_matrix[i] = embedding_vector
    else:
        # doesn't exist in GloVe, assign a random vector.
        embedding_matrix[i] = np.random.randn(emb_dim)

### Building Bi-Directional LSTM

In [64]:
model = tf.keras.models.Sequential()

# Embedding table initialised from the GloVe-derived matrix and left trainable.
model.add(
    tf.keras.layers.Embedding(
        num_words,
        emb_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=True,
        input_length=max_len,
    )
)

# First recurrent block: returns the full sequence so a second LSTM can follow.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.BatchNormalization())

# Second recurrent block: returns only the final state.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.BatchNormalization())

# Classification head: one sigmoid unit for the binary label.
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

### Compiling the model

In [65]:
# Binary cross-entropy matches the single sigmoid output unit; "adam" is used
# with its default settings and accuracy is tracked as the metric.
model.compile(loss = "binary_crossentropy",
                 optimizer = "adam",
                 metrics = ["accuracy"])

### Callbacks

In [66]:
# Save the model to model.h5 whenever val_loss improves on the best value so far.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "model.h5", monitor = "val_loss", save_best_only = True, verbose = 1
)

# Stop after 3 epochs without a val_loss improvement and roll back to the best weights.
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor = "val_loss", verbose = 1, restore_best_weights = True, patience = 3
)

# Multiply the learning rate by 0.2 after 2 epochs without a val_loss improvement.
# min_lr must be BELOW the starting rate (0.001 for the "adam" optimizer used
# here): with min_lr = 0.001 the callback could never lower the rate at all.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = "val_loss", factor = 0.2, patience = 2, min_lr = 1e-5, verbose = 1
)


In [67]:
def scheduler(epoch, lr):
  """Learning-rate schedule: hold for the first epochs, then decay exponentially.

  Args:
    epoch: Epoch index supplied by the LearningRateScheduler callback.
    lr: Learning rate currently in effect.

  Returns:
    ``lr`` unchanged while ``epoch < 5``; afterwards ``lr * exp(-0.1)`` as a
    plain Python float.
  """
  if epoch < 5:
    return lr
  # Return a Python float rather than a tf.Tensor: a float is accepted by the
  # callback in every Keras version, a tensor is not.
  return float(lr * np.exp(-0.1))

# Wrap `scheduler` in a Keras callback; verbose = 1 logs the rate it sets.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose = 1)

In [68]:
# Callbacks handed to model.fit.
callbacks = [checkpoint, earlystopping, reduce_lr, lr_schedule]

### Loading the Test and Validation Datasets

In [69]:
test = pd.read_csv("/content/Test.csv")
val = pd.read_csv("/content/Valid.csv")

# Apply the same cleaning that was applied to the training texts.
test["text"] = test["text"].apply(clean_text)
val["text"] = val["text"].apply(clean_text)

# Build the held-out arrays from `test` / `val` (NOT from the training frame
# `df`), and encode them exactly like X_train: same fitted tokenizer, same
# padding and length. The tokenizer is not re-fitted here, so word ids stay
# consistent with the embedding matrix.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test["text"].values),
    maxlen = max_len, padding = 'post',
)
y_test = test["label"].values

X_val = pad_sequences(
    tokenizer.texts_to_sequences(val["text"].values),
    maxlen = max_len, padding = 'post',
)
y_val = val["label"].values

In [None]:
# Train for at most 5 epochs with batches of 64. (X_val, y_val) supplies the
# val_loss that the checkpoint, early-stopping and reduce-LR callbacks monitor.
# NOTE(review): if Keras passes zero-based epoch indices to `scheduler`, its
# decay branch (epoch >= 5) is never reached with epochs = 5 - confirm.
history = model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 5, batch_size = 64, callbacks = callbacks)


Epoch 00001: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 1/5