<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/NLP%20with%20Tensorflow/Text%20Classification%20using%20Glove%20Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram 

In [1]:
from google.colab import files

files.upload()


! mkdir ~/.kaggle


! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [2]:
!kaggle datasets download -d columbine/imdb-dataset-sentiment-analysis-in-csv-format

Downloading imdb-dataset-sentiment-analysis-in-csv-format.zip to /content
 66% 17.0M/25.7M [00:01<00:01, 5.65MB/s]
100% 25.7M/25.7M [00:01<00:00, 18.3MB/s]


In [3]:
!unzip /content/imdb-dataset-sentiment-analysis-in-csv-format.zip

Archive:  /content/imdb-dataset-sentiment-analysis-in-csv-format.zip
  inflating: Test.csv                
  inflating: Train.csv               
  inflating: Valid.csv               


### Importing Dependencies

In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
from datetime import datetime
import unicodedata
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
from functools import lru_cache
from nltk.corpus import stopwords
import re
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
df = pd.read_csv("/content/Train.csv")

In [6]:
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [7]:
df["label"].value_counts()

0    20019
1    19981
Name: label, dtype: int64

### Cleaning the dataset

In [11]:
@lru_cache(maxsize=128)
def clean_text(text):

    stp_wrd = stopwords.words("english")

    text = re.sub(fr"[{string.punctuation}]", " ", text)

    # Unicode the text
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf8")
                                                                                        
    text = list(
        filter(
            None,
            map(
                lambda x: x.strip() if x.lower() not in stp_wrd else None,
                text.split(),
            ),
        )
    )

    lem = nltk.stem.wordnet.WordNetLemmatizer()  # Lemmatization

    text = " ".join(map(lambda x: lem.lemmatize(x), text))

    text = re.sub("\s+", " ", text)  # Remove extra spaces

    return text

In [12]:
df["text"] = df["text"].apply(clean_text)

### Taking X train and Y train

In [13]:
X_train = df["text"].values
y_train = df["label"].values

In [14]:
y_train

array([0, 0, 0, ..., 0, 1, 1])

### Tokenization

In [15]:
vocab_size = 20000

In [16]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<oov>", lower = False, char_level = False)

tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)

### Padding

In [17]:
max_len = 256

X_train = pad_sequences(X_train, maxlen = max_len, padding = 'post')

### Taking Pre-Trained Glove Word Embeddings

In [None]:
! wget https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip

In [19]:
!unzip /content/glove.840B.300d.zip

Archive:  /content/glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [20]:
embeddings_index = {}

f = open("/content/glove.840B.300d.txt", encoding = "utf8")

for line in f:
    values = line.split()
    word = ''.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs

f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 2195893 word vectors.


### Creating Word Embedding Matrix

In [21]:
word_index = tokenizer.word_index

num_words = min(vocab_size, len(word_index)) + 1

emb_dim = 300
embedding_matrix = np.zeros((num_words, emb_dim))


for word, i in word_index.items():
    if i > vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # we found the word - add that words vector to the matrix.
        embedding_matrix[i] = embedding_vector
    else:
        # doesn't exist, assign a random vector.
        embedding_matrix[i] = np.random.randn(emb_dim)

### Building Bi-Directional LSTM

In [23]:
model = tf.keras.models.Sequential([
                                     
         tf.keras.layers.Embedding(num_words, emb_dim,
                                   embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
                                   trainable = True, input_length = max_len),

         tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences = True)),
         tf.keras.layers.BatchNormalization(),

         tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
         tf.keras.layers.BatchNormalization(),

         tf.keras.layers.Dense(64, activation = "relu"),

         tf.keras.layers.Dense(1, activation = "sigmoid")
])

In [24]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 256, 300)          6000300   
                                                                 
 bidirectional_2 (Bidirectio  (None, 256, 128)         186880    
 nal)                                                            
                                                                 
 batch_normalization_2 (Batc  (None, 256, 128)         512       
 hNormalization)                                                 
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 batch_normalization_3 (Batc  (None, 128)              512       
 hNormalization)                                      

### Compiling the model

In [25]:
model.compile(loss = "binary_crossentropy",
                 optimizer = "adam",
                 metrics = ["accuracy"])

### Callbacks

In [26]:
checkpoint = tf.keras.callbacks.ModelCheckpoint("model.h5", monitor = "val_loss", save_best_only = True, 
                                                verbose = 1)

earlystopping = tf.keras.callbacks.EarlyStopping( monitor = "val_loss", verbose = 1, restore_best_weights = True,
                                                 patience = 3)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor = "val_loss", factor = 0.2,
                              patience = 2, min_lr = 0.001, verbose = 1)


In [27]:
def scheduler(epoch, lr):
  if epoch < 5:
    return lr
  else:
    return lr * tf.math.exp(-0.1)

lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose = 1)

In [28]:
callbacks = [checkpoint, earlystopping, reduce_lr, lr_schedule]

### Taking Test and Val dataset

In [29]:
test = pd.read_csv("/content/Test.csv")
val = pd.read_csv("/content/Valid.csv")

test["text"] = test["text"].apply(clean_text)
val["text"] = val["text"].apply(clean_text)

X_test = df["text"].values
y_test = df["label"].values

X_val = df["text"].values
y_val = df["label"].values

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen = max_len)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen = max_len)

### Training the model

In [30]:
history = model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 5, batch_size = 64, callbacks = callbacks)


Epoch 00001: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.52716, saving model to model.h5

Epoch 00002: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 2/5
Epoch 00002: val_loss did not improve from 0.52716

Epoch 00003: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 3/5
Epoch 00003: val_loss improved from 0.52716 to 0.37778, saving model to model.h5

Epoch 00004: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 4/5
Epoch 00004: val_loss did not improve from 0.37778

Epoch 00005: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 5/5
Epoch 00005: val_loss did not improve from 0.37778
