# Text and Sequence Assignment
## Author : Dev, Sakshi

We will be using the IMDB dataset for this text-and-sequence problem. First, we create a validation set by moving 5,000 reviews per class (10,000 in total) out of the training directory; the remaining reviews stay available for training.

### Reading Data

In [1]:
import os, shutil , pathlib , random
# Root folder of the IMDB data; it holds the "train" and "test" splits and
# receives the "validation" split created below.
base_dir = pathlib.Path("/Users/devmarwah/Documents/MSBA assignments/Advanced Machine Learning/Text and Sequencing/IMDB data")
val_dir = base_dir / "validation"
train_dir = base_dir / "train"
num_val_samples = 5000  # reviews held out per class
for category in ("neg", "pos"):
    # exist_ok=True so that re-running the cell does not crash on the
    # folder created by a previous run.
    os.makedirs(val_dir / category, exist_ok=True)
    # Move only what is still missing, so a re-run is a no-op instead of
    # draining another 5000 files out of the training folder.
    shortfall = num_val_samples - len(os.listdir(val_dir / category))
    if shortfall <= 0:
        continue
    # os.listdir returns entries in arbitrary order; sort first so the
    # seeded shuffle selects the same files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    val_file = files[:shortfall]
    for fname in val_file:
        # The file name belongs in the destination path: the third
        # positional parameter of shutil.move is copy_function, not a name.
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

Making a small training sample as well : 

In [2]:
# Folder holding the (initially very small) training sample.
train_dir_1 = base_dir / "train1"
num_train_samples = 50  # reviews per class in the small training sample
for category in ("neg", "pos"):
    # exist_ok=True so that re-running the cell does not crash on the
    # folder created by a previous run.
    os.makedirs(train_dir_1 / category, exist_ok=True)
    # Move only what is still missing so that a re-run does not keep
    # pulling additional files out of the original training folder.
    shortfall = num_train_samples - len(os.listdir(train_dir_1 / category))
    if shortfall <= 0:
        continue
    # os.listdir returns entries in arbitrary order; sort first so the
    # seeded shuffle selects the same files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    train_file = files[:shortfall]
    for fname in train_file:
        # The file name belongs in the destination path: the third
        # positional parameter of shutil.move is copy_function, not a name.
        shutil.move(train_dir / category / fname,
                    train_dir_1 / category / fname)

Reading our datasets : 

In [3]:
from tensorflow import keras
# All three splits are read with the same batch size, in the order
# train -> validation -> test.
batch_size = 32
train, validation, test = (
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in (train_dir_1, val_dir, base_dir / "test")
)

Found 100 files belonging to 2 classes.
Found 10000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


### Trying sequencing model 

Preparing dataset for this model : 

In [4]:
from tensorflow.keras import layers
max_length = 150    # every review is cut off (or padded) to 150 tokens
max_tokens = 10000  # vocabulary is limited to the 10,000 most frequent words
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# The vocabulary is learned from the raw training text only (labels dropped).
text_only_train_ds = train.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)


def _vectorize(x, y):
    """Turn a (text, label) batch into an (integer token ids, label) batch."""
    return text_vectorization(x), y


# Apply the same text -> integer-sequence mapping to every split.
int_train_ds = train.map(_vectorize, num_parallel_calls=4)
int_val_ds = validation.map(_vectorize, num_parallel_calls=4)
int_test_ds = test.map(_vectorize, num_parallel_calls=4)

2024-04-18 17:37:16.381985: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


**Model Construction - Embedding Layer**

In [5]:
 import tensorflow as tf
# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int64")
# Embedding trained from scratch. mask_zero=True marks token id 0 (the
# padding value) as masked so the LSTM can skip the padded positions.
embedding = layers.Embedding(
    input_dim=max_tokens,
    output_dim=256,
    mask_zero=True,
)
embedded = embedding(inputs)
recurrent = layers.Bidirectional(layers.LSTM(32))
x = recurrent(embedded)
x = layers.Dropout(0.5)(x)
# Single sigmoid unit for the binary (neg / pos) decision.
outputs = layers.Dense(1, activation="sigmoid")(x)

In [6]:
model = keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# With save_best_only=True the checkpoint file is only overwritten when the
# monitored quantity improves.
checkpoint = keras.callbacks.ModelCheckpoint(
    "/Users/devmarwah/Documents/MSBA assignments/Advanced Machine Learning/Text and Sequencing/one_hot_bidir_lstm.keras",
    save_best_only=True,
)
callbacks = [checkpoint]
model.summary()

Fitting the model on our training dataset, using the validation set to monitor it:

In [7]:
# Train on the vectorized training set for 10 epochs; the validation set is
# evaluated after every epoch and drives the checkpoint callback.
model.fit(
    int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
    callbacks=callbacks,
)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.5226 - loss: 0.6923 - val_accuracy: 0.5208 - val_loss: 0.6926
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - accuracy: 0.6973 - loss: 0.6854 - val_accuracy: 0.5232 - val_loss: 0.6923
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.8601 - loss: 0.6714 - val_accuracy: 0.5186 - val_loss: 0.6922
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.8448 - loss: 0.6633 - val_accuracy: 0.5199 - val_loss: 0.6919
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.8954 - loss: 0.6559 - val_accuracy: 0.5259 - val_loss: 0.6916
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - accuracy: 0.9232 - loss: 0.6397 - val_accuracy: 0.5279 - val_loss: 0.6909
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x1689e1810>

Testing this model 

In [8]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the accuracy, reported here as a percentage.
test_metrics = model.evaluate(int_test_ds)
print("\n Model's accuracy:", round(test_metrics[1] * 100, 2), "%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 24ms/step - accuracy: 0.5600 - loss: 0.6828

 Model's accuracy: 56.23 %


Hence, our first model's accuracy with LSTM and embedding is just **56.23%** which is quite low. We will now try a pre-trained word embedding.

**Model Construction - Pretrained word embeddings**

Parsing after downloading the glove pretrained word-embedding.

In [9]:
import numpy as np
path_to_glove_file = "/Users/devmarwah/Documents/MSBA assignments/Advanced Machine Learning/Text and Sequencing/glove.6B/glove.6B.100d.txt"

# Maps each GloVe token to its coefficient vector (float32 NumPy array).
embeddings_index = {}
# Read the file as UTF-8 explicitly so parsing does not depend on the
# platform's default text encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<token> <coef> <coef> ...": split off the token once,
        # then parse the rest as space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

Preparing a matrix of GloVe :

In [10]:
embedding_dim = 100  # dimensionality of the GloVe vectors loaded above
vocabulary = text_vectorization.get_vocabulary()
# Token -> integer id, in the order used by the vectorization layer.
word_index = dict(zip(vocabulary, range(len(vocabulary))))
# Row i holds the GloVe vector of token id i; tokens without a GloVe entry
# keep an all-zero row.
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    # Ids beyond the matrix are skipped entirely. Previously the lookup was
    # guarded but the assignment was not, so an out-of-range id would have
    # reused the previous word's vector and indexed past the matrix.
    if i >= max_tokens:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Making an embedding layer with this embedding matrix : 

In [11]:
# Embedding layer initialised from the GloVe matrix and frozen
# (trainable=False); token id 0 is masked as padding.
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

Making a final model with pretrained word-embedding : 

In [12]:
# Same architecture as the first model, but fed by the pretrained embedding
# layer.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
recurrent = layers.Bidirectional(layers.LSTM(32))
x = recurrent(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

# With save_best_only=True the checkpoint file is only overwritten when the
# monitored quantity improves.
checkpoint = keras.callbacks.ModelCheckpoint(
    "/Users/devmarwah/Documents/MSBA assignments/Advanced Machine Learning/Text and Sequencing/glove_embeddings_sequence_model.keras",
    save_best_only=True,
)
callbacks = [checkpoint]

Training this model on our training dataset : 

In [13]:
# Train the GloVe-based model for 10 epochs, validating after each epoch.
model.fit(
    int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
    callbacks=callbacks,
)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - accuracy: 0.5003 - loss: 0.7560 - val_accuracy: 0.5022 - val_loss: 0.7021
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - accuracy: 0.5502 - loss: 0.6938 - val_accuracy: 0.5117 - val_loss: 0.6968
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.5424 - loss: 0.6876 - val_accuracy: 0.5035 - val_loss: 0.7022
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step - accuracy: 0.5520 - loss: 0.6754 - val_accuracy: 0.5103 - val_loss: 0.6973
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - accuracy: 0.6671 - loss: 0.6499 - val_accuracy: 0.5125 - val_loss: 0.6940
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - accuracy: 0.6009 - loss: 0.6393 - val_accuracy: 0.5268 - val_loss: 0.6924
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x168a67610>

Testing this model on our testing data :

In [14]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the accuracy, reported here as a percentage.
test_metrics = model.evaluate(int_test_ds)
print("\n Model's Accuracy:", round(test_metrics[1] * 100, 2))

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.5165 - loss: 0.7021

 Model's Accuracy: 51.73


Pre-trained embedding is not really helpful in this case. Hence, training from scratch worked better for this dataset.
Now, we will try to increase training sample size and then train our model again.

**Increasing training size by 7000 samples**

In [15]:
num_train_samples = 3500  # additional reviews per class (7000 in total)
# train1 already holds the 50 reviews per class of the initial small sample,
# so after this step every class folder should contain 50 + 3500 files.
target_per_class = 50 + num_train_samples
for category in ("neg", "pos"):
    # Move only what is still missing so that re-running the cell does not
    # keep adding files beyond the intended training-set size.
    shortfall = target_per_class - len(os.listdir(train_dir_1 / category))
    if shortfall <= 0:
        continue
    # os.listdir returns entries in arbitrary order; sort first so the
    # seeded shuffle selects the same files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    train_file = files[:shortfall]
    for fname in train_file:
        # The file name belongs in the destination path: the third
        # positional parameter of shutil.move is copy_function, not a name.
        shutil.move(train_dir / category / fname,
                    train_dir_1 / category / fname)

Making a training dataset again : 

In [16]:
# Re-read the enlarged training folder and vectorize it.
# NOTE(review): text_vectorization is not re-adapted here, so it still uses
# the vocabulary learned from the earlier, smaller training sample - confirm
# this is intended.
train = keras.utils.text_dataset_from_directory(
    train_dir_1,
    batch_size=batch_size,
)
int_train_ds = train.map(
    lambda text, label: (text_vectorization(text), label),
    num_parallel_calls=4,
)

Found 7100 files belonging to 2 classes.


Training the last pretrained embedding model with new training dataset : 

In [17]:
# Continue training the existing model (weights are not reset) on the larger
# training set.
model.fit(
    int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
    callbacks=callbacks,
)

Epoch 1/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 89ms/step - accuracy: 0.5874 - loss: 0.6782 - val_accuracy: 0.7055 - val_loss: 0.5807
Epoch 2/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 94ms/step - accuracy: 0.6922 - loss: 0.5781 - val_accuracy: 0.7061 - val_loss: 0.5598
Epoch 3/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 96ms/step - accuracy: 0.7612 - loss: 0.4990 - val_accuracy: 0.7653 - val_loss: 0.4767
Epoch 4/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 94ms/step - accuracy: 0.8021 - loss: 0.4375 - val_accuracy: 0.8066 - val_loss: 0.4251
Epoch 5/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 91ms/step - accuracy: 0.8249 - loss: 0.3936 - val_accuracy: 0.8144 - val_loss: 0.4328
Epoch 6/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 90ms/step - accuracy: 0.8488 - loss: 0.3627 - val_accuracy: 0.7846 - val_loss: 0.4833
Epoch 7/10
[1m2

<keras.src.callbacks.history.History at 0x175d0f490>

Testing the model now : 

In [18]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the accuracy, reported here as a percentage.
test_metrics = model.evaluate(int_test_ds)
print("\n Model's Accuracy:", round(test_metrics[1] * 100, 2))

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.8229 - loss: 0.4395

 Model's Accuracy: 82.13


Increasing the training sample from 100 to 7,100 reviews increased test accuracy substantially, from 51.73% to 82.13%.

**Increasing training sample again by 7000**

In [19]:
num_train_samples = 3500  # additional reviews per class (7000 in total)
# train1 already holds 50 + 3500 reviews per class from the previous steps,
# so after this step every class folder should contain 7050 files.
target_per_class = 50 + 3500 + num_train_samples
for category in ("neg", "pos"):
    # Move only what is still missing so that re-running the cell does not
    # keep adding files beyond the intended training-set size.
    shortfall = target_per_class - len(os.listdir(train_dir_1 / category))
    if shortfall <= 0:
        continue
    # os.listdir returns entries in arbitrary order; sort first so the
    # seeded shuffle selects the same files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    train_file = files[:shortfall]
    for fname in train_file:
        # The file name belongs in the destination path: the third
        # positional parameter of shutil.move is copy_function, not a name.
        shutil.move(train_dir / category / fname,
                    train_dir_1 / category / fname)

Reading new training set:

In [20]:
# Re-read the enlarged training folder and vectorize it.
# NOTE(review): text_vectorization is not re-adapted here, so it still uses
# the vocabulary learned from the earlier, smaller training sample - confirm
# this is intended.
train = keras.utils.text_dataset_from_directory(
    train_dir_1,
    batch_size=batch_size,
)
int_train_ds = train.map(
    lambda text, label: (text_vectorization(text), label),
    num_parallel_calls=4,
)

Found 14100 files belonging to 2 classes.


Training this model again : 

In [21]:
# Continue training the existing model on the 14,100-review training set.
# Unlike the earlier fit calls, no checkpoint callback is passed here.
model.fit(
    int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
)

Epoch 1/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 81ms/step - accuracy: 0.8695 - loss: 0.3274 - val_accuracy: 0.8276 - val_loss: 0.4064
Epoch 2/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 82ms/step - accuracy: 0.8780 - loss: 0.2927 - val_accuracy: 0.8290 - val_loss: 0.4444
Epoch 3/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 80ms/step - accuracy: 0.8955 - loss: 0.2648 - val_accuracy: 0.8247 - val_loss: 0.4618
Epoch 4/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 79ms/step - accuracy: 0.9038 - loss: 0.2386 - val_accuracy: 0.8301 - val_loss: 0.4729
Epoch 5/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 79ms/step - accuracy: 0.9152 - loss: 0.2195 - val_accuracy: 0.8306 - val_loss: 0.4761
Epoch 6/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 79ms/step - accuracy: 0.9247 - loss: 0.1934 - val_accuracy: 0.8190 - val_loss: 0.5349
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x17b4bba50>

Testing this model : 

In [22]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the accuracy, reported here as a percentage.
test_metrics = model.evaluate(int_test_ds)
print("\n Model's Accuracy:", round(test_metrics[1] * 100, 2))

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.8240 - loss: 0.6487

 Model's Accuracy: 81.96
