<a href="https://colab.research.google.com/github/TrishKedi/AI-Coursework/blob/main/ML13_Code06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, SimpleRNN
import tarfile, shutil

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

# extract data
tarfile.open('aclImdb_v1.tar.gz').extractall('./')

--2025-01-07 10:57:24--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-01-07 10:57:43 (4.39 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
# Remove the unlabeled 'unsup' folder, otherwise it will create an extra class
# when the training directory is loaded.
try:
    shutil.rmtree('./aclImdb/train/unsup')
except FileNotFoundError:
    # The folder was already removed by an earlier run of this cell, so the
    # directory is in the desired state and re-running stays harmless.
    pass

In [None]:
# Build the (text, label) datasets from the extracted train/test folders.
train_dir = './aclImdb/train'
test_dir = './aclImdb/test'
train_data = text_dataset_from_directory(train_dir)
test_data = text_dataset_from_directory(test_dir)


def _text_only(text, label):
    """Drop the label and keep just the review text of a dataset element."""
    return text


# Text-only view of the training set.
train_texts = train_data.map(_text_only)

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [None]:
# Fetch a single batch and show its first review together with its label.
text_batch, label_batch = next(iter(train_data.take(1)))
print(text_batch.numpy()[0])
print(label_batch.numpy()[0])  # 0: negative, 1: positive

b'If you like films about school bullies, brave children, hilarious toddlers and worm eating, then How to Eat Fried Worms will appeal to you.<br /><br />The film is about a boy named Billy, who when arriving on his first day at a new school, discovers that some of his classmates have played a prank on him by putting worms into his lunch. The school bully, Joe and his "team" of friends start teasing Billy and calling him "worm boy".<br /><br />Billy decides to play along by saying that "he eats worms all the time". Joe and his friends don\'t believe him but Billy assures them and bets Joe that he can eat ten worms in one day otherwise he will come to school with worms in his pants.<br /><br />The boys take Billy up on his bet, leaving the weak stomached child with a mission to gain respect from his classmates by eating worms cooked, fried, or alive.<br /><br />The film may sound gross but there are a lot of messages in it. For one, it portrays true friendship and how to accept people fo

In [None]:
# prepare text vectorization
max_tokens = 1000  # size of the vocabulary kept by the vectorization layer
max_len = 100      # every review is padded/truncated to this many token ids


def standardize_review(raw_text):
    """Lower-case the text, drop HTML line breaks and strip punctuation.

    The raw reviews contain literal ``<br />`` tags. The built-in
    'lower_and_strip_punctuation' mode only deletes the ``<``, ``/`` and ``>``
    characters, so ``you.<br /><br />The`` turns into ``youbr br the`` and the
    vocabulary fills up with ``br`` and glued-together junk tokens. Replacing
    the tag with a space first keeps neighbouring words separate.

    Args:
        raw_text: string tensor holding the raw review text.

    Returns:
        A string tensor of the same shape with the cleaned text.
    """
    lowered = tf.strings.lower(raw_text)
    # Lower-casing happens first, so upper-case variants such as <BR> match too.
    without_breaks = regex_replace(lowered, r'<br\s*/?>', ' ')
    # Same punctuation set that 'lower_and_strip_punctuation' removes.
    return regex_replace(
        without_breaks, r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']', '')


vectorization_layer = TextVectorization(
    standardize=standardize_review,
    split='whitespace',
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_len)

# Learn the vocabulary from the training texts only.
vectorization_layer.adapt(train_texts)

In [None]:
# Sentiment classifier: raw string -> token ids -> embeddings -> LSTM -> dense head.
model = Sequential([
    Input(shape=(1,), dtype="string"),
    vectorization_layer,
    Embedding(max_tokens, 16),
    LSTM(64),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),  # single probability output
])

In [None]:
# Print a summary of the layers that make up the model.
model.summary()

In [None]:
# Set up the binary-classification objective, then train for ten epochs.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.fit(train_data, epochs=10)

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.5594 - loss: 0.6713
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7688 - loss: 0.4912
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7162 - loss: 0.5438
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.6055 - loss: 0.6455
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7754 - loss: 0.4739
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7936 - loss: 0.4440
Epoch 7/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.8003 - loss: 0.4300
Epoch 8/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.8055 - loss: 0.4223
Epoch 9/10
[1m782/782[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x7ee7d1acdab0>

In [None]:
# Evaluate the model on our test dataset.
# The result lists the loss first, followed by the metric configured in
# compile() (accuracy).
model.evaluate(test_data)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7965 - loss: 0.4326


[0.43424782156944275, 0.7945600152015686]

In [None]:
# Two hand-written reviews to sanity-check the trained model.
sentences = [
  "that was the worst film in my life",
  "that was the best film in my life",
]

# Wrap the sentences in a TensorFlow dataset holding one batch with all of them.
predict_dataset = (
    tf.data.Dataset.from_tensor_slices(sentences)
    .batch(len(sentences))
)

# One score per sentence, 0: negative, 1: positive
predictions = model.predict(predict_dataset)
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[[0.03325916]
 [0.5730721 ]]


In [None]:
# Same architecture as before, but with a SimpleRNN in place of the LSTM.
model2 = Sequential([
    Input(shape=(1,), dtype="string"),
    vectorization_layer,
    Embedding(max_tokens, 16),
    SimpleRNN(64),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),
])

# Set up the binary-classification objective, then train for ten epochs.
model2.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model2.fit(train_data, epochs=10)

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 77ms/step - accuracy: 0.5015 - loss: 0.6973
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 76ms/step - accuracy: 0.5086 - loss: 0.6968
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 76ms/step - accuracy: 0.5040 - loss: 0.6968
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 76ms/step - accuracy: 0.5110 - loss: 0.6945
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 76ms/step - accuracy: 0.5169 - loss: 0.6926
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 76ms/step - accuracy: 0.5341 - loss: 0.6900
Epoch 7/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 77ms/step - accuracy: 0.5733 - loss: 0.6755
Epoch 8/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 76ms/step - accuracy: 0.6221 - loss: 0.6542
Epoch 9/10
[1m782/782[

<keras.src.callbacks.history.History at 0x7e8a8efee260>