## **Name:**  Goli Srikar
## **RegNo:** 22BCE9946
## **Slot No:** L43-L44

## Week 5

## Text classification application using GRU

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from sklearn.model_selection import train_test_split
from collections import defaultdict


In [None]:
# Reuters dataset
num_words = 10000  #top 10,000 words
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=num_words)

max_len = 200
x_train = pad_sequences(x_train, maxlen=max_len, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_len, padding='post', truncating='post')

# Split training data into training and validation
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

#label names
def get_reuters_labels():
    labels = ["Cocoa", "Grain", "Vegetable Oil", "Livestock", "Cotton", "Iron Steel", "CPI", "Money FX",
              "Energy", "Ship", "Sugar", "Coffee", "Gold", "Tin", "Strategic Metals", "Grain", "Retail",
              "NAT-GAS", "Alum", "OPEC", "Palm Oil", "RUBBER", "COPPER", "COTTON", "Wool", "TEA",
              "STRATEGIC METALS", "PET-CHEM", "COCOA", "LIVESTOCK", "CORN", "WHEAT", "SUGAR", "COFFEE",
              "OILSEED", "COAL", "ORANGE", "HEATING", "RICE", "NICKEL", "SILVER", "PLATINUM", "CPI",
              "MONEY-FX", "INTEREST", "TRADE"]
    return defaultdict(lambda: "Unknown", {i: label for i, label in enumerate(labels)})

label_names = get_reuters_labels()

# sample data point
def decode_review(encoded_review):
    word_index = reuters.get_word_index()
    reverse_word_index = {value: key for key, value in word_index.items()}
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_review])

print("Sample Data Point:")
print(decode_review(x_train[0]))
print(f"Label: {label_names[y_train[0]]}")


Sample Data Point:
? u s exporters will have the opportunity to sell an additional 300 000 tonnes of u s durum wheat to algeria under the export enhancement program eep the u s agriculture department said the department said the sales will be subsidized with commodities from the commodity credit corporation ccc inventory and the subsidy will enable u s exports to compete at commercial prices in the algerian market algeria has already purchased 300 000 tonnes of u s durum wheat under a previous export enhancement initiative announced november 10 1986 it said details of the latest initiative including an invitation for offers from exporters will be issued in the near future the department said reuter 3 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
Label: Grain


In [None]:
# GRU model
model = Sequential([
    Embedding(input_dim=num_words, output_dim=128, input_length=max_len),
    GRU(64, return_sequences=False),
    Dense(46, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_val, y_val))

# Evaluate on test data
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 302ms/step - accuracy: 0.3199 - loss: 2.9347 - val_accuracy: 0.3606 - val_loss: 2.3540
Epoch 2/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 305ms/step - accuracy: 0.3678 - loss: 2.3537 - val_accuracy: 0.5281 - val_loss: 1.9524
Epoch 3/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 301ms/step - accuracy: 0.5352 - loss: 1.8692 - val_accuracy: 0.5860 - val_loss: 1.6683
Epoch 4/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 295ms/step - accuracy: 0.5966 - loss: 1.5938 - val_accuracy: 0.6038 - val_loss: 1.6005
Epoch 5/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 307ms/step - accuracy: 0.6372 - loss: 1.4463 - val_accuracy: 0.6071 - val_loss: 1.5879
Epoch 6/10
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 324ms/step - accuracy: 0.6712 - loss: 1.3194 - val_accuracy: 0.6422 - val_loss: 1.4743
Epoch 7/10

In [None]:
# Function to predict category for a custom sentence
def predict_category(sentence):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
    word_index = reuters.get_word_index()
    words = sentence.lower().split()
    encoded_sentence = [word_index.get(word, 2) + 3 for word in words]
    padded_sentence = pad_sequences([encoded_sentence], maxlen=max_len, padding='post', truncating='post')
    prediction = model.predict(padded_sentence)
    predicted_label = prediction.argmax()
    print(f"Sentence: {sentence}")
    print(f"Predicted Category: {label_names[predicted_label]}")

predict_category("Stock market is crashing due to economic slowdown.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step
Sentence: Stock market is crashing due to economic slowdown.
Predicted Category: Livestock


## Sentiment analyzer using LSTM.

In [None]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import LSTM

In [None]:
#IMDb movie dataset
num_words = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)
print(x_train[0])
print(y_train[0])

max_len = 200
x_train = pad_sequences(x_train, maxlen=max_len, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_len, padding='post', truncating='post')


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1


In [None]:

# Split training data into training and validation
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential([
    Embedding(input_dim=num_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5, batch_size=64, validation_data=(x_val, y_val))

# Evaluate on test data
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')



Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 301ms/step - accuracy: 0.5133 - loss: 0.6919 - val_accuracy: 0.5946 - val_loss: 0.6794
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 300ms/step - accuracy: 0.6122 - loss: 0.6616 - val_accuracy: 0.5796 - val_loss: 0.6890
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 312ms/step - accuracy: 0.6029 - loss: 0.6460 - val_accuracy: 0.7152 - val_loss: 0.5810
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 315ms/step - accuracy: 0.8525 - loss: 0.3512 - val_accuracy: 0.8538 - val_loss: 0.3409
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 300ms/step - accuracy: 0.9244 - loss: 0.2101 - val_accuracy: 0.8482 - val_loss: 0.3654
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 38ms/step - accuracy: 0.8408 - loss: 0.3858
Test Accuracy: 84.03%


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# IMDb word index
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

# Function to preprocess and predict sentiment for custom sentence
def predict_sentiment(sentence):
    tokenizer = Tokenizer(num_words=num_words)
    words = sentence.lower().split()
    encoded_sentence = [word_index.get(word, 2) + 3 for word in words]
    padded_sentence = pad_sequences([encoded_sentence], maxlen=max_len, padding='post', truncating='post')

    prediction = model.predict(padded_sentence)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"

    print(f"Sentence: {sentence}")
    print(f"Sentiment: {sentiment} (Confidence: {prediction:.4f})")


predict_sentiment("This movie was fantastic and full of thrilling moments!")
predict_sentiment("I hated this movie. It was boring and had no plot.")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253ms/step
Sentence: This movie was fantastic and full of thrilling moments!
Sentiment: Positive (Confidence: 0.9808)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Sentence: I hated this movie. It was boring and had no plot.
Sentiment: Negative (Confidence: 0.1057)
