Practical - 10 : Implement text processing with LSTM (Long Short-Term Memory).

In [1]:
# Import libraries
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Sample dataset
texts = [
    "NLP is used in chatbots and sentiment analysis.",
    "Support vector machines perform well in classification.",
    "India won the cricket match yesterday.",
    "Virat Kohli scored a century in the World Cup.",
    "New AI models are transforming the tech industry.",
    "Football and cricket are the most popular sports in India.",
    "GPU acceleration helps train deep learning models.",
    "Sachin Tendulkar is a legendary cricket player.",
    "The final match of the IPL was thrilling and close.",
    "Text classification is a common NLP task."
]

labels = ["tech", "tech", "sports", "sports", "tech", "sports", "tech", "sports", "sports", "tech"]

In [3]:
# Encode labels
encoder = LabelEncoder()
y = encoder.fit_transform(labels)
y = to_categorical(y)

In [4]:
# Tokenize and convert text to sequences
tokenizer = Tokenizer(num_words=1000, lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [5]:
# Pad sequences to equal length
max_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_length, padding='post')

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Build LSTM model
model = Sequential([
    Embedding(input_dim=1000, output_dim=64, input_length=max_length),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(y.shape[1], activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [8]:
# Train model
model.fit(X_train, y_train, epochs=10, batch_size=2, verbose=1)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.1405 - loss: 0.6996   
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.8024 - loss: 0.6883
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.6690 - loss: 0.6829
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.6190 - loss: 0.6763
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.6190 - loss: 0.6667
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 1.0000 - loss: 0.6410
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 1.0000 - loss: 0.6097
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 1.0000 - loss: 0.5534
Epoch 9/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x7ac8113d7d90>

In [9]:
# Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\n🔹 Test Accuracy: {accuracy:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 431ms/step - accuracy: 1.0000 - loss: 0.5772

🔹 Test Accuracy: 1.00
