In [26]:
import numpy as np
import os
import sys
sys.path.append(os.path.abspath(r'D:\Guvi_Project\Personalized Learning Assistant\src'))
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from Model_Training import pickle_dump
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import warnings
warnings.filterwarnings("ignore")

In [27]:
# Load the AG News dataset from the Hugging Face hub.
dataset = load_dataset("ag_news")
# Read each field as a whole column rather than iterating example-by-example:
# row iteration builds a dict for every one of the training rows (and the
# original did so twice), whereas column access pulls the field directly.
# `list(...)` keeps `texts` / `labels` as plain Python lists for the cells below.
train_split = dataset['train']
texts = list(train_split['text'])
labels = list(train_split['label'])

In [28]:
# Keep a local copy of every downloaded split so the raw data can be reloaded
# from disk later.
topic_dataset_dir = r'D:\Guvi_Project\Personalized Learning Assistant\data\raw\Topic_dataset'
dataset.save_to_disk(topic_dataset_dir)

Saving the dataset (0/1 shards):   0%|          | 0/120000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7600 [00:00<?, ? examples/s]

In [29]:
# Show the first training text as a quick sanity check of what was loaded.
texts[0]

"Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [30]:
# Show the integer class id attached to the first training text.
labels[0]

2

In [31]:
# Fit a label encoder on the training labels, then map every label to its
# encoded class index.
le = LabelEncoder()
le.fit(labels)
labels_encoded = le.transform(labels)

# Persist the fitted encoder so predictions can be decoded outside this notebook.
pickle_dump(le, name='LabelEncoder-sec5')

LabelEncoder-sec5 Model saved successfully.


In [32]:
# Build the vocabulary from the training texts; only the most frequent words
# (bounded by num_words) are kept, everything else maps to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(texts)

# Convert each text to a sequence of word ids, then force a fixed length of 100.
# 'pre' padding/truncating are the Keras defaults, spelled out here for clarity.
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(
    sequences,
    maxlen=100,
    padding='pre',
    truncating='pre',
)

# Persist the fitted tokenizer so inference can reuse the same vocabulary.
pickle_dump(tokenizer, name='Tokenizer-sec5')

Tokenizer-sec5 Model saved successfully.


In [33]:
# Split data into 80% train / 20% held-out.
# random_state pins the shuffle so the split (and therefore the reported
# validation metrics) is reproducible across kernel restarts; stratify keeps
# the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels_encoded,
    test_size=0.2,
    random_state=42,
    stratify=labels_encoded,
)

In [34]:
# Download the GloVe 6B archive and unpack it into ./glove_data.
# NOTE(review): `wget` and `unzip` are external shell tools. The recorded output
# of this cell shows that neither is recognized in this environment, so nothing
# was downloaded or extracted by this run.
# NOTE(review): the archive would be unpacked into ./glove_data, while the
# loading cell reads glove.6B.100d.txt from the project's data\raw folder —
# confirm which location is intended.
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove_data

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [35]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
# Each line is whitespace-separated: the first field is the word, the
# remaining fields are the vector components.
glove_file_path = r"D:\Guvi_Project\Personalized Learning Assistant\data\raw\glove.6B.100d.txt"
with open(glove_file_path, encoding='utf8') as glove_file:
    embedding_index = {
        fields[0]: np.asarray(fields[1:], dtype='float32')
        for fields in map(str.split, glove_file)
    }

In [36]:
# Build the (10000, embedding_dim) matrix used to initialise the Embedding layer.
# Row i holds the GloVe vector of the word whose tokenizer id is i; rows for
# words without a GloVe vector stay all-zero.
embedding_dim = 100
word_index = tokenizer.word_index
embedding_matrix = np.zeros((10000, embedding_dim))
for token, row in word_index.items():
    # Ids at or beyond the matrix height have no row to fill.
    if row >= 10000:
        continue
    glove_vector = embedding_index.get(token)
    if glove_vector is not None:
        embedding_matrix[row] = glove_vector

In [37]:
# Build model with GloVe embedding.
# The Embedding layer is initialised from `embedding_matrix` and frozen
# (trainable=False), so only the LSTM and Dense layers are learned.
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and
# the sequence length is inferred from the data the model is first called on.
model = Sequential([
    Embedding(
        input_dim=10000,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    ),
    LSTM(64, return_sequences=False),  # keep only the final hidden state
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(4, activation='softmax'),  # AG News has 4 labels --> 0: World, 1: Sports, 2: Business, 3: Sci/Tech.
])

In [38]:
# Labels are integer class ids (not one-hot), hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [39]:
# Fit for three epochs, reporting loss/accuracy on the held-out split after
# each one. The call is left as the cell's last expression so the returned
# History object is displayed.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_data=(X_test, y_test),
)

Epoch 1/3
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 68ms/step - accuracy: 0.7878 - loss: 0.5652 - val_accuracy: 0.8941 - val_loss: 0.3121
Epoch 2/3
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 73ms/step - accuracy: 0.8988 - loss: 0.3067 - val_accuracy: 0.9059 - val_loss: 0.2746
Epoch 3/3
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 80ms/step - accuracy: 0.9033 - loss: 0.2840 - val_accuracy: 0.9054 - val_loss: 0.2723


<keras.src.callbacks.history.History at 0x20360619eb0>

In [40]:
# Print the layer-by-layer structure of the trained model.
model.summary()

In [41]:
# Write the trained model to a single HDF5 (.h5) file.
lstm_model_path = r'D:\Guvi_Project\Personalized Learning Assistant\models\llm_models\ag_news_lstm_model.h5'
model.save(lstm_model_path)



In [42]:
# Predict on new sample
sample_text = ["Apple released a new iPhone with advanced camera features"]
# Apply the same tokenization and fixed length (100) used for the training data.
sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=100)
pred = model.predict(sample_pad)
# `pred` has one row of class probabilities per input text. Take the argmax
# along axis 1 so each row yields its own class id; a bare np.argmax(pred)
# flattens the array and is only correct when exactly one text is passed.
predicted_label = le.inverse_transform(np.argmax(pred, axis=1))

print("Predicted topic:", predicted_label[0])
# AG News has 4 labels --> 0: World, 1: Sports, 2: Business, 3: Sci/Tech.

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step
Predicted topic: 3
