In [7]:
import os
import pickle
import random

import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [9]:
# Load the topic dataset (text + label columns) and preview it as the cell output
topic = pd.read_csv("topic_detection.csv")
topic

Unnamed: 0,text,label,Student_ID
0,"Genetics is the study of genes, genetic variat...",Science,1
1,Algebra involves symbols and the rules for man...,Mathematics,2
2,Artificial Intelligence (AI) is the intelligen...,Computer Science,3
3,The study of maps involves understanding proje...,Geography,4
4,The Internet is a global system of interconnec...,Computer Science,5
...,...,...,...
4995,The Pythagorean theorem states that in a right...,Mathematics,4996
4996,The Renaissance was a period in European histo...,History,4997
4997,"Photography is the art, application, and pract...",Art,4998
4998,The epic poem 'The Odyssey' by Homer details O...,Literature,4999


In [10]:
# Topic names -> integer class ids (the fitted encoder is kept so predictions
# can be decoded later) -> one-hot rows used as the training target.
label_encoder = LabelEncoder()
int_encode = label_encoder.fit_transform(topic["label"])
y = to_categorical(int_encode)

In [11]:
# Text-preprocessing and embedding hyperparameters shared by the cells below.

# Max number of words the tokenizer keeps, ranked by word frequency
vocab_size = 5_000
# Dimension of the dense embedding vector learned for each word
embedding_dim = 100
# Length every token sequence is padded or truncated to
maxlen = 100

In [12]:
# Build the word index from the corpus; "<unk>" is the placeholder token for
# out-of-vocabulary words.
# NOTE(review): the tokenizer is fitted on every row of `topic`, i.e. before the
# train/test split further down, so test-set text contributes to the vocabulary.
tokenizer = Tokenizer(oov_token="<unk>", num_words=vocab_size)
tokenizer.fit_on_texts(topic["text"])

In [13]:
# Convert each text into its list of integer word ids using the fitted tokenizer
sequence = tokenizer.texts_to_sequences(topic["text"])

In [14]:
# Bring every sequence to exactly `maxlen` ids: shorter ones are padded and
# longer ones are cut, both at the end ('post').
padded_sequences = pad_sequences(
    sequence,
    maxlen=maxlen,
    truncating='post',
    padding='post',
)

In [15]:
# Hold out 20% of the rows for testing; stratify on the one-hot labels so the
# split follows the class proportions, with a fixed random_state for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Bidirectional-LSTM topic classifier over padded integer token sequences.
model = Sequential([
    # Declare the input shape with an explicit Input layer. Embedding's
    # `input_length` argument is deprecated in Keras 3 (ignored with a warning),
    # which left the model without a declared input shape.
    Input(shape=(maxlen,)),
    # Learn an `embedding_dim`-wide vector for each of the `vocab_size` word ids
    Embedding(vocab_size, embedding_dim),
    # Read the sequence in both directions, 128 LSTM units per direction
    Bidirectional(LSTM(128)),
    # Randomly drop half of the features during training to limit overfitting
    Dropout(0.5),
    # One softmax probability per topic known to the label encoder
    Dense(len(label_encoder.classes_), activation='softmax'),
])



In [17]:
# Categorical cross-entropy matches the one-hot targets built with to_categorical
# and the softmax output layer.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

# Train for up to 20 epochs, reserving 10% of the training split for per-epoch validation.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1, verbose=1)

# Final score on the held-out test split (not used during fitting).
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nModel Test Accuracy: {accuracy*100:.2f}%")

None
Epoch 1/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 108ms/step - accuracy: 0.5191 - loss: 1.4332 - val_accuracy: 1.0000 - val_loss: 0.0095
Epoch 2/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 99ms/step - accuracy: 0.9999 - loss: 0.0129 - val_accuracy: 1.0000 - val_loss: 0.0111
Epoch 3/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 95ms/step - accuracy: 0.9878 - loss: 0.0658 - val_accuracy: 1.0000 - val_loss: 0.0019
Epoch 4/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 98ms/step - accuracy: 1.0000 - loss: 0.0038 - val_accuracy: 1.0000 - val_loss: 8.0281e-04
Epoch 5/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 97ms/step - accuracy: 1.0000 - loss: 0.0020 - val_accuracy: 1.0000 - val_loss: 5.1496e-04
Epoch 6/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 97ms/step - accuracy: 1.0000 - loss: 0.0013 - val_accuracy: 1.0000 - val_loss: 3.4240e-0

In [18]:
# Make predictions on the test set
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1) # Get the index of the highest probability
y_true = np.argmax(y_test, axis=1) # Get the true class indices

# Decode numerical predictions back to original labels
predicted_labels = label_encoder.inverse_transform(y_pred)
true_labels = label_encoder.inverse_transform(y_true)

# Print classification report for detailed performance metrics (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels, target_names=label_encoder.classes_))


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step

Classification Report:
                  precision    recall  f1-score   support

             Art       1.00      1.00      1.00       142
Computer Science       1.00      1.00      1.00       142
       Geography       1.00      1.00      1.00       141
         History       1.00      1.00      1.00       154
      Literature       1.00      1.00      1.00       137
     Mathematics       1.00      1.00      1.00       145
         Science       1.00      1.00      1.00       139

        accuracy                           1.00      1000
       macro avg       1.00      1.00      1.00      1000
    weighted avg       1.00      1.00      1.00      1000



In [19]:
# --- 6. Prediction on New Text ---

def predict_topic(text_input, model, tokenizer, label_encoder, maxlen):
    """
    Classify one piece of text with the trained topic model.

    Args:
        text_input: Raw text to classify.
        model: Object whose ``predict`` returns one row of class scores per input row.
        tokenizer: Object whose ``texts_to_sequences`` maps texts to integer sequences.
        label_encoder: Object whose ``inverse_transform`` maps class indices to labels.
        maxlen: Length the token sequence is padded or truncated to (at the end).

    Returns:
        A ``(label, scores)`` tuple: the decoded label of the highest-scoring
        class, and the row of class scores produced for this text.
    """
    # Tokenize as a one-element batch, then pad/truncate at the end to `maxlen`
    token_ids = tokenizer.texts_to_sequences([text_input])
    batch = pad_sequences(token_ids, maxlen=maxlen, padding='post', truncating='post')

    # One row in, one row of class scores out; take the winning column of that row
    scores = model.predict(batch)
    best_index = np.argmax(scores, axis=1)[0]

    # Map the winning index back to its original label
    label = label_encoder.inverse_transform([best_index])[0]
    return label, scores[0]


In [20]:
# Sample answer passed to predict_topic in the next cell
new_student_answer_1 = (
    "The concept of supply and demand determines market prices. "
    "When demand exceeds supply, prices tend to rise, and vice versa. "
    "This is a fundamental principle in economics."
)

In [21]:
print("\n--- Predictions on New Texts ---")

# probs1 receives the full score row; only the decoded label is reported below
topic1, probs1 = predict_topic(
    text_input=new_student_answer_1,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    maxlen=maxlen,
)

print(f"Text: '{new_student_answer_1}'")
print(f"Predicted Topic: {topic1}")


--- Predictions on New Texts ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
Text: 'The concept of supply and demand determines market prices. When demand exceeds supply, prices tend to rise, and vice versa. This is a fundamental principle in economics.'
Predicted Topic: Mathematics


In [22]:
# --- Configuration for model saving ---
MODEL_DIR = "trained_model"

# Every artifact sits directly under MODEL_DIR; the file names below are listed
# in the same order as the constants they are assigned to.
MODEL_PATH, TOKENIZER_PATH, LABEL_ENCODER_PATH, MAXLEN_PATH = (
    os.path.join(MODEL_DIR, artifact_name)
    for artifact_name in (
        "topic_classifier_model.h5",
        "tokenizer.pkl",
        "label_encoder.pkl",
        "maxlen.txt",  # To save maxlen value
    )
)