In [26]:
import json
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense
import os

In [27]:
folder_path = r"C:\Users\hp\OneDrive\Desktop\NLP_3\data"

# Construct the file paths
train_file_path = os.path.join(folder_path, "train_data.json")
test_file_path = os.path.join(folder_path, "test_data.json")

# Read the training data
with open(train_file_path, 'r', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

# Read the test data
with open(test_file_path, 'r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

In [28]:
def preprocess_data(data):
    texts = []
    labels = []
    
    for class_name, class_docs in data.items():
        for doc_name, doc_content in class_docs.items():
            texts.append(doc_content)
            labels.append(class_name)
    
    return texts, labels

# Preprocess train and test data
train_texts, train_labels = preprocess_data(train_data)
test_texts, test_labels = preprocess_data(test_data)


In [29]:
# Tokenization and Padding for training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

sequences = tokenizer.texts_to_sequences(train_texts)
max_sequence_length = max(len(seq) for seq in sequences)  # Maximum sequence length
sequences_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Label Encoding for training data
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(train_labels)
num_classes = len(label_encoder.classes_)


In [30]:
# Tokenization and Padding for test data
test_sequences = tokenizer.texts_to_sequences(test_texts)  # Use tokenizer from training data
test_sequences_padded = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Label Encoding for test data
encoded_test_labels = label_encoder.transform(test_labels)

In [34]:
# Model Architecture
embedding_dim = 100
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training
model.fit(sequences_padded, encoded_labels, batch_size=32, epochs=10, validation_split=0.1)

Epoch 1/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 949ms/step - accuracy: 0.3008 - loss: 3.0473 - val_accuracy: 0.2688 - val_loss: 4.3249
Epoch 2/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m297s[0m 927ms/step - accuracy: 0.4195 - loss: 2.2945 - val_accuracy: 0.1760 - val_loss: 4.0823
Epoch 3/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 946ms/step - accuracy: 0.5233 - loss: 1.8469 - val_accuracy: 0.2268 - val_loss: 4.6101
Epoch 4/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 1s/step - accuracy: 0.5866 - loss: 1.5829 - val_accuracy: 0.3730 - val_loss: 4.3519
Epoch 5/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m407s[0m 1s/step - accuracy: 0.6301 - loss: 1.3920 - val_accuracy: 0.4834 - val_loss: 4.3114
Epoch 6/10
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m357s[0m 1s/step - accuracy: 0.6746 - loss: 1.1878 - val_accuracy: 0.4790 - val_loss: 4.5137
Epoch 7/10
[

<keras.src.callbacks.history.History at 0x22036c1dc90>

In [35]:
from sklearn.metrics import accuracy_score, f1_score

# Predictions on test data
predictions = model.predict(test_sequences_padded)
predicted_labels = np.argmax(predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(encoded_test_labels, predicted_labels)
print(f'Accuracy: {accuracy}')

# Calculate macro-averaged F1-score
f1 = f1_score(encoded_test_labels, predicted_labels, average='macro')
print(f'Macro-averaged F1-score: {f1}')


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 504ms/step
Accuracy: 0.6486083499005965
Macro-averaged F1-score: 0.099881785919363
