In [1]:
from data_loader import load_data, preprocess_data
from glove_loader import load_glove_embeddings
from train import train_evaluate_model
import os

In [2]:
# --- Configuration: input locations and output directory ---
# CSV passed to load_data below
data_path = '../../core-tech/due_dilligence_data.csv'
# GloVe vectors file; consumed by load_glove_embeddings in a later cell
glove_embbeding_path = '../../core-tech/glove.840B.300d.txt'
# Directory handed to train_evaluate_model as the model save location
model_save_dir = 'topic_models_LSTM_imbalanced'
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test. It still raises
# if the path exists but is not a directory, surfacing that problem early.
os.makedirs(model_save_dir, exist_ok=True)

# Load the dataset; later cells index the result by the 'sentence' and
# 'topic_id' columns.
df = load_data(data_path)

In [3]:
# Load the GloVe embeddings from the configured file path. The result is passed
# to both preprocess_data and train_evaluate_model in the training cell.
# NOTE(review): the returned structure is defined in glove_loader — presumably a
# word -> vector mapping; confirm there.
glove_embeddings = load_glove_embeddings(glove_embbeding_path)

In [4]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Record the per-row token count of 'sentence' in a new 'word_count' column.
df['word_count'] = df['sentence'].apply(_count_words)

In [5]:
# Full list of topic IDs, kept as strings. Most entries carry leading/trailing
# spaces (the first one does not); the training cell strips whitespace before
# converting each entry to int, so the padding is harmless.
# NOTE: the next cell rebinds unique_topics to a single topic, so this full list
# is only used if that override is removed or skipped.
unique_topics = ['1272', ' 1474 ', ' 1238 ', ' 1275 ', ' 1239 ', ' 1520 ', ' 1509 ', ' 1240 ', ' 1308 ', ' 1319 ',
             ' 1439 ', ' 1267 ', ' 1242 ', ' 1462 ', ' 1265 ', ' 1444 ', ' 1312 ', ' 1244 ', ' 1243 ', ' 1468 ',
             ' 1309 ', ' 1524 ', ' 1247 ', ' 1440 ', ' 1251 ', ' 1249 ', ' 1248 ', ' 1262 ', ' 1250 ', ' 1252 ',
             ' 1245 ', ' 1512 ', ' 1498 ', ' 1601 ', ' 1443 ', ' 1086 ', ' 1551 ', ' 1253 ', ' 1320 ', ' 1304 ',
             ' 1469 ', ' 1611 ', ' 1300 ', ' 1489 ', ' 1500 ', ' 1261 ', ' 1318 ', ' 1460 ', ' 1475 ', ' 1321 ']

In [6]:
# Restrict the run to a single topic. This rebinds unique_topics and therefore
# replaces the full topic list defined in the previous cell.
unique_topics = ['1524']

In [7]:
# Train and evaluate one model per topic, then report metrics averaged over
# all topics.

# Convert topic IDs to integers, trimming surrounding whitespace. Going through
# str() keeps this cell re-runnable: after the first execution unique_topics
# already holds ints, and int has no .strip() method.
unique_topics = [int(str(topic).strip()) for topic in unique_topics]

# Number of topics (denominator for the averages below)
num_topics = len(unique_topics)
if num_topics == 0:
    # Fail early with a clear message instead of a ZeroDivisionError after the loop.
    raise ValueError("unique_topics is empty; nothing to train or average")

# Running totals of the per-topic metrics
sum_precision = 0
sum_recall = 0
sum_f1_score = 0

# Train and evaluate models for each topic
for topic_id in unique_topics:
    print(topic_id)
    # Rows belonging to this topic.
    # NOTE(review): assumes df['topic_id'] holds integers comparable to topic_id — confirm.
    data = df[df['topic_id'] == topic_id]

    print("Preprocessing")
    X, df_filtered, max_length, tokenizer_obj, vocab_size = preprocess_data(data, glove_embeddings)
    print("Model Loop")
    precision, recall, f1_score_1 = train_evaluate_model(
        X, df_filtered, max_length, topic_id, model_save_dir,
        tokenizer_obj, vocab_size, glove_embeddings,
    )
    sum_precision += precision
    sum_recall += recall
    sum_f1_score += f1_score_1


# Calculate average metrics across all topics (unweighted mean over topics)
avg_precision = sum_precision / num_topics
avg_recall = sum_recall / num_topics
avg_f1_score = sum_f1_score / num_topics

print("Average Metrics Across All Topics:")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-Score: {avg_f1_score:.4f}")

1524
Preprocessing
vocab_size: 16711
max_length: 443
Model Loop
Processing data for topic id: 1524
Model training
2210/2210 - 730s - loss: 0.0272 - accuracy: 0.9948 - val_loss: 0.0143 - val_accuracy: 0.9968 - 730s/epoch - 330ms/step
Model prediction
Topic: 1524
Precision: 0.8571, Recall: 0.5126, F1-Score: 0.6415
118
198
{'TP': 101, 'FP': 17, 'FN': 97, 'Recall': 0.5101010075247424, 'Precision': 0.8559321961361679, 'F1-Score': 0.6392400343297767}
Average Metrics Across All Topics:
Average Precision: 0.8571
Average Recall: 0.5126
Average F1-Score: 0.6415
