### Objective: Read in Logs to retrieve Top Model Hyperparameters

### Make sure to configure the correct `run_id` before running

In [33]:
import pandas as pd
import glob
from sklearn.preprocessing import MinMaxScaler
import os

# --- CONFIGURATION ---
# Name of the grid-search run to evaluate. It selects both the input log folder
# and the output folder below, so set it before every iteration (e.g. "Run2").
run_id = "Run1"  # <--- CHANGE THIS FOR EACH ITERATION

# Root folder of the log artifacts. Set the BERTOPIC_LOG_DIR environment variable
# to run the notebook on another machine; when it is unset or empty, the
# original local path is used.
base_path = os.environ.get("BERTOPIC_LOG_DIR") or (
    "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/hpc_scripts/Classification/BERTopic/Tobias_TEST/artifact/logs"
)

# Input: every CSV log of the selected run.
log_glob_pattern = f"{base_path}/hyperparameter_grid_search/{run_id}/*.csv"

# Outputs: one folder per run, holding the merged log and the top-model table.
output_dir = f"{base_path}/best_models/{run_id}"
output_merged_csv = f"{output_dir}/bertopic_grid_log_merged.csv"
output_top_csv = f"{output_dir}/BERTopic_top_models_{run_id}.csv"

# Number of best-scoring rows to keep in the top-model table.
top_models_nr = 50

# Weights of the three terms of the final score.
score_weights = {
    "coherence": 0.4,
    "diversity": 0.4,
    "outlier_penalty": 0.2  # Applied to 1 - scaled outlier_pct (fewer outliers score higher)
}

# Create the per-run output folder up front; a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

# --- LOAD ALL LOG FILES ---
# glob returns matches in arbitrary order; sorting keeps the concatenation
# order (and with it which duplicate row is kept) the same on every run.
log_files = sorted(glob.glob(log_glob_pattern))
if not log_files:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list, so report the pattern that matched nothing instead.
    raise FileNotFoundError(f"No log files matched: {log_glob_pattern}")

# Read every log, stack them into one frame and drop exact duplicate rows.
dfs = [pd.read_csv(f) for f in log_files]
df_all = pd.concat(dfs, ignore_index=True).drop_duplicates()

# --- NORMALIZE METRICS ---
# One scaler instance is re-fitted for each metric, so every column is
# min-max scaled on its own values.
scaler = MinMaxScaler()
for metric_col, scaled_col in (
    ("coherence", "coherence_scaled"),
    ("diversity", "diversity_scaled"),
):
    df_all[scaled_col] = scaler.fit_transform(df_all[[metric_col]])

# Flip the scaled outlier share so that a lower outlier_pct gives a higher value.
df_all["outlier_penalty"] = 1 - scaler.fit_transform(df_all[["outlier_pct"]])

# --- COMPUTE FINAL SCORE ---
# Weighted sum of the two scaled metrics and the flipped outlier term.
coherence_term = score_weights["coherence"] * df_all["coherence_scaled"]
diversity_term = score_weights["diversity"] * df_all["diversity_scaled"]
outlier_term = score_weights["outlier_penalty"] * df_all["outlier_penalty"]

df_all["final_score"] = coherence_term + diversity_term + outlier_term

# --- SELECT TOP MODELS ---
# Rank by final score, best first, and keep the top_models_nr best rows.
# Several configurations share the same final_score; a stable sort keeps the
# order of such ties (and the cut-off at top_models_nr) reproducible instead of
# leaving it to the default unstable quicksort.
df_top = df_all.sort_values(
    by="final_score", ascending=False, kind="mergesort"
).head(top_models_nr)

# --- SAVE OUTPUT ---
# Write the full scored log and the top-model subset, both without the index column.
for frame, destination in ((df_all, output_merged_csv), (df_top, output_top_csv)):
    frame.to_csv(destination, index=False)

# Report where both files were written.
print(f"Merged log saved to: {output_merged_csv}")
print(f"Top {top_models_nr} models saved to: {output_top_csv}")


Merged log saved to: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/hpc_scripts/Classification/BERTopic/Tobias_TEST/artifact/logs/best_models/Run1/bertopic_grid_log_merged.csv
Top 50 models saved to: /Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/hpc_scripts/Classification/BERTopic/Tobias_TEST/artifact/logs/best_models/Run1/BERTopic_top_models_Run1.csv


In [27]:
# BEST MODELS
# Read the top-model table from the location the ranking cell writes it to.
# output_top_csv already contains the run_id sub-folder, so this cell follows
# the configured run instead of a separately hard-coded path.
csv_path = output_top_csv

df_top_models = pd.read_csv(csv_path)


In [30]:
# Show the saved top models; the row limit follows the configured
# top_models_nr rather than repeating the number here.
df_top_models.head(top_models_nr)

Unnamed: 0,embedding_model,metric,min_cluster_size,min_samples,nr_topics,umap_neighbors,umap_components,umap_min_dist,n_topics,outliers,outlier_pct,time_sec,coherence,diversity,coherence_scaled,diversity_scaled,outlier_penalty,final_score
0,all-MiniLM-L6-v2,cosine,1000,300,15,15,7,0.0,3,23247,10.18,792.35,0.5896,1.0,1.0,1.0,0.952166,0.990433
1,all-MiniLM-L6-v2,cosine,1000,100,15,15,7,0.0,3,23843,10.44,756.91,0.5896,1.0,1.0,1.0,0.948129,0.989626
2,all-MiniLM-L6-v2,cosine,750,100,15,15,7,0.0,3,23843,10.44,861.1,0.5896,1.0,1.0,1.0,0.948129,0.989626
3,all-MiniLM-L6-v2,cosine,750,200,15,15,7,0.0,3,24409,10.69,877.35,0.5896,1.0,1.0,1.0,0.944246,0.988849
4,all-MiniLM-L6-v2,cosine,1000,200,15,15,7,0.0,3,24409,10.69,742.23,0.5896,1.0,1.0,1.0,0.944246,0.988849
5,all-MiniLM-L6-v2,cosine,750,300,15,15,7,0.0,3,24548,10.75,783.25,0.5896,1.0,1.0,1.0,0.943314,0.988663
6,all-MiniLM-L6-v2,cosine,1000,50,15,20,9,0.0,3,27181,11.91,1373.88,0.5896,1.0,1.0,1.0,0.925299,0.98506
7,all-MiniLM-L6-v2,cosine,500,200,15,15,7,0.0,4,22451,9.84,903.17,0.5105,0.975,0.709084,0.953575,0.957447,0.856553
8,all-MiniLM-L6-v2,cosine,500,300,15,15,7,0.0,4,23259,10.19,909.0,0.5041,0.975,0.685546,0.953575,0.952011,0.846051
9,all-MiniLM-L6-v2,cosine,750,300,15,15,5,0.0,4,20356,8.92,787.03,0.4975,0.975,0.661273,0.953575,0.971735,0.840286
