In [None]:
import os

import pandas as pd
import papermill as pm
from logger import logger

## Generate Embeddings

In [None]:
# Select the notebook for the embedding-generation stage.
notebook = "embeddings.ipynb"
# Inspect it with papermill before executing it; per papermill's documentation
# this reports the parameters the notebook declares.
pm.inspect_notebook(notebook)

In [None]:
# Model whose embeddings are generated. Other values that have been used here:
#   "all-MiniLM-L6-v2", "all-mpnet-base-v2"
model_name = "bge-large-en-v1.5"
# Passed to the notebook as OWNER; presumably the namespace the model is
# published under -- confirm against embeddings.ipynb.
owner_name = "BAAI"

# Name the notebook once inside this cell so that the log line and the executed
# notebook can never disagree, even if another stage has reassigned the shared
# `notebook` variable in the meantime.
embeddings_notebook = "embeddings.ipynb"

# papermill does not create a missing output folder, so make sure it exists
# before any run starts.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# One papermill run per pooling strategy; each run is saved as its own
# executed copy of the notebook.
for strategy in ["mean"]:
    logger.info(
        f"Running {embeddings_notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        input_path=embeddings_notebook,
        output_path=os.path.join(output_dir, f"{model_name}_{strategy}_emb.ipynb"),
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
        },
    )

## Evaluate Similarities

In [None]:
# Select the notebook for the similarity-evaluation stage.
notebook = "similarity.ipynb"
# Inspect it with papermill before executing it; per papermill's documentation
# this reports the parameters the notebook declares.
pm.inspect_notebook(notebook)

In [None]:
# Embedding method forwarded to the notebook as METHOD.
method = "contextual_nn_based_embeddings"
# Model to evaluate. Other value that has been used here: "all-mpnet-base-v2"
model_name = "all-MiniLM-L6-v2"
# Passed to the notebook as OWNER; presumably the namespace the model is
# published under -- confirm against similarity.ipynb.
owner_name = "sentence-transformers"

# Name the notebook once inside this cell so that the log line and the executed
# notebook can never disagree, even if another stage has reassigned the shared
# `notebook` variable in the meantime.
similarity_notebook = "similarity.ipynb"

# papermill does not create a missing output folder, so make sure it exists
# before any run starts.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# One papermill run per pooling strategy; each run is saved as its own
# executed copy of the notebook.
for strategy in ["mean"]:
    logger.info(
        f"Running {similarity_notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        input_path=similarity_notebook,
        output_path=os.path.join(output_dir, f"{model_name}_{strategy}_sim.ipynb"),
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
            "METHOD": method,
        },
    )

## Clustering and Graph Visualisation

In [None]:
# Select the graph-construction notebook; a later cell in this section passes
# `notebook` to papermill as the input path.
notebook = "construct_graph.ipynb"
# Inspect it with papermill before executing it; per papermill's documentation
# this reports the parameters the notebook declares.
pm.inspect_notebook(notebook)

In [None]:
# Start from an empty list of embedding-model variations.
# NOTE(review): the next cell assigns `variation_list` again, so this
# initialisation looks redundant -- confirm before removing.
variation_list = []

In [None]:
# Manually list models whose embeddings do not come from the statistical,
# contextual or SBERT embedding methods (e.g. doc2vec, GloVe). Uncomment the
# entries that apply; the sheet names found in the similarity workbooks are
# added to this list by the following cell.
variation_list = [
    # "d2v",
    # "glove"
]

In [None]:
# Embedding methods for which a "<method>_similarity_score.xlsx" workbook may
# exist in `directory`. Every sheet of every workbook found is exported to
# "<sheet>_similarity_score.csv", and the sheet names become model variations.
methods_list = [
    "sbert_embeddings",
    "statistical_vector_based_embeddings",
    "contextual_nn_based_embeddings",
]
directory = "../artifacts/outputs/"

discovered_sheets = []
for method in methods_list:
    # os.path.join avoids the doubled "/" that plain string formatting produced
    # with the trailing slash in `directory`.
    input_file = os.path.join(directory, f"{method}_similarity_score.xlsx")
    if not os.path.isfile(input_file):
        # No workbook for this method; skip it.
        continue

    # Open the workbook once, reuse the handle for every sheet, and close it
    # deterministically when done.
    with pd.ExcelFile(input_file) as excel:
        sheet_names = list(excel.sheet_names)
        print(f"{method}: {sheet_names}")
        for sheet_name in sheet_names:
            df = pd.read_excel(excel, sheet_name=sheet_name)
            # NOTE(review): two workbooks sharing a sheet name write to the same
            # CSV, so the later one wins (unchanged from the original behaviour).
            output_file = os.path.join(directory, f"{sheet_name}_similarity_score.csv")
            df.to_csv(output_file, index=False)
    discovered_sheets.extend(sheet_names)

# Merge without duplicates (order preserved) instead of extending in place, so
# re-running this cell does not queue the same model twice for the downstream
# notebooks.
variation_list = list(dict.fromkeys([*variation_list, *discovered_sheets]))

variation_list

In [None]:
# papermill does not create a missing output folder, so make sure it exists
# before any run starts.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# Execute the currently selected `notebook` once per entry in `variation_list`,
# passing the entry as EMBEDDING_MODEL and saving each executed copy under its
# own file name.
for em_model in variation_list:
    logger.info(f"Running {notebook} for {em_model} model")
    pm.execute_notebook(
        input_path=notebook,
        output_path=os.path.join(output_dir, f"{em_model}_construct_graph.ipynb"),
        parameters={"EMBEDDING_MODEL": em_model},
    )

In [None]:
# Select the cluster-visualisation notebook; the following cell passes
# `notebook` to papermill as the input path.
notebook = "clusterviz.ipynb"
# Inspect it with papermill before executing it; per papermill's documentation
# this reports the parameters the notebook declares.
pm.inspect_notebook(notebook)

In [None]:
# papermill does not create a missing output folder, so make sure it exists
# before any run starts.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# Execute the currently selected `notebook` once per entry in `variation_list`,
# passing the entry as EMBEDDING_MODEL and saving each executed copy under its
# own file name.
for em_model in variation_list:
    logger.info(f"Running {notebook} for {em_model} model")
    pm.execute_notebook(
        input_path=notebook,
        output_path=os.path.join(output_dir, f"{em_model}_clusterviz.ipynb"),
        parameters={"EMBEDDING_MODEL": em_model},
    )