In [1]:
import os

import papermill as pm
from logger import logger

import pandas as pd

## Generate Embeddings

In [None]:
# Select the embeddings notebook and display the parameters papermill detects
# in it. `notebook` is also read by the log message in the next cell.
notebook = "embeddings.ipynb"
pm.inspect_notebook(notebook)

In [None]:
# Model passed to the notebook as MODEL_NAME. Exactly one assignment is active;
# alternatives left here by the author: "all-MiniLM-L6-v2", "all-mpnet-base-v2".
model_name = "bge-large-en-v1.5"
# Passed to the notebook as OWNER (presumably the model's hub organisation —
# confirm against embeddings.ipynb).
owner_name = "BAAI"

# Pin the notebook in this cell so the log line and the executed file can never
# disagree, even if another section has reassigned `notebook` in the meantime.
notebook = "embeddings.ipynb"

# papermill refuses to write into a folder that does not exist, so create the
# destination up front instead of failing after the notebook has been prepared.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# One executed copy is written per pooling strategy:
#   ../artifacts/notebooks/<model_name>_<strategy>_emb.ipynb
for strategy in ["mean"]:
    logger.info(
        f"Running {notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        input_path=notebook,
        output_path=os.path.join(output_dir, f"{model_name}_{strategy}_emb.ipynb"),
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
        },
    )

## Evaluate Similarities

In [None]:
# Select the similarity notebook and display the parameters papermill detects
# in it. `notebook` is also read by the log message in the next cell.
notebook = "similarity.ipynb"
pm.inspect_notebook(notebook)

In [None]:
# NOTE: `model_name` and `owner_name` are not set here; this cell reuses the
# values assigned in the "Generate Embeddings" section, so that cell must have
# been run first.

# Pin the notebook in this cell so the log line and the executed file can never
# disagree, even if another section has reassigned `notebook` in the meantime.
notebook = "similarity.ipynb"

# papermill refuses to write into a folder that does not exist, so create the
# destination up front instead of failing after the notebook has been prepared.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# One executed copy is written per pooling strategy:
#   ../artifacts/notebooks/<model_name>_<strategy>_sim.ipynb
for strategy in ["mean"]:
    logger.info(
        f"Running {notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        input_path=notebook,
        output_path=os.path.join(output_dir, f"{model_name}_{strategy}_sim.ipynb"),
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
        },
    )

## Clustering and Graph Visualisation

In [2]:
# Select the graph-construction notebook and display the parameters papermill
# detects in it. `notebook` is read again by the loop that executes it below.
notebook = "construct_graph.ipynb"
pm.inspect_notebook(notebook)

{'EMBEDDING_MODEL': {'name': 'EMBEDDING_MODEL',
  'inferred_type_name': 'str',
  'default': '"tfidf"',
  'help': ''}}

In [3]:
# Split the multi-sheet similarity workbook into one CSV per sheet, written
# next to the workbook as <sheet_name>_mean_similarity_score.csv.
directory = "../artifacts/outputs/"

# os.path.join copes with the trailing slash in `directory`, avoiding the "//"
# that plain string concatenation would put in the path.
input_file = os.path.join(
    directory, "statistical_vector_based_embeddings_similarity_scores.xlsx"
)
# Left open on purpose: a later cell reads `excel.sheet_names`.
excel = pd.ExcelFile(input_file)

for sheet_name in excel.sheet_names:
    # Read from the already-open workbook rather than re-opening the file from
    # disk once per sheet.
    df = pd.read_excel(excel, sheet_name=sheet_name)
    output_file = os.path.join(directory, f"{sheet_name}_mean_similarity_score.csv")
    df.to_csv(output_file, index=False)

In [4]:
# Every variation to run: the fixed entries first, followed by one entry per
# sheet of the workbook held in `excel`.
variation_list = [
    "tfidf",
    "mxbai-embed-large-v1",
    "bge-large-en-v1.5-quant",
    "bge-large-en-v1.5",
    "d2v",
    *excel.sheet_names,
]

# Bare expression so the cell displays the final list.
variation_list

['tfidf',
 'mxbai-embed-large-v1',
 'bge-large-en-v1.5-quant',
 'bge-large-en-v1.5',
 'd2v',
 'bow_cosine',
 'bow_euclidean',
 'bow_dot',
 'bow_manhattan',
 'tfidf_cosine',
 'tfidf_euclidean',
 'tfidf_dot',
 'tfidf_manhattan',
 'lsa_cosine',
 'lsa_euclidean',
 'lsa_dot',
 'lsa_manhattan',
 'lda_cosine',
 'lda_euclidean',
 'lda_dot',
 'lda_manhattan']

In [5]:
# Execute `notebook` once per entry in `variation_list`, injecting the entry as
# EMBEDDING_MODEL and saving each executed copy as
#   ../artifacts/notebooks/<em_model>_construct_graph.ipynb
executed_dir = os.path.join("..", "artifacts", "notebooks")

for em_model in variation_list:
    logger.info(f"Running {notebook} for {em_model} model")
    executed_path = os.path.join(executed_dir, f"{em_model}_construct_graph.ipynb")
    pm.execute_notebook(
        input_path=notebook,
        output_path=executed_path,
        parameters={"EMBEDDING_MODEL": em_model},
    )

2024-06-20 14:08:33,655 - Running construct_graph.ipynb for tfidf model
  from .autonotebook import tqdm as notebook_tqdm
Executing: 100%|██████████| 21/21 [00:11<00:00,  1.83cell/s]
2024-06-20 14:08:45,531 - Running construct_graph.ipynb for mxbai-embed-large-v1 model
Executing: 100%|██████████| 21/21 [00:10<00:00,  1.97cell/s]
2024-06-20 14:08:56,253 - Running construct_graph.ipynb for bge-large-en-v1.5-quant model
Executing: 100%|██████████| 21/21 [00:11<00:00,  1.87cell/s]
2024-06-20 14:09:07,550 - Running construct_graph.ipynb for bge-large-en-v1.5 model
Executing: 100%|██████████| 21/21 [00:17<00:00,  1.18cell/s]
2024-06-20 14:09:25,546 - Running construct_graph.ipynb for d2v model
Executing: 100%|██████████| 21/21 [00:11<00:00,  1.77cell/s]
2024-06-20 14:09:37,466 - Running construct_graph.ipynb for bow_cosine model
Executing: 100%|██████████| 21/21 [00:18<00:00,  1.13cell/s]
2024-06-20 14:09:56,164 - Running construct_graph.ipynb for bow_euclidean model
Executing: 100%|████████

In [8]:
# Select the cluster-visualisation notebook and display the parameters
# papermill detects in it. `notebook` is read again by the loop that executes
# it below.
notebook = "clusterviz.ipynb"
pm.inspect_notebook(notebook)

{'EMBEDDING_MODEL': {'name': 'EMBEDDING_MODEL',
  'inferred_type_name': 'None',
  'default': '"tfidf"',
  'help': ''}}

In [9]:
# Execute `notebook` once per entry in `variation_list`, injecting the entry as
# EMBEDDING_MODEL and saving each executed copy as
#   ../artifacts/notebooks/<em_model>_clusterviz.ipynb
executed_dir = os.path.join("..", "artifacts", "notebooks")

for em_model in variation_list:
    logger.info(f"Running {notebook} for {em_model} model")
    executed_path = os.path.join(executed_dir, f"{em_model}_clusterviz.ipynb")
    pm.execute_notebook(
        input_path=notebook,
        output_path=executed_path,
        parameters={"EMBEDDING_MODEL": em_model},
    )

2024-06-20 14:18:17,265 - Running clusterviz.ipynb for tfidf model
Executing: 100%|██████████| 15/15 [00:02<00:00,  5.32cell/s]
2024-06-20 14:18:20,096 - Running clusterviz.ipynb for mxbai-embed-large-v1 model
Executing: 100%|██████████| 15/15 [00:02<00:00,  6.36cell/s]
2024-06-20 14:18:22,455 - Running clusterviz.ipynb for bge-large-en-v1.5-quant model
Executing: 100%|██████████| 15/15 [00:02<00:00,  5.19cell/s]
2024-06-20 14:18:25,349 - Running clusterviz.ipynb for bge-large-en-v1.5 model
Executing: 100%|██████████| 15/15 [00:02<00:00,  6.50cell/s]
2024-06-20 14:18:27,658 - Running clusterviz.ipynb for d2v model
Executing: 100%|██████████| 15/15 [00:02<00:00,  6.59cell/s]
2024-06-20 14:18:29,933 - Running clusterviz.ipynb for bow_cosine model
Executing: 100%|██████████| 15/15 [00:02<00:00,  6.29cell/s]
2024-06-20 14:18:32,320 - Running clusterviz.ipynb for bow_euclidean model
Executing: 100%|██████████| 15/15 [00:04<00:00,  3.48cell/s]
2024-06-20 14:18:36,651 - Running clusterviz.ipy