In [4]:
import os
import papermill as pm
from logger import logger

## Generate Embeddings

In [5]:
# Select the embeddings notebook and display the parameters papermill detects
# in it (names, inferred types and defaults); the result is the cell output.
# `notebook` is also read by the cell that executes the notebook below.
notebook = "embeddings.ipynb"
pm.inspect_notebook(notebook)

{'CONTRIBUTOR': {'name': 'CONTRIBUTOR',
  'inferred_type_name': 'str',
  'default': '"Health Promotion Board"',
  'help': ''},
 'CATEGORY': {'name': 'CATEGORY',
  'inferred_type_name': 'str',
  'default': '"live-healthy"',
  'help': ''},
 'MODEL_NAME': {'name': 'MODEL_NAME',
  'inferred_type_name': 'str',
  'default': '"all-MiniLM-L6-v2"',
  'help': ''},
 'POOLING_STRATEGY': {'name': 'POOLING_STRATEGY',
  'inferred_type_name': 'str',
  'default': '"max"',
  'help': ''},
 'OWNER': {'name': 'OWNER',
  'inferred_type_name': 'str',
  'default': '"sentence-transformers"',
  'help': ''}}

In [None]:
# Model to generate embeddings for. `model_name` and `owner_name` are injected
# into the notebook as its MODEL_NAME and OWNER parameters.
# NOTE(review): presumably `owner_name` must be changed together with
# `model_name` when switching models -- confirm.
# Other candidate models (currently disabled):
# model_name = "all-MiniLM-L6-v2"
# model_name = "all-mpnet-base-v2"
model_name = "bge-large-en-v1.5"
owner_name = "BAAI"

# papermill refuses to write the executed notebook when the output folder is
# missing, so create it up front to keep the cell runnable on a fresh checkout.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# Execute the notebook once per pooling strategy, saving each executed copy
# under a name that encodes the model and the strategy.
for strategy in ["mean"]:
    logger.info(
        f"Running {notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        # Use the same `notebook` variable the log line reports, so the message
        # and the notebook that actually runs cannot drift apart.
        input_path=notebook,
        output_path=os.path.join(output_dir, f"{model_name}_{strategy}_emb.ipynb"),
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
        },
    )

## Evaluate Similarities

In [None]:
# Switch `notebook` to the similarity notebook and display the parameters
# papermill detects in it. The execution cell below reads `notebook` for its
# log message.
notebook = "similarity.ipynb"
pm.inspect_notebook(notebook)

In [None]:
# `model_name` and `owner_name` are not set here: they are reused from the
# embeddings cell above, so that cell must have been run first. Change the
# model there rather than in this cell.

# papermill refuses to write the executed notebook when the output folder is
# missing, so create it up front to keep the cell runnable on a fresh checkout.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# Execute the notebook once per pooling strategy, saving each executed copy
# under a name that encodes the model and the strategy.
for strategy in ["mean"]:
    logger.info(
        f"Running {notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        # Use the same `notebook` variable the log line reports, so the message
        # and the notebook that actually runs cannot drift apart.
        input_path=notebook,
        output_path=os.path.join(output_dir, f"{model_name}_{strategy}_sim.ipynb"),
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
        },
    )

## Clustering and Graph Visualisation

In [27]:
# Switch `notebook` to the graph-construction notebook and display the
# parameters papermill detects in it. The execution cell below passes
# `notebook` as the input path.
notebook = "construct_graph.ipynb"
pm.inspect_notebook(notebook)

{'EMBEDDING_MODEL': {'name': 'EMBEDDING_MODEL',
  'inferred_type_name': 'str',
  'default': "'tfidf'",
  'help': ''}}

In [28]:
# papermill refuses to write the executed notebook when the output folder is
# missing, so create it up front to keep the cell runnable on a fresh checkout.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# Execute the notebook currently held in `notebook` (set by the inspect cell
# above) once per embedding model, passing the model name as the
# EMBEDDING_MODEL parameter and saving each executed copy under its own name.
for em_model in [
    "tfidf",
    "mxbai-embed-large-v1",
    "bge-large-en-v1.5-quant",
    "bge-large-en-v1.5",
]:
    logger.info(f"Running {notebook} for {em_model} model")
    pm.execute_notebook(
        input_path=notebook,
        output_path=os.path.join(output_dir, f"{em_model}_construct_graph.ipynb"),
        parameters={"EMBEDDING_MODEL": em_model},
    )

2024-06-14 11:27:04,485 - Running construct_graph.ipynb for tfidf model
Executing: 100%|██████████| 18/18 [00:09<00:00,  1.84cell/s]
2024-06-14 11:27:14,303 - Running construct_graph.ipynb for mxbai-embed-large-v1 model
Executing: 100%|██████████| 18/18 [00:09<00:00,  1.91cell/s]
2024-06-14 11:27:23,786 - Running construct_graph.ipynb for bge-large-en-v1.5-quant model
Executing: 100%|██████████| 18/18 [00:12<00:00,  1.47cell/s]
2024-06-14 11:27:36,128 - Running construct_graph.ipynb for bge-large-en-v1.5 model
Executing: 100%|██████████| 18/18 [00:12<00:00,  1.41cell/s]


In [29]:
# Switch `notebook` to the cluster-visualisation notebook and display the
# parameters papermill detects in it. The execution cell below passes
# `notebook` as the input path.
notebook = "clusterviz.ipynb"
pm.inspect_notebook(notebook)

{'EMBEDDING_MODEL': {'name': 'EMBEDDING_MODEL',
  'inferred_type_name': 'None',
  'default': "'tfidf'",
  'help': ''}}

In [31]:
# papermill refuses to write the executed notebook when the output folder is
# missing, so create it up front to keep the cell runnable on a fresh checkout.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

# Execute the notebook currently held in `notebook` (set by the inspect cell
# above) once per embedding model, passing the model name as the
# EMBEDDING_MODEL parameter and saving each executed copy under its own name.
for em_model in [
    "tfidf",
    "mxbai-embed-large-v1",
    "bge-large-en-v1.5-quant",
    "bge-large-en-v1.5",
]:
    logger.info(f"Running {notebook} for {em_model} model")
    pm.execute_notebook(
        input_path=notebook,
        output_path=os.path.join(output_dir, f"{em_model}_clusterviz.ipynb"),
        parameters={"EMBEDDING_MODEL": em_model},
    )

2024-06-14 11:28:58,876 - Running clusterviz.ipynb for tfidf model
Executing: 100%|██████████| 15/15 [00:03<00:00,  4.76cell/s]
2024-06-14 11:29:02,047 - Running clusterviz.ipynb for mxbai-embed-large-v1 model
Executing: 100%|██████████| 15/15 [00:02<00:00,  5.70cell/s]
2024-06-14 11:29:04,681 - Running clusterviz.ipynb for bge-large-en-v1.5-quant model
Executing: 100%|██████████| 15/15 [00:03<00:00,  4.38cell/s]
2024-06-14 11:29:08,132 - Running clusterviz.ipynb for bge-large-en-v1.5 model
Executing: 100%|██████████| 15/15 [00:02<00:00,  5.89cell/s]
