In [9]:
import os
import papermill as pm
from logger import logger

## Generate Embeddings

In [10]:
# Path of the notebook whose papermill parameters are inspected in this cell.
# NOTE: `notebook` is a kernel-global name that is reassigned again in the
# "Evaluate Similarities" section, so its value depends on which cell ran last.
notebook = "embeddings.ipynb"
# Left as the last expression so the returned parameter description is
# rendered as the cell output.
pm.inspect_notebook(notebook)

{'CONTRIBUTOR': {'name': 'CONTRIBUTOR',
  'inferred_type_name': 'str',
  'default': '"Health Promotion Board"',
  'help': ''},
 'CATEGORY': {'name': 'CATEGORY',
  'inferred_type_name': 'str',
  'default': '"live-healthy"',
  'help': ''},
 'MODEL_NAME': {'name': 'MODEL_NAME',
  'inferred_type_name': 'str',
  'default': '"all-MiniLM-L6-v2"',
  'help': ''},
 'POOLING_STRATEGY': {'name': 'POOLING_STRATEGY',
  'inferred_type_name': 'str',
  'default': '"max"',
  'help': ''},
 'OWNER': {'name': 'OWNER',
  'inferred_type_name': 'str',
  'default': '"sentence-transformers"',
  'help': ''}}

In [11]:
# Model configuration for this run. `model_name` and `owner_name` are also read
# by the similarity cell further down, so changing them here affects both runs.
# Alternative models (uncomment one to switch; `owner_name` may need to change
# with it -- TODO confirm):
# model_name = "all-MiniLM-L6-v2"
# model_name = "all-mpnet-base-v2"
model_name = "bge-large-en-v1.5"
owner_name = "BAAI"

# Single source of truth for the notebook being executed. The shared `notebook`
# variable is rebound to "similarity.ipynb" later on, so relying on it here
# would make the log message name the wrong notebook when this cell is re-run
# after the similarity section.
embeddings_notebook = "embeddings.ipynb"

# Executed copies of the notebook are written to this folder. papermill does
# not create a missing output folder, so make sure it exists up front.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

for strategy in ["mean"]:
    logger.info(
        f"Running {embeddings_notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        input_path=embeddings_notebook,
        # One output notebook per (model, pooling strategy) combination.
        output_path=os.path.join(output_dir, f"{model_name}_{strategy}_emb.ipynb"),
        # Keys match the parameter names reported by pm.inspect_notebook;
        # parameters not listed here keep the notebook's own defaults.
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
        },
    )

2024-06-13 10:04:31,041 - Running embeddings.ipynb for bge-large-en-v1.5 model with mean pooling strategy
Executing: 100%|██████████| 18/18 [04:40<00:00, 15.60s/cell]


## Evaluate Similarities

In [14]:
# Path of the notebook whose papermill parameters are inspected in this cell.
# NOTE: this rebinds the kernel-global `notebook`, which the "Generate
# Embeddings" section set to "embeddings.ipynb".
notebook = "similarity.ipynb"
# Left as the last expression so the returned parameter description is
# rendered as the cell output.
pm.inspect_notebook(notebook)

{'MODEL_NAME': {'name': 'MODEL_NAME',
  'inferred_type_name': 'str',
  'default': '"all-MiniLM-L6-v2"',
  'help': ''},
 'POOLING_STRATEGY': {'name': 'POOLING_STRATEGY',
  'inferred_type_name': 'str',
  'default': '"max"',
  'help': ''},
 'OWNER': {'name': 'OWNER',
  'inferred_type_name': 'str',
  'default': '"sentence-transformers"',
  'help': ''}}

In [15]:
# `model_name` and `owner_name` are NOT set in this cell: they are reused from
# the run cell in the "Generate Embeddings" section, so that cell must have
# been executed first (a fresh Restart & Run All satisfies this).

# Single source of truth for the notebook being executed, used for both the
# log message and the papermill call so the two can never disagree.
similarity_notebook = "similarity.ipynb"

# Executed copies of the notebook are written to this folder. papermill does
# not create a missing output folder, so make sure it exists up front.
output_dir = os.path.join("..", "artifacts", "notebooks")
os.makedirs(output_dir, exist_ok=True)

for strategy in ["mean"]:
    logger.info(
        f"Running {similarity_notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        input_path=similarity_notebook,
        # One output notebook per (model, pooling strategy) combination.
        output_path=os.path.join(output_dir, f"{model_name}_{strategy}_sim.ipynb"),
        # Keys match the parameter names reported by pm.inspect_notebook.
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
        },
    )

2024-06-13 10:13:10,835 - Running similarity.ipynb for bge-large-en-v1.5 model with mean pooling strategy
Executing: 100%|██████████| 19/19 [00:15<00:00,  1.20cell/s]
