In [4]:
import os

import pandas as pd
import papermill as pm
from logger import logger

## Generate Embeddings

In [2]:
notebook = "embeddings.ipynb"
pm.inspect_notebook(notebook)

{'CONTRIBUTOR': {'name': 'CONTRIBUTOR',
  'inferred_type_name': 'str',
  'default': '"Health Promotion Board"',
  'help': ''},
 'CATEGORY': {'name': 'CATEGORY',
  'inferred_type_name': 'str',
  'default': '"live-healthy"',
  'help': ''},
 'MODEL_NAME': {'name': 'MODEL_NAME',
  'inferred_type_name': 'str',
  'default': '"all-MiniLM-L6-v2"',
  'help': ''},
 'POOLING_STRATEGY': {'name': 'POOLING_STRATEGY',
  'inferred_type_name': 'str',
  'default': '"max"',
  'help': ''},
 'OWNER': {'name': 'OWNER',
  'inferred_type_name': 'str',
  'default': '"sentence-transformers"',
  'help': ''}}

In [3]:
# model_name = "all-MiniLM-L6-v2"
# model_name = "all-mpnet-base-v2"
model_name = "bge-large-en-v1.5"
owner_name = "BAAI"

for strategy in ["mean"]:
    logger.info(
        f"Running {notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        input_path="embeddings.ipynb",
        output_path=os.path.join(
            "..",
            "artifacts",
            "notebooks",
            f"{model_name}_{strategy}_emb.ipynb",
        ),
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
        },
    )

2024-06-20 18:31:08,858 - Running embeddings.ipynb for bge-large-en-v1.5 model with mean pooling strategy
  from .autonotebook import tqdm as notebook_tqdm
Executing: 100%|██████████| 18/18 [04:42<00:00, 15.69s/cell]


## Evaluate Similarities

In [4]:
notebook = "similarity.ipynb"
pm.inspect_notebook(notebook)

{'MODEL_NAME': {'name': 'MODEL_NAME',
  'inferred_type_name': 'str',
  'default': '"all-MiniLM-L6-v2"',
  'help': ''},
 'POOLING_STRATEGY': {'name': 'POOLING_STRATEGY',
  'inferred_type_name': 'str',
  'default': '"max"',
  'help': ''},
 'OWNER': {'name': 'OWNER',
  'inferred_type_name': 'str',
  'default': '"sentence-transformers"',
  'help': ''},
 'METHOD': {'name': 'METHOD',
  'inferred_type_name': 'str',
  'default': '"contextual_nn_based_embeddings"',
  'help': ''}}

In [9]:
method = "contextual_nn_based_embeddings"
model_name = "all-MiniLM-L6-v2"
# model_name = "all-mpnet-base-v2"
owner_name = "sentence-transformers"

for strategy in ["mean"]:
    logger.info(
        f"Running {notebook} for {model_name} model with {strategy} pooling strategy"
    )
    pm.execute_notebook(
        input_path="similarity.ipynb",
        output_path=os.path.join(
            "..",
            "artifacts",
            "notebooks",
            f"{model_name}_{strategy}_sim.ipynb",
        ),
        parameters={
            "MODEL_NAME": model_name,
            "POOLING_STRATEGY": strategy,
            "OWNER": owner_name,
            "METHOD": method,
        },
    )

2024-06-20 18:47:39,995 - Running similarity.ipynb for all-MiniLM-L6-v2 model with mean pooling strategy
Executing: 100%|██████████| 18/18 [00:13<00:00,  1.38cell/s]


### Clustering and Graph Visualisation

In [10]:
notebook = "construct_graph.ipynb"
pm.inspect_notebook(notebook)

{'EMBEDDING_MODEL': {'name': 'EMBEDDING_MODEL',
  'inferred_type_name': 'str',
  'default': '"tfidf"',
  'help': ''}}

In [14]:
variation_list = []

In [None]:
# Include models manually if it doesn't fall under statistical, contextual or sbert embedding method, such as doc2vec, glove embeddings
variation_list = [
    # "d2v",
    # "glove"
]

In [16]:
methods_list = ["test", "sbert_embeddings", "statistical_vector_based_embeddings", "contextual_nn_based_embeddings"]
directory = "../artifacts/outputs/"

for method in methods_list:
    input_file = f"{directory}/{method}_similarity_score.xlsx"
    if os.path.isfile(input_file):
        excel = pd.ExcelFile(input_file)
        print(f"{method}: {excel.sheet_names}")
        for sheet_name in excel.sheet_names:
            df = pd.read_excel(input_file, sheet_name=sheet_name)
            # output_file = f"{directory}/{sheet_name}_similarity_score.csv"
            # df.to_csv(output_file, index=False)
        variation_list.extend(excel.sheet_names)

variation_list

sbert_embeddings: ['stsb-mpnet-base-v2_mean', 'all-mpnet-base-v2_mean']
statistical_vector_based_embeddings: ['bow_cosine', 'bow_euclidean', 'bow_dot', 'bow_manhattan', 'tfidf_cosine', 'tfidf_euclidean', 'tfidf_dot', 'tfidf_manhattan', 'lsa_cosine', 'lsa_euclidean', 'lsa_dot', 'lsa_manhattan', 'lda_cosine', 'lda_euclidean', 'lda_dot', 'lda_manhattan']
contextual_nn_based_embeddings: ['bge-large-en-v1.5_mean', 'all-MiniLM-L6-v2_mean']


['stsb-mpnet-base-v2_mean',
 'all-mpnet-base-v2_mean',
 'bow_cosine',
 'bow_euclidean',
 'bow_dot',
 'bow_manhattan',
 'tfidf_cosine',
 'tfidf_euclidean',
 'tfidf_dot',
 'tfidf_manhattan',
 'lsa_cosine',
 'lsa_euclidean',
 'lsa_dot',
 'lsa_manhattan',
 'lda_cosine',
 'lda_euclidean',
 'lda_dot',
 'lda_manhattan',
 'bge-large-en-v1.5_mean',
 'all-MiniLM-L6-v2_mean',
 'stsb-mpnet-base-v2_mean',
 'all-mpnet-base-v2_mean',
 'bow_cosine',
 'bow_euclidean',
 'bow_dot',
 'bow_manhattan',
 'tfidf_cosine',
 'tfidf_euclidean',
 'tfidf_dot',
 'tfidf_manhattan',
 'lsa_cosine',
 'lsa_euclidean',
 'lsa_dot',
 'lsa_manhattan',
 'lda_cosine',
 'lda_euclidean',
 'lda_dot',
 'lda_manhattan',
 'bge-large-en-v1.5_mean',
 'all-MiniLM-L6-v2_mean']

In [14]:
for em_model in variation_list:
    logger.info(f"Running {notebook} for {em_model} model")
    pm.execute_notebook(
        input_path=notebook,
        output_path=os.path.join(
            "..",
            "artifacts",
            "notebooks",
            f"{em_model}_construct_graph.ipynb",
        ),
        parameters={"EMBEDDING_MODEL": em_model},
    )

2024-06-20 19:00:52,746 - Running construct_graph.ipynb for bge-large-en-v1.5_mean model
Executing: 100%|██████████| 21/21 [00:08<00:00,  2.40cell/s]
2024-06-20 19:01:01,536 - Running construct_graph.ipynb for all-MiniLM-L6-v2_mean model
Executing: 100%|██████████| 21/21 [00:08<00:00,  2.60cell/s]


In [15]:
notebook = "clusterviz.ipynb"
pm.inspect_notebook(notebook)

{'EMBEDDING_MODEL': {'name': 'EMBEDDING_MODEL',
  'inferred_type_name': 'None',
  'default': '"tfidf"',
  'help': ''}}

In [16]:
for em_model in variation_list:
    logger.info(f"Running {notebook} for {em_model} model")
    pm.execute_notebook(
        input_path=notebook,
        output_path=os.path.join(
            "..",
            "artifacts",
            "notebooks",
            f"{em_model}_clusterviz.ipynb",
        ),
        parameters={"EMBEDDING_MODEL": em_model},
    )

2024-06-20 19:01:20,175 - Running clusterviz.ipynb for bge-large-en-v1.5_mean model
Executing: 100%|██████████| 15/15 [00:01<00:00,  9.43cell/s]
2024-06-20 19:01:21,773 - Running clusterviz.ipynb for all-MiniLM-L6-v2_mean model
Executing: 100%|██████████| 15/15 [00:01<00:00,  9.76cell/s]
