In [1]:
import sys
import os
from typing import Dict, List, Optional, Tuple
from src.utils import default_data_path, config_to_env, check_config
from src.task import LaMPTask
from src.models import (
    feed_prompt_to_lm,
    feed_prompts_to_lm,
    OpenAIModel,
    task_2_parse_response,
    DistilBERTModel,
    BERTSERINIModel,
    MiniLM,
)

In [2]:
# Make the OpenAI credential available before any subscriber is invoked.
# NOTE(review): presumably copies the "OPENAI_API_KEY" entry of the project
# config into the process environment -- confirm in src.utils.config_to_env.
config_to_env("OPENAI_API_KEY")

In [3]:
task_header = "LaMP_2"
store_folder = os.path.join("src", "data", task_header)


def _make_subscriber(build_model, log_file_name):
    """Create a subscriber callable bound to one model type and one log file.

    ``build_model`` is a zero-argument callable; it is invoked on every
    subscriber call, so each prompt is fed to a freshly constructed model
    instance. The log path is resolved against ``store_folder`` at call time.
    """

    def _subscriber(id, prompt, api_key):
        return feed_prompt_to_lm(
            model=build_model(),
            id=id,
            prompt=prompt,
            api_key=api_key,
            log_path=os.path.join(store_folder, log_file_name),
            callback=task_2_parse_response,
        )

    return _subscriber


# The model classes are wrapped in lambdas so they are only looked up when a
# subscriber is actually called.
OpenAI_Subscriber = _make_subscriber(
    lambda: OpenAIModel(),
    "OpenAI.txt",
)
DistilBERT_Subscriber = _make_subscriber(
    lambda: DistilBERTModel(task_name="LaMP_2"),
    "DistilBERTModel.txt",
)
BERTSERINI_Subscriber = _make_subscriber(
    lambda: BERTSERINIModel(task_name="LaMP_2"),
    "BERTSERINI.txt",
)
MiniLM_Subscriber = _make_subscriber(
    lambda: MiniLM(task_name="LaMP_2"),
    "MiniLM.txt",
)

In [4]:
# Directory that holds the prompt, output and prediction files of this task.
store_dir = os.path.join("src", "data", task_header)
os.makedirs(store_dir, exist_ok=True)

# Registry of every available subscriber, keyed by model name.
subscribers_full = dict(
    BERTSERINI=BERTSERINI_Subscriber,
    DistilBERT=DistilBERT_Subscriber,
    MiniLM=MiniLM_Subscriber,
    OpenAI=OpenAI_Subscriber,
)

# Per-model prediction path templates for the keyword-based prompts. The
# literal "{file_ending}" placeholder is filled in later with str.format().
preds_save_path_full = {
    model_name: os.path.join(
        store_dir,
        f"{task_header}_train_preds_{model_name}_with_keyword_{{file_ending}}.json",
    )
    for model_name in subscribers_full
}

task_header = "LaMP_2"
entry_per_category = 16
dataset_question_path = os.path.join("src", "data", "LaMP_2_train_questions.json")

In [5]:
def task_2_with_keywords_query(
    with_keyword_params: List[Tuple[int, int]],
    subscriber_namelist: List[str],
    debug: bool = False,
) -> Dict[Tuple[int, int], "LaMPTask"]:
    """Run the selected subscribers on the keyword-based LaMP_2 prompts.

    For every ``(text_rank_top_k_keywords, category_top_k_keywords)`` pair a
    ``LaMPTask`` is built with ``keyword_extraction=True``, its prompts are
    loaded from the matching ``..._train_prompts_questions_with_keyword_...``
    file, and ``subscribe(skip_eval=True, api_keys=...)`` is called on it.

    File names are derived from the module-level ``task_header``,
    ``store_dir``, ``entry_per_category`` and ``preds_save_path_full``. These
    globals are read at call time, so the value ``preds_save_path_full`` holds
    when this function is called decides where predictions are stored.

    Args:
        with_keyword_params: pairs of
            ``(text_rank_top_k_keywords, category_top_k_keywords)``.
        subscriber_namelist: keys of ``subscribers_full`` to run.
        debug: when true, a single worker is used and no API keys are passed
            (``api_keys=None``); otherwise eight workers and eight keys.

    Returns:
        Mapping from each parameter pair to the ``LaMPTask`` created for it.
        (The mapping used to be built but never returned, so callers always
        received ``None``.)

    Raises:
        KeyError: if a name in ``subscriber_namelist`` is missing from
            ``subscribers_full`` or ``preds_save_path_full``.
    """
    worker_count = 1 if debug else 8
    if debug:
        api_keys = None
    else:
        # Eight entries that cycle through HUGGING_FACE_KEY_1..4 twice, in the
        # same order as the original hand-written list.
        api_keys = [
            check_config(f"HUGGING_FACE_KEY_{slot % 4 + 1}")
            for slot in range(worker_count)
        ]

    ready_model = dict()

    for text_rank_top_k_keywords, category_top_k_keywords in with_keyword_params:
        file_ending = (
            f"{entry_per_category}_{text_rank_top_k_keywords}_{category_top_k_keywords}"
        )

        question_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_prompts_questions_with_keyword_{file_ending}.json",
        )
        output_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_outputs_selected_with_keyword_{file_ending}.json",
        )

        # Restrict the registry and the path templates to the requested models.
        subscribers = dict()
        preds_save_path = dict()
        for subscriber_name in subscriber_namelist:
            subscribers[subscriber_name] = subscribers_full[subscriber_name]
            preds_save_path[subscriber_name] = preds_save_path_full[
                subscriber_name
            ].format(file_ending=file_ending)

        curr_task = LaMPTask(
            task_question_file=dataset_question_path,
            task_output_file=output_store_path,
            subscribers=subscribers,
            worker_count=worker_count,
            prompt_save_path=question_store_path,
            preds_save_path=preds_save_path,
            keyword_extraction=True,
        )

        # Register the task before running it, as in the original ordering.
        ready_model[(text_rank_top_k_keywords, category_top_k_keywords)] = curr_task
        curr_task.load_prompts(question_store_path)
        curr_task.subscribe(
            skip_eval=True,
            api_keys=api_keys,
        )

    return ready_model


def task_2_with_keywords_eval(
    with_keyword_params,
    subscriber_namelist: List[str],
    ready_models: Optional[Dict[Tuple[int, int], "LaMPTask"]] = None,
):
    """Evaluate keyword-based LaMP_2 predictions and collect the scores.

    Two modes:

    * ``ready_models`` given: ``evaluate()`` is called on every supplied task
      and ``with_keyword_params`` / ``subscriber_namelist`` are ignored.
    * ``ready_models`` omitted: for every parameter pair a new ``LaMPTask`` is
      built from the stored output file and evaluated against the prediction
      files named by the module-level ``preds_save_path_full`` templates; each
      score is also printed.

    Args:
        with_keyword_params: pairs of
            ``(text_rank_top_k_keywords, category_top_k_keywords)``.
        subscriber_namelist: keys of ``subscribers_full`` to evaluate.
        ready_models: optional mapping from parameter pair to an already
            constructed task.

    Returns:
        Dict keyed by ``"<text_rank_top_k_keywords> <category_top_k_keywords>"``
        holding each task's ``score``. The ``ready_models`` mode used to return
        ``None``; it now returns the same structure (``None`` is stored for a
        task that exposes no ``score`` attribute after ``evaluate()``).

    Raises:
        KeyError: if a name in ``subscriber_namelist`` is missing from
            ``subscribers_full`` or ``preds_save_path_full``.
    """
    evaluated_result = dict()

    if ready_models is not None:
        for (
            text_rank_top_k_keywords,
            category_top_k_keywords,
        ), curr_task in ready_models.items():
            curr_task.evaluate()
            # NOTE(review): assumes evaluate() without arguments populates
            # `score`, as it does with preds_save_name below -- getattr keeps
            # this branch from raising if it does not.
            evaluated_result[
                f"{text_rank_top_k_keywords} {category_top_k_keywords}"
            ] = getattr(curr_task, "score", None)
        return evaluated_result

    store_dir = os.path.join("src", "data", task_header)

    for text_rank_top_k_keywords, category_top_k_keywords in with_keyword_params:
        subscribers = dict()
        preds_save_path = dict()
        file_ending = (
            f"{entry_per_category}_{text_rank_top_k_keywords}_{category_top_k_keywords}"
        )

        for subscriber_name in subscriber_namelist:
            subscribers[subscriber_name] = subscribers_full[subscriber_name]
            preds_save_path[subscriber_name] = preds_save_path_full[
                subscriber_name
            ].format(file_ending=file_ending)

        output_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_outputs_selected_with_keyword_{file_ending}.json",
        )
        curr_task = LaMPTask(
            task_question_file=dataset_question_path,
            task_output_file=output_store_path,
            subscribers=subscribers,
        )
        curr_task.evaluate(preds_save_name=preds_save_path)
        print(curr_task.score)
        evaluated_result[
            f"{text_rank_top_k_keywords} {category_top_k_keywords}"
        ] = curr_task.score
    return evaluated_result

In [6]:
# Full sweep of keyword settings; each entry is a
# (text_rank_top_k_keywords, category_top_k_keywords) pair.
with_keyword_params = [
    (5, 15),
    (10, 30),
]

# debug=False makes the query functions use 8 workers and the configured
# Hugging Face keys; debug=True switches to 1 worker and no keys.
debug = False

In [7]:
# Selection for the current run. This overrides the full sweep defined
# earlier: only one keyword setting is processed (the other available one
# is (10, 30)).
with_keyword_params = [(5, 15)]

# Models to query/evaluate. Valid names are the keys of `subscribers_full`:
# "BERTSERINI", "DistilBERT", "MiniLM", "OpenAI".
subscribers = ["BERTSERINI"]

# For a single-worker run without API keys, set `debug = True` here.

In [9]:
# Run the keyword-based query for the keyword settings and subscribers
# selected above; whatever the function returns is bound to `ready_models`.
ready_models = task_2_with_keywords_query(with_keyword_params, subscribers, debug=debug)

Output()

In [13]:
# Score the stored predictions. `ready_models` is not passed, so the tasks are
# rebuilt from the saved output/prediction files; the result is a dict keyed
# by "<text_rank_top_k_keywords> <category_top_k_keywords>".
evaluate_results = task_2_with_keywords_eval(with_keyword_params, subscribers)

{'MiniLM': {'accuracy': 0.25862068965517243, 'f1': 0.23614605134499464}}


In [14]:
def task_2_without_keywords_query(
    without_keyword_params: List[int],
    subscriber_namelist: List[str],
    debug: bool = False,
) -> Dict[int, "LaMPTask"]:
    """Run the selected subscribers on the LaMP_2 prompts built without keywords.

    For every ``bm25_top_k`` value a ``LaMPTask`` is built with
    ``keyword_extraction=False``, its prompts are loaded from the matching
    ``..._train_prompts_questions_without_keyword_...`` file, and
    ``subscribe(skip_eval=True, api_keys=...)`` is called on it.

    File names are derived from the module-level ``task_header``,
    ``store_dir``, ``entry_per_category`` and ``preds_save_path_full``.
    ``preds_save_path_full`` is read at call time: the notebook first binds it
    to ``with_keyword`` templates and rebinds it to ``without_keyword``
    templates in a later cell. That later cell must have been run before this
    function is called, otherwise predictions are stored under
    ``with_keyword`` file names.

    Args:
        without_keyword_params: ``bm25_top_k`` values to process.
        subscriber_namelist: keys of ``subscribers_full`` to run.
        debug: when true, a single worker is used and no API keys are passed
            (``api_keys=None``); otherwise eight workers and eight keys.

    Returns:
        Mapping from each ``bm25_top_k`` to the ``LaMPTask`` created for it.
        (The mapping used to be built but never returned, so callers always
        received ``None``.)

    Raises:
        KeyError: if a name in ``subscriber_namelist`` is missing from
            ``subscribers_full`` or ``preds_save_path_full``.
    """
    worker_count = 1 if debug else 8
    if debug:
        api_keys = None
    else:
        # Eight entries that cycle through HUGGING_FACE_KEY_1..4 twice, in the
        # same order as the original hand-written list.
        api_keys = [
            check_config(f"HUGGING_FACE_KEY_{slot % 4 + 1}")
            for slot in range(worker_count)
        ]

    ready_model = dict()

    for bm25_top_k in without_keyword_params:
        file_ending = f"{entry_per_category}_{bm25_top_k}"

        question_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_prompts_questions_without_keyword_{file_ending}.json",
        )
        output_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_outputs_selected_without_keyword_{file_ending}.json",
        )

        # Restrict the registry and the path templates to the requested models.
        subscribers = dict()
        preds_save_path = dict()
        for subscriber_name in subscriber_namelist:
            subscribers[subscriber_name] = subscribers_full[subscriber_name]
            preds_save_path[subscriber_name] = preds_save_path_full[
                subscriber_name
            ].format(file_ending=file_ending)

        curr_task = LaMPTask(
            task_question_file=dataset_question_path,
            task_output_file=output_store_path,
            subscribers=subscribers,
            worker_count=worker_count,
            prompt_save_path=question_store_path,
            preds_save_path=preds_save_path,
            keyword_extraction=False,
        )

        # Register the task before running it, as in the original ordering.
        ready_model[bm25_top_k] = curr_task
        curr_task.load_prompts(question_store_path)
        curr_task.subscribe(
            skip_eval=True,
            api_keys=api_keys,
        )

    return ready_model


def task_2_without_keywords_eval(
    without_keyword_params,
    subscriber_namelist: List[str],
    ready_models: Optional[Dict[int, "LaMPTask"]] = None,
):
    """Evaluate LaMP_2 predictions made without keywords and collect the scores.

    Two modes:

    * ``ready_models`` given: ``evaluate()`` is called on every supplied task
      and ``without_keyword_params`` / ``subscriber_namelist`` are ignored.
    * ``ready_models`` omitted: for every ``bm25_top_k`` a new ``LaMPTask`` is
      built from the stored output file and evaluated against the prediction
      files named by the module-level ``preds_save_path_full`` templates (read
      at call time); each score is also printed.

    Args:
        without_keyword_params: ``bm25_top_k`` values to evaluate.
        subscriber_namelist: keys of ``subscribers_full`` to evaluate.
        ready_models: optional mapping from ``bm25_top_k`` to an already
            constructed task. (The annotation previously claimed
            ``Tuple[int, int]`` keys.)

    Returns:
        Dict keyed by ``str(bm25_top_k)`` holding each task's ``score``. The
        ``ready_models`` mode used to return ``None``; it now returns the same
        structure (``None`` is stored for a task that exposes no ``score``
        attribute after ``evaluate()``).

    Raises:
        KeyError: if a name in ``subscriber_namelist`` is missing from
            ``subscribers_full`` or ``preds_save_path_full``.
    """
    evaluated_result = dict()

    if ready_models is not None:
        for bm25_top_k, curr_task in ready_models.items():
            curr_task.evaluate()
            # NOTE(review): assumes evaluate() without arguments populates
            # `score`, as it does with preds_save_name below -- getattr keeps
            # this branch from raising if it does not.
            evaluated_result[f"{bm25_top_k}"] = getattr(curr_task, "score", None)
        return evaluated_result

    for bm25_top_k in without_keyword_params:
        subscribers = dict()
        preds_save_path = dict()
        file_ending = f"{entry_per_category}_{bm25_top_k}"
        output_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_outputs_selected_without_keyword_{file_ending}.json",
        )

        for subscriber_name in subscriber_namelist:
            subscribers[subscriber_name] = subscribers_full[subscriber_name]
            preds_save_path[subscriber_name] = preds_save_path_full[
                subscriber_name
            ].format(file_ending=file_ending)

        curr_task = LaMPTask(
            task_question_file=dataset_question_path,
            task_output_file=output_store_path,
            subscribers=subscribers,
        )
        curr_task.evaluate(preds_save_name=preds_save_path)
        print(curr_task.score)
        evaluated_result[f"{bm25_top_k}"] = curr_task.score
    return evaluated_result

In [None]:
# Rebind the prediction path templates to the "without_keyword" file names.
# The query/eval functions read `preds_save_path_full` at call time, so this
# cell has to run before the without-keyword functions are called (and the
# earlier "with_keyword" definition has to be re-run before going back).
preds_save_path_full = {
    model_name: os.path.join(
        store_dir,
        f"{task_header}_train_preds_{model_name}_without_keyword_{{file_ending}}.json",
    )
    for model_name in ("BERTSERINI", "DistilBERT", "MiniLM", "OpenAI")
}

In [15]:
# Selection for the without-keyword run.
# debug=False -> 8 workers with the configured Hugging Face keys.
debug = False

# bm25_top_k values to process (use [4] to run only the larger setting).
without_keyword_params = [2, 4]

# Models to query/evaluate. Valid names are the keys of `subscribers_full`:
# "BERTSERINI", "DistilBERT", "MiniLM", "OpenAI".
subscribers = ["MiniLM"]

In [18]:
# Run the without-keyword query for the bm25_top_k values and subscribers
# selected above; whatever the function returns is bound to
# `ready_models_no_keywords`.
ready_models_no_keywords = task_2_without_keywords_query(
    without_keyword_params, subscribers, debug=debug
)

Output()

Output()

In [19]:
# Score the stored predictions. `ready_models` is not passed, so the tasks are
# rebuilt from the saved output/prediction files; the result is a dict keyed
# by str(bm25_top_k), and each score is printed as it is computed.
evaluate_results_no_keywords = task_2_without_keywords_eval(
    without_keyword_params, subscribers
)

{'MiniLM': {'accuracy': 0.2511013215859031, 'f1': 0.23674196174196174}}
{'MiniLM': {'accuracy': 0.3276595744680851, 'f1': 0.29632111746810086}}
