In [1]:
import os
import sys
from typing import Dict, Optional, Tuple

from src.utils import default_data_path, config_to_env, check_config
from src.task import LaMPTask
from src.models import (
    feed_prompt_to_lm,
    feed_prompts_to_lm,
    OpenAIModel,
    task_1_parse_response,
    DistilBERTModel,
    BERTSERINIModel,
    MiniLM,
)

In [2]:
# NOTE(review): presumably copies OPENAI_API_KEY from the project config into
# the process environment so the OpenAI-backed model can read it — confirm
# against src.utils.config_to_env.
config_to_env("OPENAI_API_KEY")

In [3]:
task_header = "LaMP_1"
store_folder = os.path.join("src", "data", task_header)


def _feed_task_1_prompt(model, log_file_name, id, prompt, api_key):
    """Send one LaMP_1 prompt to ``model`` through ``feed_prompt_to_lm``.

    The log file lives under ``store_folder`` (looked up at call time) and the
    response is handed to ``task_1_parse_response`` as the callback.

    Args:
        model: Model instance forwarded as ``model=``.
        log_file_name: File name joined onto ``store_folder`` for ``log_path=``.
        id: Identifier forwarded unchanged as ``id=``.
        prompt: Prompt forwarded unchanged as ``prompt=``.
        api_key: API key forwarded unchanged as ``api_key=``.

    Returns:
        Whatever ``feed_prompt_to_lm`` returns.
    """
    return feed_prompt_to_lm(
        model=model,
        id=id,
        prompt=prompt,
        api_key=api_key,
        log_path=os.path.join(store_folder, log_file_name),
        callback=task_1_parse_response,
    )


# Subscribers are plain named functions rather than lambdas bound to names, so
# they carry a real __name__ in tracebacks and can hold a docstring. Each call
# builds a fresh model instance, exactly as the lambda versions did.
def OpenAI_Subscriber(id, prompt, api_key):
    """Feed one prompt to a new ``OpenAIModel``; logs to ``OpenAI.txt``."""
    return _feed_task_1_prompt(OpenAIModel(), "OpenAI.txt", id, prompt, api_key)


def DistilBERT_Subscriber(id, prompt, api_key):
    """Feed one prompt to a new ``DistilBERTModel``; logs to ``DistilBERTModel.txt``."""
    return _feed_task_1_prompt(
        DistilBERTModel(task_name="LaMP_1"), "DistilBERTModel.txt", id, prompt, api_key
    )


def BERTSERINI_Subscriber(id, prompt, api_key):
    """Feed one prompt to a new ``BERTSERINIModel``; logs to ``BERTSERINI.txt``."""
    return _feed_task_1_prompt(
        BERTSERINIModel(task_name="LaMP_1"), "BERTSERINI.txt", id, prompt, api_key
    )


def MiniLM_Subscriber(id, prompt, api_key):
    """Feed one prompt to a new ``MiniLM`` model; logs to ``MiniLM.txt``."""
    return _feed_task_1_prompt(
        MiniLM(task_name="LaMP_1"), "MiniLM.txt", id, prompt, api_key
    )

In [4]:
def task_1_with_keywords_query(with_keyword_params, debug: bool = False):
    """Run the LaMP_1 query stage with keyword extraction for each configuration.

    For every ``(text_rank_top_k_keywords, bm25_top_k)`` pair a ``LaMPTask`` is
    created with ``keyword_extraction=True``, its prompts are loaded from the
    per-configuration prompt file, and ``subscribe`` is called with
    ``skip_eval=True``.

    Args:
        with_keyword_params: Iterable of ``(text_rank_top_k_keywords,
            bm25_top_k)`` pairs; each pair selects the prompt/output/prediction
            file names via the ``120_<keywords>_<bm25>`` suffix.
        debug: When True, use one worker and pass ``api_keys=None`` (no key
            lookup happens). Otherwise use eight workers and eight keys.

    Returns:
        Dict mapping ``(text_rank_top_k_keywords, bm25_top_k)`` to the
        ``LaMPTask`` built for that configuration. Previously the dict was
        built but never returned, so callers always received ``None``.
    """
    task_header = "LaMP_1"
    store_dir = os.path.join("src", "data", task_header)
    os.makedirs(store_dir, exist_ok=True)
    dataset_question_path = os.path.join("src", "data", "LaMP_1_train_questions.json")
    entry_per_category = 120

    if debug:
        worker_count = 1
        api_keys = None
    else:
        worker_count = 8
        # KEY_1..KEY_4 followed by KEY_1..KEY_4 again — presumably one key per
        # worker slot; confirm against LaMPTask.subscribe.
        # NOTE(review): these are Hugging Face keys although only the OpenAI
        # subscriber is enabled below — confirm the OpenAI path ignores api_key.
        api_keys = [
            check_config(f"HUGGING_FACE_KEY_{slot % 4 + 1}")
            for slot in range(worker_count)
        ]

    ready_model = dict()

    for text_rank_top_k_keywords, bm25_top_k in with_keyword_params:
        file_ending = f"{entry_per_category}_{text_rank_top_k_keywords}_{bm25_top_k}"

        question_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_prompts_questions_with_keyword_{file_ending}.json",
        )
        output_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_outputs_selected_with_keyword_{file_ending}.json",
        )

        # Enabled models. To run another one, add e.g.
        # "BERTSERINI": BERTSERINI_Subscriber here; its prediction path is
        # derived from the key below.
        subscribers = {
            "OpenAI": OpenAI_Subscriber,
        }
        preds_save_path = {
            model_name: os.path.join(
                store_dir,
                f"{task_header}_train_preds_{model_name}_with_keyword_{file_ending}.json",
            )
            for model_name in subscribers
        }

        curr_task = LaMPTask(
            task_question_file=dataset_question_path,
            task_output_file=output_store_path,
            subscribers=subscribers,
            worker_count=worker_count,
            prompt_save_path=question_store_path,
            preds_save_path=preds_save_path,
            keyword_extraction=True,
        )

        ready_model[(text_rank_top_k_keywords, bm25_top_k)] = curr_task
        curr_task.load_prompts(question_store_path)
        curr_task.subscribe(
            skip_eval=True,
            api_keys=api_keys,
        )

    return ready_model


def task_1_with_keywords_eval(
    with_keyword_params,
    ready_models: Optional[Dict[Tuple[int, int], LaMPTask]] = None,
):
    """Evaluate LaMP_1 keyword-extraction runs and collect their scores.

    Two modes:

    * ``ready_models`` given: call ``evaluate()`` on each supplied task (no
      explicit prediction paths) and collect its ``score``.
      ``with_keyword_params`` is not consulted in this mode.
    * ``ready_models`` omitted: build a fresh ``LaMPTask`` for every
      ``(text_rank_top_k_keywords, bm25_top_k)`` pair, evaluate it against the
      per-configuration prediction file, print the score and collect it.

    Args:
        with_keyword_params: Iterable of ``(text_rank_top_k_keywords,
            bm25_top_k)`` pairs used to locate the output/prediction files.
        ready_models: Optional mapping from such pairs to already constructed
            tasks.

    Returns:
        Dict mapping ``"<text_rank_top_k_keywords> <bm25_top_k>"`` to the
        task's ``score``. Previously the ``ready_models`` mode returned
        ``None`` and dropped the scores.
    """
    evaluated_result = dict()

    if ready_models is not None:
        for (text_rank_top_k_keywords, bm25_top_k), curr_task in ready_models.items():
            curr_task.evaluate()
            # Same key format as the fresh-task mode below, so callers can
            # treat both results alike.
            evaluated_result[f"{text_rank_top_k_keywords} {bm25_top_k}"] = curr_task.score
        return evaluated_result

    task_header = "LaMP_1"
    store_dir = os.path.join("src", "data", task_header)
    dataset_question_path = os.path.join("src", "data", "LaMP_1_train_questions.json")
    entry_per_category = 120

    for text_rank_top_k_keywords, bm25_top_k in with_keyword_params:
        file_ending = f"{entry_per_category}_{text_rank_top_k_keywords}_{bm25_top_k}"
        output_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_outputs_selected_with_keyword_{file_ending}.json",
        )

        # Models to score. To score another one, add e.g.
        # "BERTSERINI": BERTSERINI_Subscriber here; its prediction path is
        # derived from the key below.
        subscribers = {
            "OpenAI": OpenAI_Subscriber,
        }
        preds_save_name = {
            model_name: os.path.join(
                store_dir,
                f"{task_header}_train_preds_{model_name}_with_keyword_{file_ending}.json",
            )
            for model_name in subscribers
        }

        curr_task = LaMPTask(
            task_question_file=dataset_question_path,
            task_output_file=output_store_path,
            subscribers=subscribers,
        )
        curr_task.evaluate(preds_save_name=preds_save_name)
        print(curr_task.score)
        evaluated_result[f"{text_rank_top_k_keywords} {bm25_top_k}"] = curr_task.score
    return evaluated_result

In [5]:
# Run switches for the keyword-extraction experiments.
# debug=True makes the query stage use a single worker and no API keys.
debug = False
# Each entry is a (text_rank_top_k_keywords, bm25_top_k) pair.
with_keyword_params = [
    (5, 5),
    (10, 10),
]

In [6]:
# Narrow the sweep to a single configuration for this run; swap in [(5, 5)] to
# run the smaller setting instead.
with_keyword_params = [(10, 10)]
# Set debug = True for a single-worker run without API keys.

In [7]:
# Query stage: for each selected configuration, load its prompts and run the
# subscribed model(s) with evaluation skipped.
ready_models = task_1_with_keywords_query(with_keyword_params, debug=debug)

Output()

In [8]:
# Evaluation stage: ready_models is not passed, so a fresh LaMPTask is built
# per configuration and evaluated against the per-configuration prediction
# file paths.
evaluate_results = task_1_with_keywords_eval(with_keyword_params)

{'OpenAI': {'accuracy': 0.5063291139240507, 'f1': 0.5218125960061444}}


In [21]:
def task_1_without_keywords_query(without_keyword_params, debug: bool = False):
    """Run the LaMP_1 query stage without keyword extraction for each setting.

    For every ``bm25_top_k`` value a ``LaMPTask`` is created with
    ``keyword_extraction=False``, its prompts are loaded from the per-setting
    prompt file, and ``subscribe`` is called with ``skip_eval=True``.

    Args:
        without_keyword_params: Iterable of ``bm25_top_k`` values; each one
            selects the prompt/output/prediction file names via the
            ``120_<bm25>`` suffix.
        debug: When True, use one worker and pass ``api_keys=None`` (no key
            lookup happens). Otherwise use eight workers and eight keys.

    Returns:
        Dict mapping ``bm25_top_k`` to the ``LaMPTask`` built for it.
        Previously the dict was built but never returned, so callers always
        received ``None``.
    """
    task_header = "LaMP_1"
    store_dir = os.path.join("src", "data", task_header)
    os.makedirs(store_dir, exist_ok=True)
    dataset_question_path = os.path.join("src", "data", "LaMP_1_train_questions.json")
    entry_per_category = 120

    if debug:
        worker_count = 1
        api_keys = None
    else:
        worker_count = 8
        # KEY_1..KEY_4 followed by KEY_1..KEY_4 again — presumably one key per
        # worker slot; confirm against LaMPTask.subscribe.
        api_keys = [
            check_config(f"HUGGING_FACE_KEY_{slot % 4 + 1}")
            for slot in range(worker_count)
        ]

    ready_model = dict()

    for bm25_top_k in without_keyword_params:
        file_ending = f"{entry_per_category}_{bm25_top_k}"

        question_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_prompts_questions_without_keyword_{file_ending}.json",
        )
        output_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_outputs_selected_without_keyword_{file_ending}.json",
        )

        # Enabled models. To run another one, add e.g.
        # "OpenAI": OpenAI_Subscriber here; its prediction path is derived
        # from the key below.
        subscribers = {
            "BERTSERINI": BERTSERINI_Subscriber,
        }
        preds_save_path = {
            model_name: os.path.join(
                store_dir,
                f"{task_header}_train_preds_{model_name}_without_keyword_{file_ending}.json",
            )
            for model_name in subscribers
        }

        curr_task = LaMPTask(
            task_question_file=dataset_question_path,
            task_output_file=output_store_path,
            subscribers=subscribers,
            worker_count=worker_count,
            prompt_save_path=question_store_path,
            preds_save_path=preds_save_path,
            keyword_extraction=False,
        )

        ready_model[bm25_top_k] = curr_task
        curr_task.load_prompts(question_store_path)
        curr_task.subscribe(
            skip_eval=True,
            api_keys=api_keys,
        )

    return ready_model


def task_1_without_keywords_eval(
    without_keyword_params,
    ready_models: Optional[Dict[int, LaMPTask]] = None,
):
    """Evaluate LaMP_1 runs without keyword extraction and collect their scores.

    Two modes:

    * ``ready_models`` given: call ``evaluate()`` on each supplied task (no
      explicit prediction paths) and collect its ``score``.
      ``without_keyword_params`` is not consulted in this mode.
    * ``ready_models`` omitted: build a fresh ``LaMPTask`` for every
      ``bm25_top_k`` value, evaluate it against the per-setting prediction
      file, print the score and collect it.

    Args:
        without_keyword_params: Iterable of ``bm25_top_k`` values used to
            locate the output/prediction files.
        ready_models: Optional mapping from ``bm25_top_k`` to already
            constructed tasks (keys are single values, not pairs).

    Returns:
        Dict mapping ``"<bm25_top_k>"`` to the task's ``score``. Previously
        the ``ready_models`` mode returned ``None`` and dropped the scores.
    """
    evaluated_result = dict()

    if ready_models is not None:
        for bm25_top_k, curr_task in ready_models.items():
            curr_task.evaluate()
            # Same key format as the fresh-task mode below, so callers can
            # treat both results alike.
            evaluated_result[f"{bm25_top_k}"] = curr_task.score
        return evaluated_result

    task_header = "LaMP_1"
    store_dir = os.path.join("src", "data", task_header)
    dataset_question_path = os.path.join("src", "data", "LaMP_1_train_questions.json")
    entry_per_category = 120

    for bm25_top_k in without_keyword_params:
        file_ending = f"{entry_per_category}_{bm25_top_k}"
        output_store_path = os.path.join(
            store_dir,
            f"{task_header}_train_outputs_selected_without_keyword_{file_ending}.json",
        )

        # Models to score. To score another one, add e.g.
        # "OpenAI": OpenAI_Subscriber here; its prediction path is derived
        # from the key below.
        subscribers = {
            "BERTSERINI": BERTSERINI_Subscriber,
        }
        preds_save_name = {
            model_name: os.path.join(
                store_dir,
                f"{task_header}_train_preds_{model_name}_without_keyword_{file_ending}.json",
            )
            for model_name in subscribers
        }

        curr_task = LaMPTask(
            task_question_file=dataset_question_path,
            task_output_file=output_store_path,
            subscribers=subscribers,
        )
        curr_task.evaluate(preds_save_name=preds_save_name)
        print(curr_task.score)
        evaluated_result[f"{bm25_top_k}"] = curr_task.score
    return evaluated_result

In [28]:
# Run switches for the experiments without keyword extraction.
# debug=True makes the query stage use a single worker and no API keys.
debug = False
# BM25 top-k values to run. The earlier `[2, 4]` assignment was overwritten on
# the very next line and never took effect; use [2, 4] here to sweep both.
without_keyword_params = [4]

In [30]:
# Query stage without keyword extraction: for each selected bm25_top_k, load
# its prompts and run the subscribed model(s) with evaluation skipped.
ready_models_no_keywords = task_1_without_keywords_query(
    without_keyword_params, debug=debug
)

Output()

In [31]:
# Evaluation stage: ready_models is not passed, so a fresh LaMPTask is built
# per bm25_top_k and evaluated against the per-setting prediction file paths.
evaluate_results_no_keywords = task_1_without_keywords_eval(without_keyword_params)

{'BERTSERINI': {'accuracy': 0.3191489361702128, 'f1': 0.2987044882421563}}
