# Import

In [35]:
from dotenv import load_dotenv
import os
import mlflow
from typing import List, Dict, Optional

from collections import defaultdict
from dataclasses import dataclass

os.environ["no_proxy"] = "*"

mlflow.set_tracking_uri("http://198.215.61.34:8153/")
mlflow.set_experiment("s440708_Shiqiu")
load_dotenv()

AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

print("Loaded API key:", AZURE_OPENAI_KEY[:5] + "..." if AZURE_OPENAI_KEY else "MISSING")

Loaded API key: FJ5Gr...


# Config

In [None]:
data_config = {
    "dataset_path": "/work/bioinformatics/s440708/MODULE_3_MATERIALS/data/genehop.json",  
    "output_path": "/work/bioinformatics/s440708/MODULE_3_MATERIALS/outputs/",
    "dataset_name": "genehop"
}

model_config = {           
    #"model_name": AZURE_OPENAI_DEPLOYMENT_NAME,     
    "model_name": "qwen3:4b",   
    'frequency_penalty': 0.0,
    'presence_penalty': 0.0,
    "temperature": 1.,
    "max_tokens": 800,
    "top_p": 1.0,
    
}


# Load data

In [3]:

import json

def load_geneturing(path: str) -> List[Dict[str, str]]:
    with open(path, "r") as f:
        raw_data = json.load(f)

    flat_data = []
    counter = 0
    for task_name, qa_pairs in raw_data.items():
        for question, answer in qa_pairs.items():
            flat_data.append({
                "task": task_name,
                'id': counter,
                "question": question,
                "answer": answer
            })
            counter += 1
    

    return flat_data


# config = Config()
dataset = load_geneturing(data_config['dataset_path'])
print(f"Loaded {len(dataset)} examples from {data_config['dataset_path']}")
TASKS = set() 

Loaded 150 examples from /work/bioinformatics/s440708/MODULE_3_MATERIALS/data/genehop.json


In [4]:
# 3.3 Create the pandas dataframe from the collection of rows

import pandas as pd

def build_dataframe(flat_data: List[Dict[str, str]]) -> pd.DataFrame:

    df = pd.DataFrame(flat_data)
    
    df = df[["id", "task", "question", "answer"]]  
    df.set_index("id", inplace=True)
    return df

df = build_dataframe(dataset)
print(f"DataFrame shape: {df.shape}")
display(df.head())

DataFrame shape: (150, 3)


Unnamed: 0_level_0,task,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,sequence gene alias,What are the aliases of the gene that contains...,"[SLC38A6, NAT-1, SNAT6]"
1,sequence gene alias,What are the aliases of the gene that contains...,"[FCGR3A, CD16, FCG3, CD16A, FCGR3, IGFR3, IMD2..."
2,sequence gene alias,What are the aliases of the gene that contains...,"[FNDC11, C20orf195]"
3,sequence gene alias,What are the aliases of the gene that contains...,"[EOLA2, CXorf40B]"
4,sequence gene alias,What are the aliases of the gene that contains...,"[PSMB10, LMP10, MECL1, PRAAS5, beta2i]"


# Create a Structured Output

In [None]:
system_message = (
    "You are a genomics assistant that returns structured answers. "
    "You will be provided with a gene-related question and should return your answer "
    "in structured JSON format following a schema including task, question, answer, and explanation."
)

In [None]:
from pydantic import BaseModel, Field
from typing import Optional

import os
import pprint as pp
from openai import AzureOpenAI

class GeneHopAnswer(BaseModel):
    """
    Structured answer format for gene-level LLM QA.
    """
    task: str = Field(..., description="GeneHop task name, e.g., 'Gene alias'")
    question: str = Field(..., description="The user query")
    answer: str = Field(..., description="Structured, atomic answer returned by the model")
    explanation: Optional[str] = Field(
        None,
        description="Optional reasoning or justification provided by the model"
    )



query = {
    "task": "SNP gene function",
    "question": "What is the function of the gene associated with SNP rs1217074595? Let's decompose the question to sub-questions and solve them step by step.",
}

prompt = f"""Here is a question: "{query['question']}". 
Return the answer in valid JSON format matching this schema: 
{{"task": str, "question": str, "answer": str, "explanation": Optional[str]}}.
The 'task' should be '{query['task']}'.
"""

# few_shot_examples = [
#     {"role": "user", "content": "What are the aliases of the gene that contains this sequnece:"
#      "ACTTCCAACATGGCGGCGGCCGGGGCGGCGGTGGCGCGCAGCCCGGGAATCGGAGCGGGACCTGCGCTGAGAGCCCGGCGCTCGCCCCCGCCGCGGGCCGCACGGCTGCCGCG."
#      "Let's decompose the question to sub-questions and solve them step by step."},
#     {"role": "assistant", "content": "QSOX2, SOXN, QSCN6L1"},
#     {"role": "user", "content": "List chromosome locations of the genes related to Hemolytic anemia due to phosphofructokinase deficiency. Let's decompose the question to sub-questions and solve them step by step."},
#     {"role": "assistant", "content": "21q22.3"},
#     {"role": "user", "content": "What is the function of the gene associated with SNP rs1241371358? Let's decompose the question to sub-questions and solve them step by step."},
#     {"role": "assistant", "content": "Predicted to be active in cytosol."}
# ]

messages = [{"role": "system", "content": system_message},
                {"role": "user", "content": prompt}]



# client = AzureOpenAI(
#     api_key=os.getenv("AZURE_OPENAI_KEY"),
#     api_version="2024-08-01-preview",
#     azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
#     timeout=10,
#     max_retries=3,
# )

# response = client.beta.chat.completions.parse(
#     model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
#     messages=messages,
#     temperature=0,
#     max_tokens=1024,
#     response_format=GeneHopAnswer,
# )

from ollama import chat
response = chat(
  messages=messages,
  model= model_config['model_name'],
  format=GeneHopAnswer.model_json_schema(),
)

genehop_answer = GeneHopAnswer.model_validate_json(response.message.content)
answer = genehop_answer.model_dump()
pp.pprint(answer)

# pp.pprint(response.model_dump())


{'answer': 'The gene associated with SNP rs1217074595 is A2ML1, which is '
           'involved in the regulation of cell adhesion and migration, and is '
           'linked to cancer progression.',
 'explanation': 'rs1217074595 is a single nucleotide polymorphism (SNP) '
                'located in the A2ML1 gene. This gene plays a role in cell '
                'adhesion and migration, and its variation has been associated '
                'with cancer progression. The specific SNP may affect the '
                'function of the protein encoded by A2ML1, potentially '
                'influencing cellular processes related to tumor growth and '
                'metastasis.',
 'question': 'What is the function of the gene associated with SNP '
             'rs1217074595?',
 'task': 'SNP gene function'}


# Create Query Model Function

In [None]:
def query_model(
    system_message: dict,
    user_query: str,
) -> str:
    from ollama import chat
    
    prompt = f"""Here is a question: "{user_query['question']}". 
    Return the answer in valid JSON format matching this schema: 
    {{"task": str, "question": str, "answer": str, "explanation": Optional[str]}}.
    The 'task' should be '{user_query['task']}'.
    """

    messages = [{"role": "system", "content": system_message},
                {"role": "user", "content": prompt}]


    response = chat(
        messages=messages,
        model= model_config['model_name'],
        format=GeneHopAnswer.model_json_schema(),
        )
    
    genehop_answer = GeneHopAnswer.model_validate_json(response.message.content)
    answer = genehop_answer.model_dump()
    return answer

In [None]:
from tqdm import tqdm
from typing import Any, Callable, Dict, List, Optional, Union, cast
import json

def collect_structured_output(
    # client: Any,
    dataset: List[Dict[str, str]],
    system_message: Dict[str, str],
) -> List[Dict[str, str]]:

    answers = []

    with mlflow.start_run(run_name="geneturing-eval"):
        mlflow.log_param("model", model_config["model_name"])
        mlflow.log_param("temperature", model_config["temperature"])
        mlflow.log_param("max_tokens", model_config["max_tokens"])
        mlflow.log_param("top_p", model_config["top_p"])

        for item in tqdm(dataset, desc="Evaluating"):
            item_id = item.get("id")
            task = item.get("task")
            question = item.get("question")
           
            #gold_answer = item.get("answer")
            query = {'task': task, "question": question}
            print(query)
            # try:

            answer = query_model(
                system_message, question, query
            )
            answer['success'] = True
            answer['id'] = item_id
                

            # except Exception:
            #     answer = {}
            #     answer['success'] = False
            #     answer['id'] = item_id
            #     answer['task'] = task
            #     answer['question'] = question
            #     answer['answer'] = ""
            #     answer['explanation'] = ""
        

            answers.append(answer)

        # Calculate and log metrics
        df = pd.DataFrame(answers)
        success_rate = df["success"].mean()
        print(success_rate)
        #valid_scores_df = df[df["score"].notna()]
        #overall_score = valid_scores_df["score"].mean()

        mlflow.log_metric("success_rate", success_rate)
        #mlflow.log_metric("overall_score", overall_score)

        # for task_name, task_df in df.groupby("task"):
        #     mlflow.log_metric(
        #         f"score_{task_name.replace(' ', '_')}", task_df["score"].mean()
        #     )

        # Save to json and log as artifact
        result_json_path = os.path.join(data_config['output_path'], f"{data_config['dataset_name']}_results.csv")
        with open(result_json_path, 'w') as f:
            json.dump(answers, f, indent=2)
      
        # df.to_csv(result_csv_path, index=False)
        mlflow.log_artifact(result_json_path)

    return answers

In [None]:
results = collect_structured_output(dataset, system_message[0])

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GTAGATGGAACTGGTAGTCAGCTGGAGAGCAGCATGGAGGCGTCCTGGGGGAGCTTCAACGCTGAGCGGGGCTGGTATGTCTCTGTCCAGCAGCCTGAAGAAGCGGAGGCCGA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   1%|          | 1/150 [00:34<1:25:02, 34.24s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ATTGTGAGAGTAACCAACGTGGGGTTACGGGGGAGAATCTGGAGAGAAGAGAAGAGGTTAACAACCCTCCCACTTCCTGGCCACCCCCCTCCACCTTTTCTGGTAAGGAGCCC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   1%|▏         | 2/150 [01:01<1:13:38, 29.85s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GGAAGAGGCCCCAGCACTGACCTCCGTGGGGGTGGAGATGAGGAGGATGGAAAGGGTGTCTTCCTCCAGCATCTTCCTGAAGGTGAAGGAGGGGCACCTTGGGGTGTCTAAGA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   2%|▏         | 3/150 [01:31<1:13:44, 30.10s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GCACACGTACGTTCCTCATGAAAGGGACGACGGGAGCTGCATGAAAGCCGAAGTTATGGACCGCTAGCATCTGTCACTGGCCACCGGTTTCCGGGAGTAAGCGGCAGCTACCT. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   3%|▎         | 4/150 [01:58<1:10:30, 28.98s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGACGTGAAGCCTAGCAGAGGACTTTTTAGCTGCTCACTGGCCCCGCTTGTCTGGCCGACTCATCCGCCCGCGACCCCTAATCCCCTCTGCCTGCCCCAAGATGCTGAAGCCA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   3%|▎         | 5/150 [02:31<1:13:16, 30.32s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ACTTCCAACATGGCGGCGGCCGGGGCGGCGGTGGCGCGCAGCCCGGGAATCGGAGCGGGACCTGCGCTGAGAGCCCGGCGCTCGCCCCCGCCGCGGGCCGCACGGCTGCCGCG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   4%|▍         | 6/150 [03:02<1:13:47, 30.74s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ACTAGCCCCAAGAACAAAAGAGGCACAGGTGGGAACAACTCTCCCAAAACCAGGACTGGGAGCATGGCCAAACTTCATAGTGAGCTTACTTGCCTCTGACACACAAGGCAGCA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   5%|▍         | 7/150 [03:33<1:12:58, 30.62s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GCCGGCGCGCTTGCGCAGTAGCTGAACGCGGGCGTTTCTTTCCTCCCTTTTTTTCGAATTGGTTTTGGGGGTAGATTCGAGTTACAAAATGGCCGCCCGGAGCGTGTTCGGCG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   5%|▌         | 8/150 [04:02<1:11:28, 30.20s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ATGCTTCATTACACCCTTCATGGAAGCTGGCACACTCGCCCCACCCTCGTTTTCCACGGGCCAGTCTTGGAGGAAAAGGCCAGCAGGTAGGTTCCCAGGGCACGGAGATGATG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   6%|▌         | 9/150 [04:35<1:13:03, 31.09s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GCTTACGCCACGGCGTCTGCTGGCGGCCGCGGAGACGCAGAGTCTTGAGCAGCGCGGCAGGTGAGTAGCTGTGCGAATTCGGTTCTCTAGGGAGCTCCTTCTTCGCCTGCTGG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   7%|▋         | 10/150 [05:08<1:13:38, 31.56s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGAAGAGCCAAAACAGGAACCGAGGTGGCAAATCACTGTGCGAGGGCGAGTGGACCTCCCTCTTTGCCTCCTCCCTGTTCCAGGAGCTGGTGCCCTGGGCTCTGCGCTGTTGT. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   7%|▋         | 11/150 [05:39<1:13:07, 31.56s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:CTTCCAGGCTAAGTCCATCTTCCGGCTTGGGCAGACGCTGCCGCGGAATCCTTGACTCTAGTTTTCTGAGTCGGTGAGTGAGTGCAAAGTAGATTCCTCAGGTGGAGGGTGCC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   8%|▊         | 12/150 [06:22<1:20:00, 34.79s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:CCTTTTAGTATCCAATTATCTGAAACTTAAGAAGAGTGTGCACCGCCCAATGGGTGTGTGTATGTGCTGCTTTGAACCTATAGTTGAGATCCAGAGAATTGGGAGTGACATCA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   9%|▊         | 13/150 [06:46<1:12:24, 31.71s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGGCTACCAAAGTCGCTGGTAGCCTGGACCCTGAGTTCACCGATCCTTCCTACTCAGTAATATCTACAGGTAAGTCCAAAGGGAAATTGCTTTGGGAATCTGACAATTTAACT. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:   9%|▉         | 14/150 [07:17<1:11:04, 31.35s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ATCTGCTCTGACTTCCCCAGGACGTGTCTGTGCTCCTGTGTGTGACCAGGGTGAGTGGCAACCTGGGAGCCAGAGGGCACAGGGGAATGGGAAGATTAAGGGAGGCTTCAGAG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  10%|█         | 15/150 [07:43<1:07:07, 29.84s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGAGCCCAGCCCAGCACGGTCTCCGGGGATGTAGCTGGTGGGACAGTGAGCAGAGGGCTGGGCCCTGTGCCTGGGGACTGTGGCCTGGAGGCTGAGCAGGAGCTGAGGAGGGG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  11%|█         | 16/150 [08:08<1:03:07, 28.27s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GCACAAGCCGGCCTTGAAATCAGAGCCTTTCCAGCAACTCCGAGAGCGTGTGCTCGGCGACCGCGGGCTTGGCCAGCGGCGCGCGCTCGGCGCCCCGGCGCCCCCAGCCCCAC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  11%|█▏        | 17/150 [08:34<1:01:14, 27.62s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGTTGGCCGCTCCGGCCTGTGGGGCTACATCCCTGCTGCCGGCCAGGCGCGGCCCGCGGGACCCCGAGTGTAGCGCCATGGCCCGGGAGAGGCCGCCCGGGAGGGGCTGCGGC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  12%|█▏        | 18/150 [09:06<1:03:41, 28.95s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GCATTGTGGGTTCTCCTGGAGCTGTGGAGTTGATCCTGAATGAAAGTGGCGCGCCGCCCCTGACGTTACCCGGATCGGAGAGGTTGGAATTCAGATTACGGCTGCGATTCGGG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  13%|█▎        | 19/150 [09:40<1:06:53, 30.64s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:CTTCCCCAAGCCAACGTCTCCGCCGTCGGCTCCGCGGCGCCGCCATGGCCGACGTGGAAGACGGAGAGGAAACCTGCGCCCTGGCCTCTCACTCCGGGAGCTCAGGCTCCAAG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  13%|█▎        | 20/150 [10:13<1:07:28, 31.15s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AAGACTGCGAGCTCCCCGCACCCCCTCGCACTCCCTCTGGCCGGCCCAGGGCGCCTTCAGCCCAACCTCCCCAGCCCCACGGGCGCCACGGAACCCGCTCGATCTCGCCGCCA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  14%|█▍        | 21/150 [10:42<1:05:32, 30.49s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGCTCCCCGCCGGGCTCGCGCCGCAGAGGCCGGTGAGGCGCCGGCGGCCACGCCGCGGAAGGCGCGGGCCGAGCAGAGCCGGGCGTTGGAGCCCGCGCGCGCATGGAGGCGTT. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  15%|█▍        | 22/150 [11:14<1:06:06, 30.99s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGATCTGCTTCTCCCCGGCTCGGGGCGCCGAGGCGGCGTCCGGGAGGTGTCTTCTGCAAAGGTTGCCTGGCGCTGTCCAACATGGAGGAGGCACCGGCAGCGGGCGTTACCTG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  15%|█▌        | 23/150 [11:41<1:03:25, 29.96s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:CATGGCCACCCCTGCAGCAGTTACAACACTAAGAGCTCGTTCTTTTGCATCCTCTGTAAGTCCCAGGGCCAAGGGAGGGGAGCCTAGGCTATATTGGACCTAGATAGGAAGAG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  16%|█▌        | 24/150 [12:14<1:04:18, 30.62s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GGGACTTTGCAGGCAGCGGCGGCCGGGGGCGGAGCGGGATCGAGCCCTCGCCGAGGCCTGCCGCCATGGGCCCGCGCCGCCGCCGCCGCCTGTCACCCGGGCCGCGCGGGCCG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  17%|█▋        | 25/150 [12:39<1:00:41, 29.13s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:CTCTTTCTTAAGTCATTCGAAACACTGTCCCTGCGAGTTCTTTAAGTTCCCTGCAATCTGTACACAAGAGAAAGAGGGAGAGAGAGAGGGAGAGAGACAGAGAGAGCAGGGAT. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  17%|█▋        | 26/150 [13:08<59:47, 28.93s/it]  

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGAAGGAGCGAAACATCTTTGAGCAAGATGGGTCTCTACCGCATCCGCGTGTCCACTGGGGCCTCGCTCTATGCCGGTTCCAACAACCAGGTGCAGCTGTGGCTGGTCGGCCA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  18%|█▊        | 27/150 [13:36<58:58, 28.77s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:CCCGCGCGGTAGCAGCCAACGCCGGCCCCAGGCGGGTGCGCTGGGAGCCTGGGCCGGGAGCCGGGTGAGGGCGCCGAGAGGCTCGGTGGGCGCGGGCGGCGAGGTGAGCTGGG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  19%|█▊        | 28/150 [14:12<1:03:11, 31.08s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGATTTGCTCTGATTCTGACAATGCAGTGGGTATCAGAGCTCAGGTCCAGAGGAAAAGCCTGGAGTGAGTGAGGCTGCGAGCTTCCTCTTCAGAACTCTGCATGCTTATCGGG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  19%|█▉        | 29/150 [15:08<1:17:37, 38.49s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGAACTCTGCGCTACTGCCCACGCATCTCACAAAGTCTACTACGCCGCGCGTTCTCGGTTTCAACGCACCTCCAGGATTCAGGGCTTCCTAAGCTGCAATTCAGCAGGGCTCC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  20%|██        | 30/150 [15:34<1:09:29, 34.75s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGCGGGAAGACCATCTCTGCAAGTGCAGCATAGCCTCGGCCTAGGACAGCGGGAGTGCGTGGCCAAAGCTGTGAGCAGAGGCACAGGTGGTGGCAGACAGTAGAGGCGCCCCA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  21%|██        | 31/150 [16:13<1:11:17, 35.95s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GTCGGCCCTGCGGGCCTCTCTCCGTCGCCATGGAAACGAAAGCGGCCAAGTAGAGCTCCGTCCTGACGCGCCGCCTCCCGTGGGCTCCGGCCGGCTAAGCCGCGGCGGACAAC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  21%|██▏       | 32/150 [16:55<1:14:27, 37.86s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGTCAAGTGACGCGAAGCGGCCGGCCTGGGCGCCGACTGCAGAGCCGGGAGGCTGGTGGTCATGCCGGGGTTCCTGGTTCGCATCCTCCCTCTGTTGCTGGTTCTGCTGCTTC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  22%|██▏       | 33/150 [17:33<1:13:43, 37.80s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GTTTTTCAGCTCACTTCAAGGGTACCTGAAGCGAATTGGCACCAAAGCAGCAGCTGTATTGCCGCAGTTCTAGCTTCACCTTCACGATGTTTCCCTTGGTCAAAAGCGCACTA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  23%|██▎       | 34/150 [18:17<1:16:23, 39.51s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGAGCGGCCCCGGAGCGGCCGCAGCGCGGTGGTCTCGGCCCGGCTGCGCCAGAGTCCGCGCGATGGAGCCCCGGCCGCGGCGGCGGCGCAGGAGTCGCCCCCTGGTCGCCGCC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  23%|██▎       | 35/150 [20:13<2:00:11, 62.71s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ATGACGCGCGGCCGCGCCTGGGGGATGCGGCGGGCGGCGGCGGGGGCGGGCGGAGCGCGGGCGGCGGGGCCAACTGGGGGCGCCTCTCGCCTGCACCCCAATGCGGGACGCAG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  24%|██▍       | 36/150 [20:46<1:41:50, 53.60s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ATCGATCACAGGCTCACAACGCCCTTCCATTCATCGCTCTTATTCCAGTCCTGGTTACTACCCCACTTCCTGACATCAAACCAGGGAGAACGCGAGTGTTAGTTAACATGGCT. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  25%|██▍       | 37/150 [21:15<1:27:05, 46.25s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GCCATTTTGGATTGTGTGAGTTTCCGGGACGTTCGGAGGGTGGCCTCTCTCCCACCGGGTTCCGCATACCCCAGGCACCGGCCCGCATCCAAGTGTCAGGTTGGAGCCGGGAA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  25%|██▌       | 38/150 [21:59<1:25:20, 45.72s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GTTGTACAGAAAATTGCTGTTGGGATGAAGCTTTGCAGCCTTGCAGTCCTTGTACCCATTGTTCTCTTCTGTGAGCAGCATGTCTTCGCGTTTCAGAGGTAACCCAATAGAA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  26%|██▌       | 39/150 [22:37<1:20:24, 43.47s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ATTACTCAGCTTTTCATCTGCTGTGTTGAAAGCCAACTTATCATTTTTCACATTATTGTATACATCTAAAAGACATCATAAACGAAGACTGGAAGTGAAAACTGAAAAACTGA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  27%|██▋       | 40/150 [23:14<1:15:58, 41.44s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GTTTGTTTACTTGGCGAGACTTGGAGCTGAGGTCATTTGGAGCTGTTTAATACTGAAGAGCTGTTGAGCACTGGAAAGTGCTGTGTAACCCTGGAAAAGAACCGTGTAACGCT. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  27%|██▋       | 41/150 [23:53<1:13:38, 40.54s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGCTGCCAGCACGCGCAGAAGGCGGACGCAGGCGGGGCGGAAGTGGGCCCGGCGGGCTGGGGCGGCGGGAGTGCGGGTGGGCGTTTAAAGGGGCCTTCGGCACCCAGGTCGGT. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  28%|██▊       | 42/150 [24:22<1:07:11, 37.33s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGATGTGAGTCCTCAATGAGCTATAACCACAGCCATAAATATCTCTCAAAGATGAGGAACATTCTCATGATGTTGACACTGCAATTTTTTGACAATTTCCCAACACTCTTAAG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  29%|██▊       | 43/150 [24:55<1:04:01, 35.91s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:TCTTCCTCCAGCTGTTCCCACAGGCTCCATTATTCAAACTTTGGGGGAGGAAATCAGGGCTGGACAGATCATCAACTGCTGCTGCTGACAGACTGTGTTCCTGCCATGATGGG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  29%|██▉       | 44/150 [25:21<57:55, 32.79s/it]  

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:CATTTCACCAGCCCAGGCTGGCTTCTGCTGTTGACTGGCTGTGGCACCTCAAGCAGCCCCTTTCCCCTCTAGCCTCAGTTTATCACCGCAAGAGCTACCATTCATCTAGCACA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  30%|███       | 45/150 [25:54<57:57, 33.12s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ATGGCTTTGCCTAGAAGCCAAGGCCATTGGTCCAACAAAGACATCTTGAGGTTACTGGAATGCATGGAGAATAATCGCCCATCTGATGACAACAGCACGTTTAGCTCAACTCA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  31%|███       | 46/150 [28:08<1:49:33, 63.21s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:GCAGGCGCTGGCTGGCAGGTGTCGCTAACCGGACGGTGGTCGCCAGGGCGAGAGGCGGGAGCCGGAGAGGTGAGGCAGGACCCGGGCTCCACTGCCGCCTCTCCGAGCTCTTG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  31%|███▏      | 47/150 [28:44<1:34:29, 55.05s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ACGTGGCCGGGGTGGGACCTCAGGCGACCTGCGACCATCACTTTGTCTCCTCCTTCCTCCTTTGGGGCCGCCACCGCCAATCAGAGCCAGCGGATCCTGGTTGGAGTGCGACC. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  32%|███▏      | 48/150 [29:22<1:25:12, 50.12s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:AGGATTCCTGGAGTCTCCGGAGGGTGCCTTTGGCCTCTGGGATTCACCCGAGCCGTTGGCTTTTGCTCCCCCCACCCCACCCCCTGGCTTTTCTTGGCTTGGAGGGCAGCTGG. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  33%|███▎      | 49/150 [29:50<1:13:00, 43.37s/it]

{'task': 'sequence gene alias', 'question': "What are the aliases of the gene that contains this sequnece:ATGCTCAGTAGCCGCGGCGCTGCTGCTGGGCTGCTGGGCTGGCGCGGAGTCCACCCTGCCGTCTCCGCCTTGGCTTCTGGGCGTCCAGAAGGCCAGGCATTTGCCGCCTCTGA. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  33%|███▎      | 50/150 [30:21<1:06:00, 39.60s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Hemolytic anemia due to phosphofructokinase deficiency. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  34%|███▍      | 51/150 [30:47<58:34, 35.50s/it]  

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Distal renal tubular acidosis. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  35%|███▍      | 52/150 [31:16<54:41, 33.49s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Pseudohypoparathyroidism Ic. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  35%|███▌      | 53/150 [31:40<49:49, 30.82s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Glycine N-methyltransferase deficiency. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  36%|███▌      | 54/150 [31:59<43:40, 27.30s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Meesmann corneal dystrophy. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  37%|███▋      | 55/150 [32:18<39:13, 24.77s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Chronic atrial and intestinal dysrhythmia. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  37%|███▋      | 56/150 [32:41<37:43, 24.08s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Sensorineural deafness with mild renal dysfunction. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  38%|███▊      | 57/150 [33:10<39:35, 25.54s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Bile acid malabsorption. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  39%|███▊      | 58/150 [33:36<39:25, 25.71s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Immunodeficiency due to defect in MAPBP-interacting protein. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  39%|███▉      | 59/150 [33:57<36:54, 24.33s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Currarino syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  40%|████      | 60/150 [34:14<33:14, 22.16s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Intervertebral disc disease. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  41%|████      | 61/150 [34:45<36:48, 24.82s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Otofaciocervical syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  41%|████▏     | 62/150 [35:16<39:08, 26.69s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Brody myopathy. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  42%|████▏     | 63/150 [35:39<36:53, 25.44s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Neurodevelopmental disorder with gait disturbance. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  43%|████▎     | 64/150 [36:07<37:38, 26.26s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Mitochondrial DNA depletion syndrome 4A Alpers type. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  43%|████▎     | 65/150 [36:31<36:13, 25.57s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Nephropathy due to CFHR5 deficiency. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  44%|████▍     | 66/150 [36:54<34:57, 24.97s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Achromatopsia. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  45%|████▍     | 67/150 [37:17<33:29, 24.21s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Hyperphenylalaninemia. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  45%|████▌     | 68/150 [37:48<35:55, 26.29s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to EDICT syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  46%|████▌     | 69/150 [38:14<35:18, 26.15s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Gastrointestinal defects and immunodeficiency syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  47%|████▋     | 70/150 [38:49<38:37, 28.96s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Cleft palate with ankyloglossia. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  47%|████▋     | 71/150 [39:13<36:12, 27.50s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Neurodevelopmental disorder with nonspecific brain abnormalities and with or without seizures. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  48%|████▊     | 72/150 [39:42<36:20, 27.96s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Spondylocarpotarsal synostosis syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  49%|████▊     | 73/150 [40:03<32:55, 25.65s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Haim-Munk syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  49%|████▉     | 74/150 [40:24<30:55, 24.41s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Sialidosis. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  50%|█████     | 75/150 [40:47<29:47, 23.83s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Siddiqi syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  51%|█████     | 76/150 [41:08<28:26, 23.06s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Corneal fleck dystrophy. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  51%|█████▏    | 77/150 [41:34<29:05, 23.92s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Liver failure. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  52%|█████▏    | 78/150 [42:01<30:02, 25.03s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Proteasome-associated autoinflammatory syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  53%|█████▎    | 79/150 [42:21<27:50, 23.53s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Orofaciodigital syndrome XIV. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  53%|█████▎    | 80/150 [42:39<25:32, 21.90s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Trichoepithelioma. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  54%|█████▍    | 81/150 [43:05<26:23, 22.95s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Medullary thyroid carcinoma. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  55%|█████▍    | 82/150 [43:27<25:47, 22.75s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Type diabetes mellitus. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  55%|█████▌    | 83/150 [43:53<26:23, 23.63s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Buschke-Ollendorff syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  56%|█████▌    | 84/150 [44:13<24:58, 22.70s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Vascular malformation. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  57%|█████▋    | 85/150 [44:44<27:08, 25.06s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Acrocallosal syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  57%|█████▋    | 86/150 [45:04<25:09, 23.59s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Congenital disorder of deglycosylation. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  58%|█████▊    | 87/150 [45:44<29:50, 28.43s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Spinal muscular atrophy with congenital bone fractures. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  59%|█████▊    | 88/150 [46:20<31:53, 30.87s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to B-cell immunodeficiency. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  59%|█████▉    | 89/150 [46:45<29:33, 29.08s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Immunodeficiency with inflammatory disease and congenital thrombocytopenia. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  60%|██████    | 90/150 [47:14<28:49, 28.82s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Split-foot malformation with mesoaxial polydactyly. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  61%|██████    | 91/150 [47:49<30:13, 30.73s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Pigmented nodular adrenocortical disease. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  61%|██████▏   | 92/150 [48:17<29:07, 30.13s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Superoxide dismutase. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  62%|██████▏   | 93/150 [48:41<26:52, 28.29s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Leukoencephalopathy with dystonia and motor neuropathy. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  63%|██████▎   | 94/150 [49:03<24:34, 26.33s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Intracranial hemorrhage in brain cerebrovascular malformations. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  63%|██████▎   | 95/150 [49:31<24:35, 26.83s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Multiple system atrophy. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  64%|██████▍   | 96/150 [50:04<25:48, 28.67s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Cone dystrophy. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  65%|██████▍   | 97/150 [50:37<26:21, 29.84s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Holt-Oram syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  65%|██████▌   | 98/150 [50:58<23:35, 27.21s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Lichtenstein-Knorr syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  66%|██████▌   | 99/150 [51:17<21:05, 24.82s/it]

{'task': 'Disease gene location', 'question': "List chromosome locations of the genes related to Ablepharon-macrostomia syndrome. Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  67%|██████▋   | 100/150 [51:42<20:38, 24.77s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1217074595? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  67%|██████▋   | 101/150 [52:06<20:12, 24.75s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1241371358? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  68%|██████▊   | 102/150 [52:31<19:52, 24.85s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1481036795? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  69%|██████▊   | 103/150 [52:57<19:34, 24.99s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1318850293? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  69%|██████▉   | 104/150 [53:22<19:10, 25.01s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs996319727? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  70%|███████   | 105/150 [53:51<19:44, 26.33s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs577757681? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  71%|███████   | 106/150 [54:15<18:48, 25.64s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1294482311? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  71%|███████▏  | 107/150 [54:38<17:42, 24.70s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs979970652? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  72%|███████▏  | 108/150 [55:02<17:15, 24.66s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1029002401? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  73%|███████▎  | 109/150 [55:27<16:52, 24.69s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1015227? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  73%|███████▎  | 110/150 [56:00<18:00, 27.01s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1278530438? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  74%|███████▍  | 111/150 [56:22<16:39, 25.62s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs745325402? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  75%|███████▍  | 112/150 [56:48<16:16, 25.69s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs4704888? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  75%|███████▌  | 113/150 [57:16<16:23, 26.57s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1201372088? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  76%|███████▌  | 114/150 [57:42<15:45, 26.25s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1324451169? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  77%|███████▋  | 115/150 [58:06<14:52, 25.51s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs983419152? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  77%|███████▋  | 116/150 [58:34<14:58, 26.43s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs745940901? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  78%|███████▊  | 117/150 [59:01<14:35, 26.54s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1303680136? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  79%|███████▊  | 118/150 [59:25<13:46, 25.82s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1053827498? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  79%|███████▉  | 119/150 [59:51<13:19, 25.78s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1350154096? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  80%|████████  | 120/150 [1:00:13<12:16, 24.56s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1037441458? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  81%|████████  | 121/150 [1:00:33<11:18, 23.39s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs900408143? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  81%|████████▏ | 122/150 [1:00:57<10:54, 23.39s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs900532834? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  82%|████████▏ | 123/150 [1:01:19<10:24, 23.14s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1281200566? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  83%|████████▎ | 124/150 [1:01:48<10:45, 24.81s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1431266687? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  83%|████████▎ | 125/150 [1:02:12<10:11, 24.47s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs900745020? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  84%|████████▍ | 126/150 [1:02:37<09:53, 24.73s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1188606225? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  85%|████████▍ | 127/150 [1:03:04<09:48, 25.59s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs902730377? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  85%|████████▌ | 128/150 [1:03:44<10:52, 29.68s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1044115387? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  86%|████████▌ | 129/150 [1:04:14<10:28, 29.94s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs979980368? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  87%|████████▋ | 130/150 [1:04:27<08:14, 24.73s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1161130206? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  87%|████████▋ | 131/150 [1:04:49<07:36, 24.03s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs910422326? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  88%|████████▊ | 132/150 [1:05:17<07:31, 25.08s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1035892430? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  89%|████████▊ | 133/150 [1:05:48<07:36, 26.87s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1255093658? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  89%|████████▉ | 134/150 [1:06:13<06:59, 26.22s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs1257276516? Let's decompose the question to sub-questions and solve them step by step."}


Evaluating:  90%|█████████ | 135/150 [1:06:39<06:32, 26.20s/it]

{'task': 'SNP gene function', 'question': "What is the function of the gene associated with SNP rs563369098? Let's decompose the question to sub-questions and solve them step by step."}


# calculate cosine similarity

In [None]:
import json
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import os

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


tokenizer = AutoTokenizer.from_pretrained("FremyCompany/BioLORD-2023")
model = AutoModel.from_pretrained("FremyCompany/BioLORD-2023")


with open("/work/bioinformatics/s440708/MODULE_3_MATERIALS/outputs/genehop_results.csv", "r") as f:
    pred_data = json.load(f)

with open("/work/bioinformatics/s440708/MODULE_3_MATERIALS/data/genehop.json", "r") as f:
    gold_data = json.load(f)


results = []
for entry in pred_data:
    question = entry['question']
    pred_answer = entry['answer']

    found = False
    for task, qa_dict in gold_data.items():
        for gold_q, gold_a_list in qa_dict.items():
            if gold_q.strip() == question.strip():
                found = True
                for gold_answer in gold_a_list:
                   
                    inputs = tokenizer([pred_answer, gold_answer], padding=True, truncation=True, return_tensors="pt")
                    with torch.no_grad():
                        outputs = model(**inputs)
                    embeddings = mean_pooling(outputs, inputs['attention_mask'])
                    embeddings = F.normalize(embeddings, p=2, dim=1)
                    cosine_sim = F.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
                    results.append({
                        'question': question,
                        'predicted_answer': pred_answer,
                        'gold_answer': gold_answer,
                        'cosine_similarity': cosine_sim
                    })
    if not found:
        results.append({
            'question': question,
            'predicted_answer': pred_answer,
            'gold_answer': None,
            'cosine_similarity': None,
            'note': 'No gold answer found'
        })

import pandas as pd
df = pd.DataFrame(results)


In [None]:
import json
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from tqdm import tqdm
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("FremyCompany/BioLORD-2023")
model = AutoModel.from_pretrained("FremyCompany/BioLORD-2023")

# Helper function for mean pooling
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

# Load gold data
with open("/work/bioinformatics/s440708/MODULE_3_MATERIALS/data/genehop.json", "r") as f:
    gold_data = json.load(f)

# Load predictions
with open("/work/bioinformatics/s440708/MODULE_3_MATERIALS/outputs/genehop_results.csv", "r") as f:
    pred_data = json.load(f)

results = []
gold_entries = []
for task_name, task_dict in gold_data.items():
    
    for question, answers in task_dict.items():
            gold_entries.append({
                "task": task_name,
                "question": question.replace("Let's decompose the question to sub-questions and solve them step by step.","").strip(),
                "gold_answers": answers
            })

# Compute similarity
for i, pred_entry in enumerate(pred_data):
    question = pred_entry['question'].strip()
    predicted_answer = pred_entry['answer'].strip()
    task_name = pred_entry['task']

    match = gold_entries[i]
    gold_answer = match['gold_answers']
    if isinstance(gold_answer, list):
        gold_answer = " ".join(gold_answer)

   
    inputs = tokenizer([predicted_answer, gold_answer], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = mean_pooling(outputs, inputs['attention_mask'])
        embeddings = F.normalize(embeddings, p=2, dim=1)
        cosine_sim = F.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
        print(cosine_sim)
        results.append({
            "task": task_name,
            "question": question,
            "predicted_answer": predicted_answer,
            "gold_answer": gold_answer,
            "cosine_similarity": cosine_sim
        })
    
   

df = pd.DataFrame(results)



100%|██████████| 3/3 [00:00<00:00, 5013.11it/s]


0.22228483855724335
0.12435595691204071
0.2232302576303482
0.17944329977035522
0.241394504904747
0.3022831976413727
0.3317318558692932
0.22889623045921326
0.2430797666311264
0.44673436880111694
0.5258044004440308
0.31004586815834045
0.20812499523162842
0.26675739884376526
0.256971538066864
0.16082169115543365
0.24365420639514923
0.17847153544425964
0.28906893730163574
0.6071949005126953
0.02541375160217285
0.5200710296630859
0.1370559185743332
0.34150242805480957
0.19680416584014893
0.1015663743019104
0.37616950273513794
0.3683997690677643
0.17296752333641052
0.27167296409606934
0.37701550126075745
0.4135938286781311
0.24612268805503845
0.16213443875312805
0.3622819781303406
0.31128329038619995
0.15481258928775787
0.27907854318618774
0.2902039587497711
0.39749252796173096
0.37894707918167114
0.2619859576225281
0.3804163932800293
0.16254813969135284
0.18180790543556213
0.38003015518188477
0.29259175062179565
0.39297351241111755
0.1963275521993637
0.17553435266017914
0.2015819549560547
0

In [30]:
df['cosine_similarity'].mean()

np.float64(0.2703053500503302)

In [36]:

df.to_csv(os.path.join(data_config['output_path'], "genehop_ollama_results.csv"))

# 