In [30]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from typing import Optional
from pydantic import BaseModel, Field
import pandas as pd 
import os 

In [31]:
# Location of the uncleaned radiologyinfo Q&A extract, relative to this notebook.
data_path = "../data/radiologyinfo-QA-extracted-uncleaned.csv"

# Load the whole CSV into memory; later cells iterate over `df` row by row.
df = pd.read_csv(filepath_or_buffer=data_path)

# Show the first five rows as this cell's output.
df.head(n=5)

Unnamed: 0,topic_name,Question,Answer
0,anal-cancer-therapy,Anal cancer overview,Anal cancer is a cancer that begins in the anu...
1,anal-cancer-therapy,What are my treatment options?,\n\nTreatment options overview\nHow can I choo...
2,anal-cancer-therapy,What happens during radiation therapy?,Radiation therapy uses high energy x-rays (pho...
3,anal-cancer-therapy,What are possible side effects of radiation th...,Side effects of radiation treatment include pr...
4,anal-cancer-therapy,What kind of treatment follow-up should I expect?,"After your treatment has ended, your physician..."


In [85]:
# Structured-output schema: this model is passed to `llm.with_structured_output`
# further down, so the chain's result exposes a single `question` attribute.
# Pydantic includes the class docstring and the field description in the
# model's JSON schema, so both texts are kept verbatim here.
class Question(BaseModel):
    """Directed Question based on Topic and Question Text"""
    # NOTE(review): the description mentions "answer", but the prompt template in
    # this notebook only fills in TOPIC_NAME and QUESTION_TEXT — confirm intent.
    question: str = Field(description="The precise and directed question based on topic, question text and answer")

In [82]:
# System turn: sets the assistant's task and the constraints on the generated
# question. The text (including its leading whitespace) is sent as-is.
_system_turn = (
    "system",
    """
            You are a helpful assistant generates precise and directed question given a TOPIC_NAME and QUESTION_TEXT.
            The generated question should be readable by a 6th grader, short and strictly related to topic name. Don't hallucinate.
            """,
)

# Human turn: a template with two placeholders, `topic_name` and `Question`,
# which must be present as keys in the dict passed to the chain.
_human_turn = (
    "human",
    """
            TOPIC_NAME : {topic_name}
            QUESTION_TEXT : {Question}
            """,
)

# Two-message chat prompt: system instructions followed by the per-row input.
prompt = ChatPromptTemplate.from_messages([_system_turn, _human_turn])

In [86]:
# Name of the Ollama model used for question generation.
OLLAMA_MODEL = "llama3.1:8b"

llm = ChatOllama(model=OLLAMA_MODEL)

# Bind the `Question` schema so results can be read via `.question`.
structured_llm = llm.with_structured_output(Question)

# Prompt feeds the structured model; invoke with a dict that provides the
# `topic_name` and `Question` keys used by the prompt template.
question_chain = prompt | structured_llm

# Single-row smoke test (uncomment to try the chain on one record):
# input_dict = df.iloc[5].to_dict()
# output = question_chain.invoke(input_dict)


In [87]:
# Spot-check the chain on the first rows before running the full dataset.
# `df.head(N)` bounds the loop directly, replacing the manual counter + break;
# it also copes with frames shorter than N rows.
N_PREVIEW_ROWS = 20

for _, row in df.head(N_PREVIEW_ROWS).iterrows():
    # The prompt reads the `topic_name` and `Question` keys from this dict.
    output = question_chain.invoke(row.to_dict())

    print("Original Question:", row['Question'])
    print("Generated Question:", output.question)

Original Question: Anal cancer overview
Generated Question: What are the main facts about anal cancer treatment?
Original Question: What are my treatment options?
Generated Question: What are the typical treatments for anal cancer?
Original Question: What happens during radiation therapy?
Generated Question: What happens during anal cancer radiation therapy?
Original Question: What are possible side effects of radiation therapy?
Generated Question: What are possible side effects of radiation therapy in anal cancer treatment?
Original Question: What kind of treatment follow-up should I expect?
Generated Question: What kind of treatment follow-up should I expect for anal cancer therapy?
Original Question: Are there any new developments in treating my disease?
Generated Question: What are the latest advancements in anal cancer therapy treatment?
Original Question: Lung cancer overview
Generated Question: What are the general facts about lung cancer treatments?
Original Question: Pre-treat

In [90]:
%%writefile ../scripts/gen_ques-llm.py
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from typing import Optional
from pydantic import BaseModel, Field
import pandas as pd 
import os 
import argparse

# Define the output schema for the LLM response.
# This model is passed to `llm.with_structured_output` in the `__main__` block,
# and the script reads the result through its `question` attribute.
# Pydantic includes the class docstring and the field description in the
# model's JSON schema, so both texts are kept verbatim here.
class Question(BaseModel):
    """Directed Question based on Topic and Question Text"""
    # NOTE(review): the description mentions "answer", but the prompt template in
    # this script only fills in TOPIC_NAME and QUESTION_TEXT — confirm intent.
    question: str = Field(description="The precise and directed question based on topic, question text and answer")

# System turn of the chat prompt, expressed as a (role, content) pair.
# The content is sent verbatim, leading whitespace included.
_SYSTEM_ROLE = "system"
_SYSTEM_TEXT = """
    You are a helpful assistant generates precise and directed question given a TOPIC_NAME and QUESTION_TEXT.
    The generated question should be readable by a 6th grader, short and strictly related to topic name. Don't hallucinate.
    """
system_message = (_SYSTEM_ROLE, _SYSTEM_TEXT)

# Human turn: a template with two placeholders, `topic_name` and `Question`,
# which must be present as keys in the dict passed to the chain.
human_message = (
    "human",
    """
            TOPIC_NAME : {topic_name}
            QUESTION_TEXT : {Question}
            """,
)

# Chat prompt for the LLM: system instructions followed by the per-row input.
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

def generate_questions(df, chain, log_every=20):
    """Run ``chain`` over every row of ``df`` and collect the generated questions.

    Args:
        df: DataFrame whose rows are passed to the chain as ``row.to_dict()``.
        chain: Object exposing ``invoke(dict)``; the returned value must have a
            ``question`` attribute.
        log_every: Print a progress line each time this many rows have been
            processed (values below 1 are treated as 1).

    Returns:
        A tuple ``(questions, done, failed)``. ``questions`` has one entry per
        row, in row order, with ``None`` for rows whose generation failed;
        ``done`` and ``failed`` are the success and failure counts.

    Only ``Exception`` subclasses are treated as per-row failures, so
    ``KeyboardInterrupt`` and ``SystemExit`` still stop the run.
    """
    log_every = max(1, int(log_every))
    total = len(df)
    questions = []
    done = 0
    failed = 0

    for index, row in df.iterrows():
        try:
            response = chain.invoke(row.to_dict())
            questions.append(response.question)
            done += 1
        except Exception as exc:
            # Keep the output aligned with the input rows and report the cause.
            questions.append(None)
            failed += 1
            print(f"Row {index} failed: {exc!r}")

        # Gate progress on rows processed (not only successes) so a failing
        # row cannot trigger repeated prints or suppress them entirely.
        processed = done + failed
        if processed % log_every == 0 or processed == total:
            print(f"Done={done}/{total} Failed={failed}/{total}")

    return questions, done, failed


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # `required=True` replaces the former `assert ... is not None` checks,
    # which are stripped when Python runs with -O.
    parser.add_argument("--input-data-path", type=str, required=True, help="local path of input dataset")
    parser.add_argument("--output-path", type=str, required=True, help="Path to save output file")

    args = parser.parse_args()

    input_data_path = args.input_data_path
    output_path = args.output_path

    if not os.path.exists(input_data_path):
        # Prints the usage text plus this message and exits.
        parser.error(f"{input_data_path} does not exists")

    # Create parent directories. Skip when the output path has no directory
    # component (dirname is ''), and tolerate a directory that already exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    print("Input Dataset Path:", input_data_path)
    print("Output Path:",output_path)

    llm = ChatOllama(model="llama3.1:8b")
    structured_llm = llm.with_structured_output(Question)
    question_chain = prompt | structured_llm
    print("LLM Chain Created")

    df = pd.read_csv(input_data_path)

    generated_responses, done, failed = generate_questions(df, question_chain)

    # One generated question per input row; None marks rows that failed.
    df['Generated Question'] = generated_responses
    df.to_csv(output_path, index=False)

Writing ../scripts/gen_ques-llm.py


In [89]:
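# One-time setup: the %%writefile cell above writes into ../scripts, so that
# directory has to exist first. Uncomment and run once if it is missing.
#! mkdir ../scripts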