In [None]:
!pip install openai

In [3]:
# All Imports
import os

import pandas as pd
from datasets import load_dataset
from openai import OpenAI

# Seed handed to dataset.shuffle(); keeping it fixed keeps the shuffled row
# order identical between runs, which the index-based resume logic relies on.
SEED = 2504

In [5]:
## Load in the shuffled dataset with given seed
dataset = load_dataset("rajpurkar/squad_v2")
dataset.set_format(type="pandas")
dataset_shuffled = dataset.shuffle(seed=SEED)  # deterministic shuffle (fixed seed)
train_dataset = dataset_shuffled["train"]  # keep only the training split
train_dataset = train_dataset.to_pandas()

## Load the results file, creating an empty one (header only) on first run
results_file = "data/mistral_7b_v2_instruct_squad_2_results.csv"
try:
    results_df = pd.read_csv(results_file)  # results file already exists
except FileNotFoundError:
    # Results file not created yet. These are the chosen columns
    # (more can be appended afterwards if needed).
    required_columns = ["ID", "topic", "question", "context", "true_answer", "llm_answer"]
    results_df = pd.DataFrame(columns=required_columns)

    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing the header-only file.
    os.makedirs(os.path.dirname(results_file) or ".", exist_ok=True)
    results_df.to_csv(results_file, index=False)  # persist so later batches can append

In [6]:
# Create the OpenAI client against the locally hosted server
client = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lm-studio",
)

In [7]:
## Re-read the results file from disk
results_file = "data/mistral_7b_v2_instruct_squad_2_results.csv"
# Reading again picks up every row that has been appended to the CSV so far
results_df = pd.read_csv(filepath_or_buffer=results_file)

In [8]:
import time

def generate_llm_answer_lmstudio(
    question,
    context,
    prompt,
    model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    temperature=0.0,
):
    """Ask the chat model behind the module-level ``client`` one question.

    Args:
        question: Question text substituted into the ``{question}`` placeholder.
        context: Passage substituted into the ``{context}`` placeholder.
        prompt: Template string containing ``{context}`` and ``{question}``.
        model: Model identifier sent with the request.
        temperature: Sampling temperature; 0.0 by default to help control
            verbosity.

    Returns:
        The text of the first choice with surrounding whitespace removed, or
        an empty string when the response carries no text content.
    """
    # Construct prompt using the specific question and context for given entry
    formatted_prompt = prompt.format(context=context, question=question)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": formatted_prompt}],
        temperature=temperature,
    )

    # The message content may be None; fall back to "" so that a single empty
    # reply does not raise AttributeError and abort a long-running batch.
    content = response.choices[0].message.content
    return (content or "").strip()

def _append_results_batch(rows, path):
    """Append already-formatted result rows to the CSV at ``path`` (no header)."""
    pd.DataFrame(rows).to_csv(path, mode="a", index=False, header=False)
    print("WRITING BATCH")


def get_llm_output_and_store_results(
    train_dataset,
    starting_index,
    prompt,
    generate_llm_answer,
    batch_size=25,
    max_batches=20,
    output_file=None,
):
    """Generate LLM answers for dataset rows and append them to the results CSV.

    Rows are processed in order starting at ``starting_index``. Results are
    buffered and appended to the CSV every ``batch_size`` entries. Any
    partially filled buffer is written when the loop ends, including when it
    ends because of an exception, so finished answers are never dropped.

    Args:
        train_dataset: DataFrame with ``id``, ``title``, ``question``,
            ``context`` and ``answers`` columns, where ``answers`` is indexed
            with ``"text"``.
        starting_index: Positional index of the first row to process.
        prompt: Prompt template forwarded to ``generate_llm_answer``.
        generate_llm_answer: Callable ``(question, context, prompt) -> str``
            that produces the answer for one entry.
        batch_size: Number of entries buffered before each CSV append.
        max_batches: Stop after this many full batches have been written;
            ``None`` processes the dataset to the end.
        output_file: CSV path to append to; defaults to the module-level
            ``results_file``.

    Returns:
        None. Results are written to the CSV and progress is printed.
    """
    if output_file is None:
        output_file = results_file

    pending = []
    batches_written = 0
    try:
        for idx in range(starting_index, len(train_dataset)):
            if idx % 5 == 0:
                print("Entry : ", idx)

            # Fetch the row once instead of re-indexing the frame per field
            row = train_dataset.iloc[idx]
            question = row["question"]
            context = row["context"]

            started = time.perf_counter()
            # Use the injected generator rather than a hard-coded backend
            llm_answer = generate_llm_answer(question, context, prompt)
            print("Time : ", time.perf_counter() - started)
            print(llm_answer)

            # Column order must match the header of the results file
            pending.append(
                [row["id"], row["title"], question, context, row["answers"]["text"], llm_answer]
            )

            # Write a batch in one go, quicker than writing after every entry
            if len(pending) >= batch_size:
                _append_results_batch(pending, output_file)
                pending = []
                batches_written += 1
                if max_batches is not None and batches_written >= max_batches:
                    break
    finally:
        # Flush the trailing partial batch so an interrupted or finished run
        # keeps every answer; resuming by row count then continues correctly.
        if pending:
            _append_results_batch(pending, output_file)
    
def get_start_index(results_df):
    """Return the dataset position at which answer generation should resume.

    One row is stored per processed entry, so the number of rows already in
    the results equals the index of the next unprocessed entry.
    """
    rows_already_stored = len(results_df)
    return rows_already_stored

In [None]:
## Build the prompt template, work out where to resume, then generate answers
prompt_template = '''Using only the context that is provided, answer the question as accurately as possible. Provide only the answer to the specific question asked, and keep it as short and concise as possible. Do not include any additional information and do not restate the question or context. If you believe the answer is not provided within the context, then you can say that you don't know.
Context : {context}
Question : {question}
'''

# Resume right after the entries that already have a stored result
starting_index = get_start_index(results_df)
print("Starting Index", starting_index)

get_llm_output_and_store_results(
    train_dataset,
    starting_index,
    prompt_template,
    generate_llm_answer_lmstudio,
)