In [None]:
import os

from google.colab import drive

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')

# Work from the project folder so relative paths used later resolve inside it.
os.chdir("/content/drive/MyDrive/Research Project")

Mounted at /content/drive


In [None]:
# Colab downloads
!pip install datasets==2.19.1
!pip install ollama==0.2.1
!apt-get install -y pciutils

In [None]:
# Download the Ollama install script from ollama.com and run it with sh.
# NOTE(review): this pipes a remote script straight into a shell; the script is not
# pinned or checksummed, so its content may change between runs.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# For ollama google colab
import socket
import subprocess
import time

# Start the Ollama server in the background. Passing an argument list (instead of
# shell=True) makes `process` the server process itself rather than a wrapping shell,
# so process.terminate() / process.poll() act on the server.
process = subprocess.Popen(["ollama", "serve"])

# Wait until the server accepts connections so the following cells (`ollama pull`,
# ollama.generate) do not race the server start-up on "Run All".
# Assumes Ollama's default bind address 127.0.0.1:11434 (i.e. OLLAMA_HOST is not overridden).
for _ in range(30):
    try:
        with socket.create_connection(("127.0.0.1", 11434), timeout=1):
            break
    except OSError:
        time.sleep(1)
else:
    print("WARNING: Ollama server did not become reachable on 127.0.0.1:11434 within 30s")

In [None]:
# Pull the model used for generation. The tag must match the `model` argument
# passed to ollama.generate in generate_llm_response.
!ollama pull mistral:7b-instruct-v0.3-q5_K_M

In [2]:
# All Imports
from datasets import load_dataset
import pandas as pd
import ollama
import time

# Seed passed to dataset.shuffle so the shuffled row order is the same on every run.
SEED = 2504

In [3]:
# Load in the shuffled dataset with given seed
dataset = load_dataset("rajpurkar/squad_v2")
dataset.set_format(type="pandas")
dataset_shuffled = dataset.shuffle(seed=SEED) # shuffle dataset (same seed => same order on every run)
train_dataset = dataset_shuffled["train"].to_pandas() # training split as a pandas DataFrame

# Load in results file
results_file = "data/mistral_run2_by_topic.csv"

try:
    results_df = pd.read_csv(results_file) # Read in results file that is already created

except FileNotFoundError:
    # Results file not created. Need to create it
    required_columns = ["ID", "topic", "question", "context", "true_answer", "llm_answer"] # These are the chosen columns (Can just append more if needed afterwards)
    results_df = pd.DataFrame(columns=required_columns)

    # to_csv does not create missing parent folders, so make sure the folder exists first
    results_dir = os.path.dirname(results_file)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    results_df.to_csv(results_file, index=False) # write header-only csv so later batches can be appended

In [None]:
def get_top_topics_df(df, num_topics):
    '''
    Keep only the rows of df whose "title" is one of the num_topics most frequent titles.

    The original row order is preserved and the returned DataFrame gets a fresh 0..n-1 index.
    '''
    title_frequencies = df["title"].value_counts()
    kept_titles = title_frequencies.nlargest(num_topics).index
    is_kept = df["title"].isin(kept_titles)
    return df.loc[is_kept].reset_index(drop=True)

# Keep the rows of the 20 largest topics (more than currently needed, in case we want them later).
# NOTE(review): this rebinds `results_df`, which the loading cell also uses for the
# contents of results_file - confirm which of the two later cells expect.
results_df = get_top_topics_df(df=train_dataset, num_topics=20)
results_df.shape

In [None]:
def get_index_resume(results_df):
    '''
    Return the index to resume from, i.e. the number of rows in results_df.
    '''
    rows_done = len(results_df.index)
    return rows_done

def generate_llm_response(question, context, prompt):
    '''
    Fill the prompt template with the given context and question, send it to the
    Ollama model and return the model's reply with surrounding whitespace removed.

    prompt must contain the {context} and {question} placeholders.
    '''
    filled_prompt = prompt.format(context=context, question=question)
    generation_options = {"temperature" : 0.0}  # sampling temperature passed to Ollama
    reply = ollama.generate(
        model='mistral:7b-instruct-v0.3-q5_K_M',
        prompt=filled_prompt,
        options=generation_options,
    )
    return reply["response"].strip()

def _append_results_to_csv(rows, output_file):
    # Append rows (a list of row lists) to output_file without a header; no-op when rows is empty.
    if not rows:
        return
    pd.DataFrame(rows).to_csv(output_file, mode="a", index=False, header=False)


def get_llm_output_and_store_res(train_dataset, starting_index, prompt,
                                 batch_size=500, max_batches=20, output_file=None):
    '''
    Query the LLM for every row of train_dataset from starting_index onwards and
    append the answers to a csv file in batches.

    Parameters:
        train_dataset : DataFrame with the columns "id", "title", "question", "context"
                        and "answers" (each "answers" value is indexed with ["text"]).
        starting_index: positional index of the first row to process.
        prompt        : template passed on to generate_llm_response.
        batch_size    : number of answers collected before each write to the csv.
        max_batches   : stop after this many full batches have been written
                        (None = run until the end of train_dataset).
        output_file   : csv to append to; defaults to the global results_file.

    Rows are appended without a header in the order
    [id, title, question, context, answers text, llm answer].
    Answers collected but not yet written are flushed even when the loop is
    interrupted or raises, so the number of rows in the csv stays usable as the
    resume index.
    '''
    target_file = results_file if output_file is None else output_file
    pending = []
    batches_written = 0
    try:
        for idx in range(starting_index, len(train_dataset)):
            if idx % 50 == 0 :
                print("Entry : ", idx)

            # Look the row up once and extract the required fields (ID, topic, question, context, answer)
            row = train_dataset.iloc[idx]
            entry_id = row["id"]
            topic = row["title"]
            question = row["question"].strip()
            context = row["context"].strip()
            dataset_answer = row["answers"]["text"]

            llm_answer = generate_llm_response(question, context, prompt) # get the LLM answer
            pending.append([entry_id, topic, question, context, dataset_answer, llm_answer]) # column order of the output file

            # Write a batch in one go, quicker than writing every time entry done
            if len(pending) >= batch_size:
                _append_results_to_csv(pending, target_file)
                print("WRITING BATCH")
                pending = []
                batches_written += 1
                if max_batches is not None and batches_written >= max_batches:
                    break
    finally:
        # Flush the partial last batch; also runs on KeyboardInterrupt / errors
        _append_results_to_csv(pending, target_file)


In [None]:
## generate LLM output
# Prompt template; {context} and {question} are filled in by generate_llm_response.
prompt_1 = '''Using ten words or fewer, answer the question using only the context that is provided. If the question can't be answered, then return exactly "Context does not contain the answer.", and nothing else.
Context : {context}
Question : {question}'''

# Rows to send to the LLM: the questions of the 20 largest topics.
# Kept under its own name so it cannot be confused with the stored results.
top_topics_df = get_top_topics_df(train_dataset, num_topics=20)

# Resume point = number of answers already stored in results_file.
# Re-read from disk so the count reflects everything written so far. Using the
# length of the question frame here would start past its last row and process nothing.
completed_results_df = pd.read_csv(results_file)
starting_index = get_index_resume(results_df=completed_results_df)

start_time = time.time()
print("Starting Index", starting_index)
get_llm_output_and_store_res(
    train_dataset=top_topics_df,
    starting_index=starting_index,
    prompt=prompt_1
    )
print(time.time() - start_time) # elapsed seconds