# Code for generating the training data for fine-tuning
The dataset generated by this code is also available on Hugging Face at [SuLLMerica/TeleQnA-prompt-with-context-phinetune](https://huggingface.co/datasets/SuLLMerica/TeleQnA-prompt-with-context-phinetune).

In [13]:
import pandas as pd
import ujson as json
import os
import traceback
from Source.query import Query
from Source.maneger_dataset import get_embeddings_by_labels
from Source.generate_question import generate_questions_training
from Source.enhancement_query import EnhancementQuery
from Source.get_RAG_context import Get_RAG_Context
from tqdm import tqdm
import sqlite3


# Retrieval settings, passed positionally to Get_RAG_Context for every question.
# NOTE(review): presumably the total number of clusters, how many of the
# closest clusters are searched, and how many chunks are returned — confirm
# against Source.get_RAG_context.
NUM_CLUSTERS = 18
TOP_K_CLUSTERS = 8
TOP_K_CHUNCKS = 5
# SQLite database file opened with sqlite3.connect for context retrieval
DATABASE_PATH="cluster_data_BisectingKMeans_18_250_chunksize.db"
# Training questions; loaded as JSON despite the .txt extension
TRAIN_FILE_PATH = "./Data/TeleQnA_training.txt"
# Pre-generated questions-with-context JSON, loaded when the generation step is skipped
TRAIN_DATA_WITH_RAG_PATH = "./Data/intermediates/TeleQnA_Train_With_RAG_Context.json"
# Terms / abbreviations definition files handed to EnhancementQuery
PATH_TERMS_FILE = "./Data/TermsAndDefinitions/terms_definitions.json"
PATH_ABBREVIATIONS_FILE = "./Data/TermsAndDefinitions/abbreviations_definitions.json"
# Model identifier passed to AutoTokenizer.from_pretrained
MODEL_ID = "microsoft/phi-2"




## Getting the data and generating context (only run if not using our provided data with generated context)
Run the next two cells only if you want to generate the RAG context yourself; otherwise, skip ahead to the [Getting the provided RAG data](#get-rag) section.

In [14]:
# Load the training questions (a JSON object keyed by question label).
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default locale encoding.
with open(TRAIN_FILE_PATH, encoding="utf-8") as f:
    train_data = json.load(f)

# Helper used to look up the terms and abbreviations for each question
enhacenment_query = EnhancementQuery(file_name_terms=PATH_TERMS_FILE, file_name_abbreviations=PATH_ABBREVIATIONS_FILE)

# Flatten the keyed questions into a list of records
train_data_json = []
for question_key, entry in train_data.items():
    # assumes keys look like "question <id>" — the second space-separated
    # token must be an integer
    question_id = int(question_key.split(" ")[1])

    terms, abreviations = enhacenment_query.define_TA_question(entry["question"])

    data = {
        "question": entry["question"],
        "question_id": question_id,
        "terms": terms,
        "abbreviations": abreviations,
        "answer": entry["answer"],
        "explanation": entry["explanation"],
        "category": entry["category"],
    }

    # Questions carry between one and five options; copy only those present,
    # in numeric order.
    for option_number in range(1, 6):
        option_key = f"option {option_number}"
        if option_key in entry:
            data[option_key] = entry[option_key]

    train_data_json.append(data)

    



In [15]:
 # Create a progress bar
pb = tqdm(
    train_data_json,
    total=len(train_data_json),
    desc="Generating RAG Contexts",
    unit="question",
)

# One record per question, each extended with its retrieved context
train_data_json_with_context = []

# Connect to the SQLite database
conn = sqlite3.connect(DATABASE_PATH)

# try/finally guarantees the connection is closed even if the loop aborts
try:
    for question_data in pb:
        question = question_data["question"]

        # Keep only the options that are present and hold a real value;
        # the strings "nan" and "" are treated as missing.
        options = {}
        for option_number in range(1, 6):
            option_key = f"option {option_number}"
            if option_key not in question_data:
                continue
            option_text = str(question_data[option_key])
            if option_text != "nan" and option_text != "":
                options[option_key] = option_text

        # Terms and abbreviations fall back to None when empty or "nan"
        terms = None
        if str(question_data["terms"]) != "nan" and question_data["terms"] != "":
            terms = question_data["terms"]

        abbreviations = None
        if str(question_data["abbreviations"]) != "nan" and question_data["abbreviations"] != "":
            abbreviations = question_data["abbreviations"]

        # Generate the RAG context. Retrieval is best-effort: a failure is
        # reported and the question is kept with an empty context. Without the
        # explicit reset, `context` would be undefined on the first question
        # or silently reuse the previous question's context.
        try:
            context = Get_RAG_Context(question, conn, NUM_CLUSTERS, TOP_K_CLUSTERS, TOP_K_CHUNCKS)
        except Exception as e:
            print(f"An error occurred: {e}")
            print(traceback.format_exc())
            context = []

        train_data_json_with_context.append({
            "question": question,
            "question_id": question_data["question_id"],
            "options": options,
            "terms": terms,
            "abbreviations": abbreviations,
            "context": context,
            "answer": question_data["answer"],
            "explanation": question_data["explanation"],
            "category": question_data["category"],
        })
finally:
    # Close the connection to the database
    conn.close()

Generating RAG Contexts:   0%|          | 0/1461 [02:08<?, ?question/s]


## <a id="get-rag"></a>Getting the provided RAG data
Run this section if you skipped the generation step above.

In [None]:
# Load the pre-generated questions-with-context records. The encoding is
# pinned to UTF-8 so parsing does not depend on the platform's default
# locale encoding.
with open(TRAIN_DATA_WITH_RAG_PATH, "r", encoding="utf-8") as file:
    train_data_json_with_context = json.load(file)

## Preprocess the data for training

In [None]:
# NOTE(review): AutoTokenizer is not imported in any visible cell; presumably
# `from transformers import AutoTokenizer` has to be run first — confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# One {"text": <formatted dialogue>} entry per training question
dataset = []

pb = tqdm(train_data_json_with_context, desc="Processing questions", total=len(train_data_json_with_context), unit="Question")

for question_iter in pb:
    question = str(question_iter["question"])

    # Render the available options as "\n<number>. <text>" lines; the number
    # is the option's own index, so gaps in the options are preserved.
    options = question_iter.get("options", {})
    option_lines = []
    for option_number in range(1, 6):
        option_key = f"option {option_number}"
        if option_key in options:
            option_lines.append(f"\n{option_number}. " + str(options[option_key]))

    merged_question = (
        question
        + "\n"
        + "".join(option_lines)
        + "\n\n"
        + "Choose the correct option from the above options"
    )

    # Concatenate the retrieved chunks with no separator
    context = "".join(question_iter["context"])

    # Terms and abbreviations are included only when they are non-empty
    full_context = (
        f"Considering the following context:\n{str(context)}\n"
        + (
            f"Terms and Definitions:\n{question_iter['terms']}\n"
            if question_iter["terms"]
            else ""
        )
        + (
            f"Abbreviations: {question_iter['abbreviations']}\n"
            if question_iter["abbreviations"]
            else ""
        )
    )

    full_question = (
        f"Please provide the answer to the the following multiple choice question:\n{merged_question}\n"
        + "Write only the option number corresponding to the correct answer."
    )

    full_answer = f"The correct option number is {str(question_iter['answer'])}\nExplanation: {str(question_iter['explanation'])}"

    # Render the three turns to plain text (tokenize=False) using the
    # tokenizer's chat template.
    dialogue = tokenizer.apply_chat_template(
        [
            {
                "role": "context",
                "content": full_context,
            },
            {
                "role": "user",
                "content": full_question,
            },
            {
                "role": "assistant",
                "content": full_answer,
            },
        ],
        tokenize=False,
    )

    dataset.append(
        {
            "text": dialogue,
        }
    )




#### An example of the training data

In [None]:
# Show the first formatted training example as a quick sanity check
first_example = dataset[0]
print(first_example["text"])

### Saving the dataset

In [None]:
# Write the formatted examples to CSV: one "text" column, no index column.
output_csv_path = "./Data/intermediates/TeleQnA-prompt-with-context-phinetune.csv"
# to_csv does not create missing parent directories, so create it up front
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
df = pd.DataFrame(dataset)
df.to_csv(output_csv_path, index=False)