In [1]:
import ast

import pandas as pd

# Train Data

In [22]:
# Location of the raw training CSV (a file path, despite the `_dir` suffix).
train_dir = "data/b6_train_data.csv"

In [23]:
# Load the raw training split.
train_df = pd.read_csv(train_dir)
# `choices` is stored in the CSV as the text of a Python list literal. Parse it
# with ast.literal_eval, which only accepts literals, rather than eval(), which
# would execute any expression embedded in the data file.
train_df["choices"] = train_df["choices"].apply(ast.literal_eval)
# Drop rows that have no answer label.
train_df = train_df.dropna(subset=["answer"])
# Keep only the first occurrence of each question text.
train_df = train_df.drop_duplicates(subset=["question"])
train_df

Unnamed: 0,task_id,question,choices,answer
0,k10168,Question: What will be output of the following...,"[8 4 2, 8 4 2, 8 4 4, 8 4 3]",C
1,k10173,Question: What will be output of the following...,"[-4, -5, 10, 11]",A
2,k10174,Question: Match the following.\n Group 1 ...,"[P-4. Q-1, R-2, S-3, P-3, Q-1, R-4, S-2, P-3, ...",B
3,k10175,Question: Match the following.\nP. Regular exp...,"[P-4. Q-1, R-2, S-3, P-3, Q-1, R-4, S-2, P-3, ...",B
4,k10176,Question: Which grammar rules violate the requ...,"[1 only, 1 and 3 only, 2 and 3 only, 3 and 4 o...",B
...,...,...,...,...
3958,k00695,Question: Which technique is used in React Memo?,"[Hashing, Memoization, Sorting, Pipelining]",B
3959,k00696,Question: What is the correct definition of pr...,[Concept of passing data as props through inte...,A
3960,k00697,Question: What is the correct syntax for arro...,"[(), =>, ()=>, >=()]",C
3961,k00698,Question: Which of the following is the correc...,"[It is a temporary storage of elements, It is ...",B


Check that every `choices` value is a non-empty list

In [24]:
def check_answer_format(choices):
    """Return True when ``choices`` is a list containing at least one element."""
    # Anything that is not a list fails immediately.
    if not isinstance(choices, list):
        return False
    return len(choices) > 0


# Run the format check against the `choices` value of every row.
format_check = train_df["choices"].apply(check_answer_format)
# Inverted mask: True marks the rows that failed the check.
failed_mask = ~format_check
print(f"All answers have correct format: {format_check.all()}")
print(f"Number of incorrect formats: {failed_mask.sum()}")
# Display the offending rows (an empty frame when every row passes).
incorrect_formats = train_df.loc[failed_mask]
incorrect_formats


All answers have correct format: True
Number of incorrect formats: 0


Unnamed: 0,task_id,question,choices,answer


In [25]:
# Normalise the answer label: remove the literal "ANSWER:" prefix and any
# surrounding whitespace. `assign` builds a new frame instead of writing a
# column into `train_df`, which may itself be a filtered copy (this avoids
# pandas' SettingWithCopyWarning), and the vectorised `.str` methods replace
# the per-row lambda.
train_df = train_df.assign(
    answer=train_df["answer"].str.replace("ANSWER:", "", regex=False).str.strip()
)
# Per the original note, only one row carries the answer "G"; drop it as well.
train_df = train_df[train_df["answer"] != "G"]
train_df

Unnamed: 0,task_id,question,choices,answer
0,k10168,Question: What will be output of the following...,"[8 4 2, 8 4 2, 8 4 4, 8 4 3]",C
1,k10173,Question: What will be output of the following...,"[-4, -5, 10, 11]",A
2,k10174,Question: Match the following.\n Group 1 ...,"[P-4. Q-1, R-2, S-3, P-3, Q-1, R-4, S-2, P-3, ...",B
3,k10175,Question: Match the following.\nP. Regular exp...,"[P-4. Q-1, R-2, S-3, P-3, Q-1, R-4, S-2, P-3, ...",B
4,k10176,Question: Which grammar rules violate the requ...,"[1 only, 1 and 3 only, 2 and 3 only, 3 and 4 o...",B
...,...,...,...,...
3958,k00695,Question: Which technique is used in React Memo?,"[Hashing, Memoization, Sorting, Pipelining]",B
3959,k00696,Question: What is the correct definition of pr...,[Concept of passing data as props through inte...,A
3960,k00697,Question: What is the correct syntax for arro...,"[(), =>, ()=>, >=()]",C
3961,k00698,Question: Which of the following is the correc...,"[It is a temporary storage of elements, It is ...",B


In [26]:
# Optionally down-sample the cleaned training data, then render the choices and
# the answer in the textual form used by the fine-tuning prompt.
i2choices = {0: "A", 1: "B", 2: "C", 3: "D", 4: "E", 5: "F", 6: "G"}
sampling = False  # True -> train on a random subset instead of every row
SAMPLE_SIZE = 3813  # number of rows to draw when `sampling` is enabled
SAMPLE_SEED = 42  # fixed seed so the drawn subset is reproducible

if sampling:
    # Cap at the frame length: DataFrame.sample raises when n exceeds the
    # number of rows (it samples without replacement by default).
    sample_df = train_df.sample(
        n=min(SAMPLE_SIZE, len(train_df)), random_state=SAMPLE_SEED
    )
else:
    sample_df = train_df.copy()

# Prefix every choice with its option letter, e.g. "A: <choice text>".
sample_df["choices"] = sample_df["choices"].apply(
    lambda options: [f"{i2choices[i]}: {option}" for i, option in enumerate(options)]
)
# Wrap the answer label in a fenced JSON snippet of the form {"answer": "<label>"}.
sample_df["answer"] = sample_df["answer"].apply(
    lambda x: f'```json\n{{"answer": "{x}"}}\n```'
)
sample_df


Unnamed: 0,task_id,question,choices,answer
0,k10168,Question: What will be output of the following...,"[A: 8 4 2, B: 8 4 2, C: 8 4 4, D: 8 4 3]","```json\n{""answer"": ""C""}\n```"
1,k10173,Question: What will be output of the following...,"[A: -4, B: -5, C: 10, D: 11]","```json\n{""answer"": ""A""}\n```"
2,k10174,Question: Match the following.\n Group 1 ...,"[A: P-4. Q-1, R-2, S-3, B: P-3, Q-1, R-4, S-2,...","```json\n{""answer"": ""B""}\n```"
3,k10175,Question: Match the following.\nP. Regular exp...,"[A: P-4. Q-1, R-2, S-3, B: P-3, Q-1, R-4, S-2,...","```json\n{""answer"": ""B""}\n```"
4,k10176,Question: Which grammar rules violate the requ...,"[A: 1 only, B: 1 and 3 only, C: 2 and 3 only, ...","```json\n{""answer"": ""B""}\n```"
...,...,...,...,...
3958,k00695,Question: Which technique is used in React Memo?,"[A: Hashing, B: Memoization, C: Sorting, D: Pi...","```json\n{""answer"": ""B""}\n```"
3959,k00696,Question: What is the correct definition of pr...,[A: Concept of passing data as props through i...,"```json\n{""answer"": ""A""}\n```"
3960,k00697,Question: What is the correct syntax for arro...,"[A: (), B: =>, C: ()=>, D: >=()]","```json\n{""answer"": ""C""}\n```"
3961,k00698,Question: Which of the following is the correc...,"[A: It is a temporary storage of elements, B: ...","```json\n{""answer"": ""B""}\n```"


In [39]:
# Finetuning template
# System prompt text; it fills the {instruction} placeholder of TEMPLATE.
instruction = """You are a coding assistant that helps to answer multiple choice questions about software development. 
Extract the final answer as a single alphabet option (A,B,C,D,E,F) in json format"""

# Text of one training sample, filled with str.format using four fields:
# instruction, question, choices and answer. The <|im_start|>/<|im_end|>
# markers delimit the system, user and assistant turns.
# NOTE(review): this looks like ChatML-style markup -- confirm it matches the
# chat template of the model being fine-tuned.
TEMPLATE = """<|im_start|>system
{instruction}<|im_end|>
<|im_start|>user
{question}
Choices:{choices}<|im_end|>
<|im_start|>assistant
{answer}<|im_end|>
"""


In [40]:
# Construct the training samples: one rendered prompt string per row, built by
# filling TEMPLATE with that row's question, choices and answer.
text_list = [
    TEMPLATE.format(
        instruction=instruction,
        question=question,
        choices=choices,
        answer=answer,
    )
    for question, choices, answer in zip(
        sample_df["question"], sample_df["choices"], sample_df["answer"]
    )
]
sample_df["text"] = text_list

In [41]:
# Print the first rendered sample to eyeball the filled-in template.
print(sample_df["text"].iloc[0])

<|im_start|>system
You are a coding assistant that helps to answer multiple choice questions about software development. 
Extract the final answer as a single alphabet option (A,B,C,D,E,F) in json format<|im_end|>
<|im_start|>user
Question: What will be output of the following code?
#include<stdio.h>
int main()
{
    printf("%d\t",sizeof(6.5));
    printf("%d\t",sizeof(90000));
    printf("%d",sizeof('A'));
    return 0;
}
Choices:['A: 8 4 2', 'B: 8 4 2', 'C: 8 4 4', 'D: 8 4 3']<|im_end|>
<|im_start|>assistant
```json
{"answer": "C"}
```<|im_end|>



In [43]:
# Write the fine-tuning frame to CSV. index=False keeps the pandas index out of
# the file; every column of `sample_df` is written as-is.
sample_df.to_csv("data/finetuning_data.csv", index=False)
