In [1]:
import pandas as pd 
import os
from pathlib import Path

In [2]:
# Workbook holding the three Q&A sheets, and the folder it lives in
# (a "data/data_files" directory next to the notebook's parent folder).
file_name = "Questionair_for_ChatGPT6_2023_09_03.xlsx"
data_dir = Path.cwd().parent.joinpath("data/data_files")

In [3]:
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet name.
dfs = pd.read_excel(data_dir / file_name, sheet_name=None)

# Pull out the three sheets processed in the sections below.
df1, df2, df3 = (
    dfs[sheet_name]
    for sheet_name in ('1QnA_General', '2Treatment_Modalities', '3Site_Specific_AY')
)

## 1QnA_General

In [5]:
# Column headers of the question / answer columns, exactly as they appear in the sheet.
QnA_Keys = ['NIH-NCI Questions from "Radiation Therapy & You" and ChatGPT prompt',
            'NIH-NCI Answers from " Radiation Therapy & You"',
            ]

# Build the cleaned frame from the untouched sheet held in `dfs` rather than from
# `df1` itself: this keeps the cell re-runnable (the original column names are
# always present) and avoids an in-place dropna on a frame derived from a slice.
df1 = (
    dfs['1QnA_General'][QnA_Keys]
    # Skip the first three rows of the sheet.
    # NOTE(review): presumably non-Q&A preamble rows - confirm against the workbook.
    .iloc[3:]
    .set_axis(['Question', 'Answer'], axis=1)
    # Keep only rows that have both a question and an answer.
    .dropna()
)
print(f"Data Size : {df1.shape[0]}")
df1.head()

Data Size : 29


Unnamed: 0,Question,Answer
3,What is radiation therapy?,Radiation therapy (also called radiotherapy) i...
4,How is radiation therapy given?,Radiation therapy can be external beam or inte...
5,Who gets radiation therapy?,Many people with cancer need treatment with ra...
6,What does radiation therapy do to cancer cells?,"Given in high doses, radiation kills or slows ..."
7,How long does radiation therapy take to work?,Radiation therapy does not kill cancer cells r...


In [6]:
# The general sheet has no topic column of its own, so every row is tagged
# with the sheet name (a scalar is broadcast to all rows).
df1['topic_name'] = '1QnA_General'
df1.head()

Unnamed: 0,Question,Answer,topic_name
3,What is radiation therapy?,Radiation therapy (also called radiotherapy) i...,1QnA_General
4,How is radiation therapy given?,Radiation therapy can be external beam or inte...,1QnA_General
5,Who gets radiation therapy?,Many people with cancer need treatment with ra...,1QnA_General
6,What does radiation therapy do to cancer cells?,"Given in high doses, radiation kills or slows ...",1QnA_General
7,How long does radiation therapy take to work?,Radiation therapy does not kill cancer cells r...,1QnA_General


## 2Treatment_Modalities

In [7]:
# Column headers of the topic / question / answer columns, exactly as they appear in the sheet.
QnA_Keys = [
   'Txt Modalities', 'RSNA/ACR Questions from "RadiologyInfo.org" and ChatGPT prompt',
    'RSNA/ACR Answers from " "RadiologyInfo.org"'
]

# Build the cleaned frame from the untouched sheet held in `dfs` rather than from
# `df2` itself: this keeps the cell re-runnable (the original column names are
# always present) and avoids an in-place dropna on a frame derived from a slice.
df2 = (
    dfs['2Treatment_Modalities'][QnA_Keys]
    # Skip the first three rows of the sheet.
    # NOTE(review): presumably non-Q&A preamble rows - confirm against the workbook.
    .iloc[3:]
    .set_axis(['topic_name', 'Question', 'Answer'], axis=1)
    # Keep only rows where topic, question and answer are all present.
    .dropna()
)
print("Data Size:", df2.shape[0])
df2.head()


Data Size: 45


Unnamed: 0,topic_name,Question,Answer
3,Ext Beam,What is external beam therapy and how is it used?,"External beam therapy (EBT), also called exter..."
4,Ext Beam,Who operates external beam radiotherapy equipm...,The equipment is operated by a radiation thera...
5,Ext Beam,Is there any special preparation needed for ex...,The process of external beam therapy involves ...
6,Ext Beam,How is external beam therapy procedure performed?,"Before each treatment session, the patient may..."
7,Ext Beam,What will I feel during external beam therapy ...,External beam therapy is painless but patients...


## 3Site_Specific_AY

In [8]:
# Column headers of the topic / question / answer columns, exactly as they appear in the sheet.
QnA_Keys = [
    'Txt Sites',
    'RSNA/ACR Questions from "RadiologyInfo.org"',
    'RSNA/ACR Answers from " "RadiologyInfo.org"'
]

# Build the cleaned frame from the untouched sheet held in `dfs` rather than from
# `df3` itself: this keeps the cell re-runnable (the original column names are
# always present) and avoids an in-place dropna on a frame derived from a slice.
df3 = (
    dfs['3Site_Specific_AY'][QnA_Keys]
    # Skip the first three rows of the sheet.
    # NOTE(review): presumably non-Q&A preamble rows - confirm against the workbook.
    .iloc[3:]
    .set_axis(["topic_name", "Question", "Answer"], axis=1)
    # Keep only rows where topic, question and answer are all present.
    .dropna()
)
print("Data Size:", df3.shape[0])
df3.head()

Data Size: 41


Unnamed: 0,topic_name,Question,Answer
3,Colorectal,How effective is modern radiation treatment of...,Surgery is the most effective way to treat col...
4,Colorectal,What happens during radiation therapy of colo...,Radiation therapy uses high energy x-rays (pho...
5,Colorectal,"What are the possible side effects, risks, and...",Side effects that develop during treatment var...
6,Colorectal,What kind of treatment follow-up should I expe...,"Colorectal cancer can recur, or reappear, in a..."
7,Esophageal,What happens during radiation therapy of esoph...,"Prior to beginning radiation therapy, some pat..."


In [9]:
# Stack the three cleaned sheets; columns are aligned by name and each
# frame keeps its own row labels.
prelim_data_df = pd.concat((df1, df2, df3))
print("Total Data Size:", len(prelim_data_df))
prelim_data_df.head()

Total Data Size: 115


Unnamed: 0,Question,Answer,topic_name
3,What is radiation therapy?,Radiation therapy (also called radiotherapy) i...,1QnA_General
4,How is radiation therapy given?,Radiation therapy can be external beam or inte...,1QnA_General
5,Who gets radiation therapy?,Many people with cancer need treatment with ra...,1QnA_General
6,What does radiation therapy do to cancer cells?,"Given in high doses, radiation kills or slows ...",1QnA_General
7,How long does radiation therapy take to work?,Radiation therapy does not kill cancer cells r...,1QnA_General


In [26]:
# Persist the stacked Q&A frame; index=False leaves the row labels out of the CSV.
prelim_data_df.to_csv(data_dir / 'prelim_data_cleaned.csv', index=False)

## Cancer_org Database

In [11]:
# Inputs for this section: the CSV written by the previous section and the
# workbook that contains the "cancer.org" sheet.
cancer_data_path = data_dir.joinpath('webscrapping-data-cleaning.xlsx')
prelim_data_path = data_dir.joinpath('prelim_data_cleaned.csv')

In [12]:
# Load the cancer.org questions from their dedicated sheet.
cancer_data = pd.read_excel(cancer_data_path, sheet_name="cancer.org")

# Work on a copy of the in-memory frame instead of re-reading prelim_data_path from disk.
prelim_data = prelim_data_df.copy(deep=True)

prelim_data.shape

(115, 3)

In [13]:
# Preview the first rows of the preliminary Q&A frame.
prelim_data.head()

Unnamed: 0,Question,Answer,topic_name
3,What is radiation therapy?,Radiation therapy (also called radiotherapy) i...,1QnA_General
4,How is radiation therapy given?,Radiation therapy can be external beam or inte...,1QnA_General
5,Who gets radiation therapy?,Many people with cancer need treatment with ra...,1QnA_General
6,What does radiation therapy do to cancer cells?,"Given in high doses, radiation kills or slows ...",1QnA_General
7,How long does radiation therapy take to work?,Radiation therapy does not kill cancer cells r...,1QnA_General


In [14]:
# Preview the first rows of the cancer.org sheet as loaded.
cancer_data.head()

Unnamed: 0,topic_name,Question,Cleaned_Answer
0,general,What is a risk factor?,A risk factor is something that raises the cha...
1,general,Can injuries cause cancer?,"Falls, bruises, broken bones, or other such in..."
2,general,Can I bring cancer on myself?,Your personality and emotions cannot cause can...
3,general,Can stress cause cancer?,Researchers have done many studies to see if t...
4,general,Does sugar feed cancer?,Sugar intake has not been shown to increase th...


In [15]:
# Bring the cancer.org frame onto the shared (topic_name, Question, Answer) schema.
# The rename runs first and is non-mutating: labels that are absent are ignored by
# default, so re-running this cell after 'Cleaned_Answer' has already become
# 'Answer' still works, and no in-place change is made to a column subset.
cancer_data = cancer_data.rename(columns={"Cleaned_Answer": "Answer"})[
    ['topic_name', 'Question', 'Answer']
]

# Same column order for the preliminary data, then stack the two sources.
prelim_data = prelim_data[['topic_name', 'Question', 'Answer']]
combined_data = pd.concat([prelim_data, cancer_data])
print(f"Combined Data Size: {combined_data.shape[0]}")

Combined Data Size: 153


In [16]:
# Preview the first rows of the combined (preliminary + cancer.org) frame.
combined_data.head()

Unnamed: 0,topic_name,Question,Answer
3,1QnA_General,What is radiation therapy?,Radiation therapy (also called radiotherapy) i...
4,1QnA_General,How is radiation therapy given?,Radiation therapy can be external beam or inte...
5,1QnA_General,Who gets radiation therapy?,Many people with cancer need treatment with ra...
6,1QnA_General,What does radiation therapy do to cancer cells?,"Given in high doses, radiation kills or slows ..."
7,1QnA_General,How long does radiation therapy take to work?,Radiation therapy does not kill cancer cells r...


In [15]:
# Persist the combined Q&A frame; index=False leaves the row labels out of the CSV.
combined_data.to_csv(data_dir / 'prelim_cancer_data.csv', index=False)

# RadiologyInfo Cleaned Dataset Merge

In [17]:
# File names used by the RadiologyInfo merge.
radiology_data_file_name = 'radiology_info_cleaned_data.csv'
base_file_name = 'prelim_cancer_data.csv'

# Same data folder expression as the one assigned at the top of the notebook.
data_dir = Path.cwd().parent.joinpath("data/data_files")

In [18]:
# Load the reviewed RadiologyInfo questions (one row per candidate question pair).
radiology_df = pd.read_csv(data_dir.joinpath(radiology_data_file_name))
radiology_df.head()

Unnamed: 0,topic_name,Question (A),Generated Question (B),Selected Question (A or B),Answer,Comments/ Modified answers
0,anal-cancer-therapy,Anal cancer overview,What is anal cancer?,B,Anal cancer is a cancer that begins in the anu...,
1,anal-cancer-therapy,What are my treatment options?,What are the treatment options for anal cancer?,B,There are three types of standard treatment fo...,
2,anal-cancer-therapy,What happens during radiation therapy?,What happens during anal cancer therapy?,What happens during radiation therapy for anal...,Radiation therapy uses high energy x-rays (pho...,
3,anal-cancer-therapy,What are possible side effects of radiation th...,What are possible side effects of anal cancer ...,B,Side effects of radiation treatment include pr...,
4,anal-cancer-therapy,What kind of treatment follow-up should I expect?,What treatments for anal cancer typically requ...,What kind of treatment follow-up should I expe...,"After your treatment has ended, your physician...",


In [19]:
def process_data(df):
    """Resolve the final question text for every row and return a tidy Q&A frame.

    The 'Selected Question (A or B)' column holds either the marker 'A'
    (use the text in 'Question (A)'), the marker 'B' (use the text in
    'Generated Question (B)'), or any other value, which is taken verbatim
    as the question. Markers are recognised after trimming surrounding
    whitespace and ignoring case, so a cell such as ' a' or 'b ' selects the
    corresponding column instead of being kept as the question text.

    A one-line summary of how many rows fell into each case is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'topic_name', 'Question (A)',
        'Generated Question (B)', 'Selected Question (A or B)' and 'Answer'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Question', 'Answer', 'topic_name'; one row per input row,
        in input order, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    selections = df['Selected Question (A or B)'].tolist()

    # Normalised copy used only to recognise the markers; non-string cells
    # (e.g. missing values) are left untouched and never match 'A' or 'B'.
    markers = [
        value.strip().upper() if isinstance(value, str) else value
        for value in selections
    ]

    # Column-wise iteration instead of DataFrame.iterrows(): it does not depend
    # on the frame's index and avoids building a Series per row.
    question_list = [
        from_a if marker == 'A' else from_b if marker == 'B' else verbatim
        for marker, from_a, from_b, verbatim in zip(
            markers,
            df['Question (A)'],
            df['Generated Question (B)'],
            selections,
        )
    ]

    a_count = sum(marker == 'A' for marker in markers)
    b_count = sum(marker == 'B' for marker in markers)
    rest_count = len(markers) - a_count - b_count

    print("A Count : {} -- B Count : {} -- Rest Count : {}".format(a_count, b_count, rest_count))
    final_df = pd.DataFrame({
        "Question" : question_list,
        "Answer" : df['Answer'].tolist(),
        "topic_name" : df['topic_name'].tolist()
    })
    return final_df

In [20]:
# Collapse the A / B / custom question columns into a single 'Question' column.
radiology_df_final = process_data(radiology_df)
print("RadiologyInfo Data Size : {}".format(len(radiology_df_final)))

A Count : 45 -- B Count : 68 -- Rest Count : 88
RadiologyInfo Data Size : 201


In [21]:
# Preview the first rows of the processed RadiologyInfo frame.
radiology_df_final.head()

Unnamed: 0,Question,Answer,topic_name
0,What is anal cancer?,Anal cancer is a cancer that begins in the anu...,anal-cancer-therapy
1,What are the treatment options for anal cancer?,There are three types of standard treatment fo...,anal-cancer-therapy
2,What happens during radiation therapy for anal...,Radiation therapy uses high energy x-rays (pho...,anal-cancer-therapy
3,What are possible side effects of anal cancer ...,Side effects of radiation treatment include pr...,anal-cancer-therapy
4,What kind of treatment follow-up should I expe...,"After your treatment has ended, your physician...",anal-cancer-therapy


In [41]:
# Read back the combined CSV written earlier and report its row count.
# NOTE(review): no later cell visible here reads base_df (the merge below uses
# the in-memory combined_data) - confirm whether this load is still needed.
base_df = pd.read_csv(data_dir.joinpath(base_file_name))
print("Base Data Size : {}".format(len(base_df)))

Base Data Size : 137


In [22]:
# Append the RadiologyInfo questions to the combined frame; concat aligns the
# columns by name, so the differing column order of the two frames is fine.
merged_df = pd.concat((combined_data, radiology_df_final))
print("Merged Data Size : {}".format(len(merged_df)))

Merged Data Size : 354


In [23]:
# Remove Duplicated Questions
print("Unique Questions : {}".format(merged_df.Question.unique().shape[0]))
merged_df_unique = merged_df.drop_duplicates(subset=['Question'], keep='first')

print("Final Data Size : {}".format(merged_df_unique.shape[0]))

Unique Questions : 329
Final Data Size : 329


In [24]:
#merged_df_unique.to_csv(data_dir / 'capstone_final_data_v1_with_topic_names.csv', index=False)

In [33]:
# merged_df = pd.concat([combined_data, radiology_df_final])
# print("Merged Data Size : {}".format(merged_df.shape[0]))

Merged Data Size : 354


In [25]:
# Preview the first rows of the de-duplicated frame.
merged_df_unique.head()

Unnamed: 0,topic_name,Question,Answer
3,1QnA_General,What is radiation therapy?,Radiation therapy (also called radiotherapy) i...
4,1QnA_General,How is radiation therapy given?,Radiation therapy can be external beam or inte...
5,1QnA_General,Who gets radiation therapy?,Many people with cancer need treatment with ra...
6,1QnA_General,What does radiation therapy do to cancer cells?,"Given in high doses, radiation kills or slows ..."
7,1QnA_General,How long does radiation therapy take to work?,Radiation therapy does not kill cancer cells r...


In [29]:
# Lookup table: question text -> topic name.
ques2topic = dict(zip(merged_df_unique['Question'], merged_df_unique['topic_name']))

In [31]:
#ques2topic

In [32]:
# Load the evaluation results for the default questions.
eval_df = pd.read_csv(data_dir.joinpath('default_ques_eval.csv'))
eval_df.head()

Unnamed: 0,id,session_id,user_question,parsed_question,response,retrieval_similarity,retrieval_relevancy,is_verified,correctness_score,correctness_reasoning,relevance_score,relevance_reasoning,harmfulness_score,harmfulness_reasoning,conciseness_score,conciseness_reasoning,coherence_score,coherence_reasoning,Ground Truth Answer
0,443,8,What is radiation therapy?,What is radiation therapy?,"Radiation therapy, also called radiotherapy, i...",1.0,1.0,False,0.9,**Evaluation Explanation**\n\nThe response pro...,0.9,**Evaluation Explanation:**\n\nIn this evaluat...,1.0,**Evaluation Explanation**\n\nI have evaluated...,0.6,**Evaluation Explanation:**\nThe response prov...,0.9,**Evaluation Explanation:**\n\nI evaluated the...,Radiation therapy (also called radiotherapy) i...
1,444,8,How is radiation therapy given?,How is radiation therapy given?,"Radiation therapy, also called radiotherapy, i...",1.0,1.0,False,0.9,**Evaluation Explanation:**\n\nThe assistant's...,0.9,**Evaluation Explanation:**\n\nI evaluated the...,0.8,**Evaluation Explanation:**\n\nUpon reviewing ...,0.8,**Evaluation Explanation:**\n\nThe assistant's...,0.8,**Evaluation Explanation:**\nThe response prov...,Radiation therapy can be external beam or inte...
2,445,8,Who gets radiation therapy?,Who gets radiation therapy?,Many people with cancer need treatment with ra...,1.0,1.0,False,0.9,**Evaluation Explanation:**\n\nThe assistant's...,0.9,**Evaluation Explanation:**\n\nIn this evaluat...,1.0,**Evaluation Explanation:**\n\nThe response pr...,0.7,**Evaluation Explanation**\n\nThe assistant's ...,0.8,**Evaluation Explanation**\n\nI have evaluated...,Many people with cancer need treatment with ra...
3,446,8,What does radiation therapy do to cancer cells?,What does radiation therapy do to cancer cells?,Radiation therapy is a treatment that kills or...,1.0,1.0,False,0.9,**Evaluation Explanation**\n\nUpon reviewing t...,0.9,**Evaluation Explanation**\n\nI will evaluate ...,0.9,**Evaluation Explanation**\n\nThe response pro...,0.7,**Evaluation Explanation:**\n\nThe response pr...,0.9,**Evaluation Explanation:**\n\nUpon reviewing ...,"Given in high doses, radiation kills or slows ..."
4,447,8,How long does radiation therapy take to work?,How long does radiation therapy take to work?,Radiation therapy is a treatment that uses hig...,1.0,0.9,False,0.8,**Evaluation Explanation:**\nThe assistant's r...,0.9,**Evaluation Explanation:**\nI have evaluated ...,1.0,**Evaluation Explanation**\n\nThe assistant's ...,0.6,**Evaluation Explanation**\n\nThe assistant's ...,0.8,**Evaluation Explanation:**\nThe response prov...,Radiation therapy does not kill cancer cells r...


In [33]:
# Attach the topic of each evaluated question via the ques2topic lookup.
# All questions are checked up front so that a mismatch is reported once,
# listing every unmatched question, instead of failing on the first one with
# a bare KeyError. The exception type is unchanged (KeyError).
eval_questions = eval_df['user_question']
unmatched_questions = (
    eval_questions[~eval_questions.isin(list(ques2topic))].unique().tolist()
)
if unmatched_questions:
    raise KeyError(
        "{} evaluation question(s) have no entry in ques2topic: {}".format(
            len(unmatched_questions), unmatched_questions
        )
    )
eval_df['topic_name'] = eval_questions.map(ques2topic)

In [36]:
# Persist the evaluation frame with its new topic_name column; index=False
# leaves the row labels out of the CSV.
eval_df.to_csv(data_dir / 'default_ques_eval_with_topic.csv', index=False)

In [44]:
#merged_df_unique.to_csv(data_dir / 'capstone_final_data_v1.csv', index=False)