In [128]:
import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm
from cataract_doc_study.dependency_setup import user_client, survey_client

In [129]:
# Load the study inputs from JSON files on the local machine (absolute, machine-specific paths).
# docs_df supplies the doctor ids (its "id" column) that drive the question-set fetch loop.
docs_df = pd.read_json("/home/rash598/doctor_sys_3/cataract-doc-study/docs.json")
# questions_df is filtered by its "id" column to read each question's "question" and "answer" values.
questions_df = pd.read_json("/home/rash598/doctor_sys_3/cataract-doc-study/questions.json")

In [130]:
def get_doc_question_set(doc_id, timeout=30):
    """Fetch the stored record for one doctor from the study backend.

    Issues ``GET /get_user`` with ``doc_id`` passed as the ``user_id`` query
    parameter.

    Args:
        doc_id: Identifier of the doctor, sent as ``user_id``.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; ``None`` restores the old
            wait-forever behaviour.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` elapses.
    """
    url = "https://cataract-doctor-study-dzb2hfc5h4aqbafk.eastus-01.azurewebsites.net/get_user"
    params = {"user_id": doc_id}

    # Without an explicit timeout, requests blocks indefinitely on a stalled
    # connection, which would freeze the whole notebook run.
    response = requests.get(url, params=params, timeout=timeout)

    # Any non-200 status is reported to the caller as "no data".
    if response.status_code != 200:
        return None
    return response.json()

def get_question_data(question_id, conditon_id, timeout=30):
    """Fetch the stored answer record for one question under one condition.

    Issues ``GET /get_answer`` with ``question_id`` and ``condition_id`` as
    query parameters.

    Args:
        question_id: Identifier of the question.
        conditon_id: Identifier of the study condition, sent as
            ``condition_id``. The parameter name keeps its original spelling
            so existing keyword callers continue to work.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; ``None`` restores the old
            wait-forever behaviour.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` elapses.
    """
    url = "https://cataract-doctor-study-dzb2hfc5h4aqbafk.eastus-01.azurewebsites.net/get_answer"
    params = {"question_id": question_id, "condition_id": conditon_id}

    # Without an explicit timeout, requests blocks indefinitely on a stalled
    # connection, which would freeze the whole notebook run.
    response = requests.get(url, params=params, timeout=timeout)

    # Any non-200 status is reported to the caller as "no data".
    if response.status_code != 200:
        return None
    return response.json()

In [131]:
def levenshtein_distance_operations(sent1, sent2):
    """Word-level Levenshtein distance from ``sent1`` to ``sent2`` with an operation tally.

    Both sentences are stripped and split on whitespace; the edit script is
    then computed over the resulting word lists.

    Args:
        sent1: Source sentence.
        sent2: Target sentence.

    Returns:
        A tuple ``(distance, insertions, deletions, substitutions)``. The three
        counts describe one minimum-cost edit script; when several scripts tie,
        insertion is preferred over deletion, and deletion over substitution.
    """
    source_words = sent1.strip().split()
    target_words = sent2.strip().split()
    n_source, n_target = len(source_words), len(target_words)

    # cost[r][c]: distance between the first r source words and first c target words.
    # tally[r][c]: (insertions, deletions, substitutions) used to reach that cost.
    cost = [[0] * (n_target + 1) for _ in range(n_source + 1)]
    tally = [[(0, 0, 0)] * (n_target + 1) for _ in range(n_source + 1)]

    # Reaching an empty target means deleting every source word seen so far.
    for r in range(n_source + 1):
        cost[r][0] = r
        tally[r][0] = (0, r, 0)

    # Starting from an empty source means inserting every target word seen so far.
    for c in range(n_target + 1):
        cost[0][c] = c
        tally[0][c] = (c, 0, 0)

    for r, source_word in enumerate(source_words, start=1):
        for c, target_word in enumerate(target_words, start=1):
            if source_word == target_word:
                # Matching words cost nothing; inherit the diagonal cell.
                cost[r][c] = cost[r - 1][c - 1]
                tally[r][c] = tally[r - 1][c - 1]
                continue

            via_insert = cost[r][c - 1] + 1
            via_delete = cost[r - 1][c] + 1
            via_replace = cost[r - 1][c - 1] + 1
            cheapest = min(via_insert, via_delete, via_replace)
            cost[r][c] = cheapest

            # Order of the checks fixes the tie-break: insert, delete, replace.
            if via_insert == cheapest:
                n_ins, n_del, n_sub = tally[r][c - 1]
                tally[r][c] = (n_ins + 1, n_del, n_sub)
            elif via_delete == cheapest:
                n_ins, n_del, n_sub = tally[r - 1][c]
                tally[r][c] = (n_ins, n_del + 1, n_sub)
            else:
                n_ins, n_del, n_sub = tally[r - 1][c - 1]
                tally[r][c] = (n_ins, n_del, n_sub + 1)

    insertions, deletions, substitutions = tally[n_source][n_target]
    return cost[n_source][n_target], insertions, deletions, substitutions


In [132]:
def extract_info_from_activity_tracker(activity_tracker):
    """Summarise the ``update_answer`` events in an activity tracker.

    Only entries whose ``action_type`` is ``"update_answer"`` are considered;
    all other entries are ignored.

    Args:
        activity_tracker: Iterable of dict-like activity records. Each
            ``update_answer`` record must provide ``llm_start_timestamp``,
            ``llm_end_timestamp`` and ``update_info``.

    Returns:
        A tuple ``(num_comments, comments_length, cumulative_llm_time)``:
        the number of ``update_answer`` events, the word count of each
        event's ``update_info`` text (in order), and the sum of
        ``llm_end_timestamp - llm_start_timestamp`` over those events.
    """
    cumulative_llm_time = 0
    comments_length = []
    for activity in activity_tracker:
        if activity["action_type"] != "update_answer":
            continue
        cumulative_llm_time += activity["llm_end_timestamp"] - activity["llm_start_timestamp"]
        # split() with no argument collapses runs of whitespace and yields []
        # for an empty string; split(" ") would count empty tokens as words.
        comments_length.append(len(activity["update_info"].split()))
    return len(comments_length), comments_length, cumulative_llm_time

def process_question_list(question_list, doctor_id, response_dump_df):
    """Append one summary row per fetched question to ``response_dump_df``.

    For every entry in ``question_list`` the stored answer record is fetched
    with ``get_question_data`` and compared, at word level, against the
    original answer held in the module-level ``questions_df``.

    Args:
        question_list: Iterable of dict-like entries providing
            ``question_id`` and ``condition_id``.
        doctor_id: Identifier written into the ``doctor_id`` column of every
            produced row.
        response_dump_df: DataFrame the new rows are appended to. It is not
            modified in place.

    Returns:
        A new DataFrame with the produced rows appended, or
        ``response_dump_df`` itself when no row was produced.

    Raises:
        IndexError: If a ``question_id`` has no matching row in
            ``questions_df``.
    """
    rows = []
    for question in question_list:
        question_id = question["question_id"]
        condition_id = question["condition_id"]

        # Look the question up once; an unknown id is an error, not a skip.
        matches = questions_df.loc[questions_df["id"] == question_id]
        if matches.empty:
            raise IndexError(f"question_id {question_id!r} not found in questions_df")

        # Condition 0 is compared against an empty original answer; every
        # other condition uses the answer stored in questions_df.
        original_answer = "" if condition_id == 0 else matches["answer"].values[0]

        answer_payload = get_question_data(question_id, condition_id)
        if answer_payload is None:
            # No record could be fetched for this question; leave it out.
            continue

        metadata = answer_payload["question_metadata"]
        final_answer = metadata["final_answer"]
        edit_distance, insertions, deletions, substitutions = levenshtein_distance_operations(
            original_answer, final_answer
        )

        # Only condition 2 has its activity tracker parsed for comment stats.
        if condition_id == 2:
            num_comments, comments_length, cumulative_llm_time = extract_info_from_activity_tracker(
                answer_payload["activity_tracker"]
            )
        else:
            num_comments, comments_length, cumulative_llm_time = 0, [], 0

        rows.append({
            "doctor_id": doctor_id,
            "question_id": question_id,
            "condition_id": condition_id,
            "original_answer": original_answer,
            "final_answer": final_answer,
            "duration": metadata["duration"],
            "edit_distance": edit_distance,
            "insert": insertions,
            "delete": deletions,
            "substitute": substitutions,
            "num_comments": num_comments,
            "comments_length": comments_length,
            "cumulative_llm_time": cumulative_llm_time,
        })

    if not rows:
        return response_dump_df
    # Concatenate once per call: growing a DataFrame row by row copies the
    # whole frame on every iteration.
    return pd.concat([response_dump_df, pd.DataFrame(rows)], ignore_index=True)


In [133]:
# Accumulator for one row per (doctor, question); the column order set here
# is the order of the final table.
response_dump_df = pd.DataFrame(columns=[
    "doctor_id", "condition_id", "question_id", "original_answer", "final_answer",
    "duration", "edit_distance", "insert", "delete", "substitute",
    "num_comments", "comments_length", "cumulative_llm_time",
])
for doctor_id in tqdm(docs_df["id"], total=len(docs_df), desc="Fetching question sets"):
    question_set = get_doc_question_set(doctor_id)
    if question_set is None:
        # get_doc_question_set returns None on a non-200 response; skip this
        # doctor instead of crashing the whole run on None["questions_list"].
        print(f"Skipping doctor {doctor_id}: question set could not be fetched")
        continue
    question_list = question_set["questions_list"]
    response_dump_df = process_question_list(question_list, doctor_id, response_dump_df)

Fetching question sets:   0%|          | 0/5 [00:00<?, ?it/s]

  response_dump_df = pd.concat([response_dump_df, pd.DataFrame([question_data])], ignore_index=True)


In [134]:
# Display the assembled per-question response table as the cell output.
response_dump_df

Unnamed: 0,doctor_id,condition_id,question_id,original_answer,final_answer,duration,edit_distance,insert,delete,substitute,num_comments,comments_length,cumulative_llm_time
0,826bbc5d-0522-f5f2-0a1d-a4b60fa8c871,0,253a9583-e13e-843c-a6c2-33a712ff4ea5,,a0,7000.0,1,1,0,0,0,[],0
1,826bbc5d-0522-f5f2-0a1d-a4b60fa8c871,0,c416d70d-64cf-33f6-a985-04f1ee10e7c7,,gasdghasd,12000.0,1,1,0,0,0,[],0
2,826bbc5d-0522-f5f2-0a1d-a4b60fa8c871,1,663e22e4-2bad-53a2-35be-b9af5a804d7a,a5,a5,3000.0,0,0,0,0,0,[],0
3,826bbc5d-0522-f5f2-0a1d-a4b60fa8c871,1,154f340f-34ca-6eea-7170-df44fe19c529,a9,a9,4000.0,0,0,0,0,0,[],0
