In [0]:
import ast
import itertools
import json
import logging
import os
import re
import sys

from collections import OrderedDict
from copy import deepcopy
from dataclasses import dataclass
from datetime import datetime
from difflib import SequenceMatcher
from io import StringIO
from types import SimpleNamespace

import numpy as np
import pandas as pd
import pandas.api.types as ptypes
import requests
from openai import OpenAI

from azure.core.credentials import AzureKeyCredential
from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

# PDF preprocessing

In [0]:
def scale_y_coordinates(bounding_box, vertical_offset):
    """
    Shift a four-corner polygon down by ``vertical_offset``.

    Args:
        bounding_box: Sequence of exactly four mappings, each with ``'x'`` and ``'y'`` keys.
        vertical_offset: Amount added to every ``'y'`` value; ``'x'`` values are copied unchanged.

    Returns:
        A new list of four ``{'x': ..., 'y': ...}`` dicts; the input is not modified.

    Raises:
        ValueError: If ``bounding_box`` does not contain exactly four corners.
    """
    # Unpacking into four names enforces the four-corner contract.
    first, second, third, fourth = [(corner['x'], corner['y']) for corner in bounding_box]
    return [
        {'x': corner_x, 'y': vertical_offset + corner_y}
        for corner_x, corner_y in (first, second, third, fourth)
    ]


def compute_paragraphs_absolute_coordinates(pdf):
    """
    Attach document-absolute coordinates to every paragraph of ``pdf``.

    Pages are treated as stacked vertically in the order they appear in
    ``pdf["pages"]``: the offset of a page is the sum of the heights of all pages
    listed before it. For each paragraph, the polygon of its first bounding region
    is shifted by the offset of that region's page and stored under
    ``para["bounding_regions"][0]["absolute_bbox"]``.

    Args:
        pdf: Mapping with ``"pages"`` (each having ``"page_number"`` and ``"height"``)
            and ``"paragraphs"`` (each having a non-empty ``"bounding_regions"`` list
            whose first entry has ``"polygon"`` and ``"page_number"``).

    Returns:
        The same ``pdf`` object, modified in place.
    """
    # Vertical offset of each page = total height of the pages before it.
    page_offsets = {}
    cumulative_vertical_offset = 0
    for page in pdf["pages"]:
        page_offsets[page["page_number"]] = cumulative_vertical_offset
        cumulative_vertical_offset += page["height"]

    # Single pass over the paragraphs. Looking the offset up by page number means the
    # result no longer depends on the paragraphs being listed in page order.
    for para in pdf["paragraphs"]:
        region = para["bounding_regions"][0]
        offset = page_offsets.get(region["page_number"])
        if offset is None:
            # Paragraph refers to a page that is not listed; leave it untouched.
            continue
        region["absolute_bbox"] = scale_y_coordinates(region["polygon"], offset)
    return pdf


def compute_tables_absolute_coordinates(pdf):
    """
    Attach document-absolute coordinates to every table and table cell of ``pdf``.

    Pages are treated as stacked vertically in the order they appear in
    ``pdf["pages"]``. The polygon of each table's first bounding region is shifted by
    the offset of that region's page. Each cell is shifted by the offset of the page
    named in the cell's own first bounding region, so cells of a table that continues
    onto a later page get the offset of the page they are actually on. If a cell has
    no usable ``"page_number"``, the table's offset is used.

    Results are stored under ``["bounding_regions"][0]["absolute_bbox"]`` of the table
    and of each cell.

    Args:
        pdf: Mapping with ``"pages"`` (each having ``"page_number"`` and ``"height"``)
            and ``"tables"`` (each having ``"bounding_regions"`` and ``"cells"``).

    Returns:
        The same ``pdf`` object, modified in place.
    """
    # Vertical offset of each page = total height of the pages before it.
    page_offsets = {}
    cumulative_vertical_offset = 0
    for page in pdf["pages"]:
        page_offsets[page["page_number"]] = cumulative_vertical_offset
        cumulative_vertical_offset += page["height"]

    for table in pdf["tables"]:
        table_region = table["bounding_regions"][0]
        table_page = table_region["page_number"]
        if table_page not in page_offsets:
            # Table refers to a page that is not listed; leave it untouched.
            continue
        table_offset = page_offsets[table_page]
        table_region["absolute_bbox"] = scale_y_coordinates(table_region["polygon"], table_offset)

        for cell in table["cells"]:
            cell_region = cell["bounding_regions"][0]
            # Use the cell's own page so multi-page tables stay monotonic in y.
            cell_offset = page_offsets.get(cell_region.get("page_number"), table_offset)
            cell_region["absolute_bbox"] = scale_y_coordinates(cell_region["polygon"], cell_offset)
    return pdf

In [0]:
# Scratch check of a subsection-heading regex variant (adds a "No change" negative lookahead).
# The title part must contain no digits, so '0.1 1' does not match and this evaluates to None.
re.search(  r'^(\d{1,2})\.(\d{1,2})\s+(?!No change)((?!%)[^0-9]*)$', '0.1 1')

In [0]:
def get_toc_page(pdf):
    """
    Find the page that holds the table of contents.

    Pages are scanned in order. Every line except the last one on a page is examined
    together with the line that follows it. A page qualifies when at least 10 of the
    examined lines look like numbered section or subsection headings. The scan stops
    at the first qualifying page.

    Args:
        pdf: Mapping with ``"pages"``; each page has ``"page_number"`` and ``"lines"``,
            and each line has ``"content"``.

    Returns:
        The ``page_number`` of the first qualifying page, provided a table-of-contents
        title ("table of contents", "chapters", "chapter list" or "toc",
        case-insensitive) was seen on any examined line up to and including that page.
        Otherwise ``-1``.
    """
    toc_titles = ["table of contents", "chapters", "chapter list", "toc"]

    def looks_like_heading(current, following):
        # "x.0" followed by an upper-case line.
        if re.match(r'^(\d+)\.0', current, re.IGNORECASE) and following.isupper():
            return True
        # "x.y" followed by a line containing at least one ASCII letter.
        if re.match(r'(\d+)\.(\d+)', current, re.IGNORECASE) and re.sub(r"[^A-Za-z]", "", following).isalpha():
            return True
        # "x.0 Title" or "x.y Title" on a single line.
        if re.match(r'^(\d+)\.0\s+(.*)$', current, re.IGNORECASE):
            return True
        return bool(re.match(r'^(\d+)\.(\d+)\s+(.*)$', current, re.IGNORECASE))

    toc_title_seen = False
    candidate_page = -1

    for page in pdf["pages"]:
        page_lines = page["lines"]
        heading_count = 0

        # Consecutive (line, next line) pairs; the last line is never the "current" one.
        for line, line_after in zip(page_lines, page_lines[1:]):
            current = line["content"].strip()
            following = line_after["content"].strip()

            if current.lower() in toc_titles:
                toc_title_seen = True
            if looks_like_heading(current, following):
                heading_count += 1

        if heading_count >= 10:
            candidate_page = page["page_number"]
            break

    if toc_title_seen and candidate_page != -1:
        return candidate_page
    return -1



def extract_sectionwise_text(pdf):
    """
    Group the paragraphs that follow the table-of-contents page into sections.

    A paragraph opens a new main section when it matches "x.0 Title" or is exactly one
    of the known un-numbered main headings. It opens a new subsection of the current
    main section when it matches "x.y Title". Any other paragraph is appended to the
    content of the open subsection, or of the open main section when no subsection is
    open. Paragraphs seen before the first main section are dropped, as are paragraphs
    starting with "Docusign Envelope" or "RRPORT-IUS" (case-insensitive).

    Args:
        pdf: Mapping with ``"pages"`` and ``"paragraphs"``; every paragraph must
            already carry ``["bounding_regions"][0]["absolute_bbox"]``.

    Returns:
        List of main-section dicts with keys ``number``, ``name``, ``absolute_bbox``,
        ``tables``, ``content`` and ``subsections``. Subsection dicts have keys
        ``main_section_number``, ``subsection_number``, ``name``, ``absolute_bbox``,
        ``tables`` and ``content``.
    """
    exh_main_list = ["TITLE", "OBJECTIVES", "TEST PRODUCTS", "METHODOLOGY", "INFORMED CONSENT", "STUDY PANEL", "PANELIST WITHDRAWAL",
                     "PANELIST DISPOSITION AND DEMOGRAPHICS", "PANELIST ACCOUNTABILITY", "ADVERSE EVENTS", "RESULTS", "DEVIATIONS",
                     "CONCLUSION", "REPORT APPROVAL", "APPENDIX"]
    # Known un-numbered subsection titles. Currently unused: matching on them is disabled.
    exh_sub_list = ['Product Description', 'Product Use Instructions', 'Compliance', 'Inclusion Criteria', 'Exclusion Criteria',
                    'Panelist Accountability', 'Definition of an Adverse Event', 'Definition of a "Serious" Adverse Event',
                    'Documentation of Adverse Events', 'Evaluations', 'Skin Evaluation by Study Personnel', 'Adverse Events and/or Discomfort mentions',
                    'Additional Information for Adverse Events and Discomfort Mentions', 'Investigator Conclusion', "Dermatologist's Conclusion", "Opthalmologist's Conclusion"]

    # "x.0 Section Name" (name must not contain digits)
    main_heading_re = re.compile(r'^(\d{1,2})\.0\s+([^0-9]*)$')
    # "x.y Subsection Name" (name must not start with '%' nor contain digits)
    sub_heading_re = re.compile(r'^(\d{1,2})\.(\d{1,2})\s+((?!%)[^0-9]*)$')
    # Signature-envelope / report-id lines that are not part of the report body
    boilerplate_re = re.compile(r"^(Docusign Envelope).*|^(RRPORT-IUS).*", re.IGNORECASE)

    main_sections = []
    open_section = None
    open_subsection = None
    section_number = '0'

    def close_open_section():
        """Flush the open subsection (if any) into the open section, then store the section."""
        nonlocal open_subsection
        if open_section:
            if open_subsection:
                open_section["subsections"].append(open_subsection)
                open_subsection = None
            main_sections.append(open_section)

    def make_section(number, name, bbox):
        return {
            "number": number,
            "name": name,
            "absolute_bbox": bbox,
            "tables": [],
            "content": [],
            "subsections": [],
        }

    toc_page_num = get_toc_page(pdf)
    print('#######################################')
    print('toc_page_number: ', toc_page_num)
    # Only paragraphs after the TOC page are considered (all pages when it is -1).
    paragraphs = [para for para in pdf["paragraphs"]
                  if para["bounding_regions"][0]["page_number"] > toc_page_num]

    for paragraph in paragraphs:
        text = paragraph["content"].strip()
        abs_bbox = paragraph["bounding_regions"][0]["absolute_bbox"]

        if boilerplate_re.match(text):
            continue

        numbered_heading = main_heading_re.match(text)
        if numbered_heading or text in exh_main_list:
            close_open_section()
            if numbered_heading:
                section_number = numbered_heading.group(1)
                section_name = numbered_heading.group(2)
            else:
                # NOTE(review): un-numbered headings get a float-formatted number such as
                # '4.0', whereas numbered headings keep the bare digits ('4'). Confirm intended.
                section_number = str(float(section_number) + 1)
                section_name = text
            open_section = make_section(section_number, section_name, abs_bbox)
            continue

        sub_heading = sub_heading_re.match(text)
        if sub_heading and open_section:
            if open_subsection:
                open_section["subsections"].append(open_subsection)
            section_number = sub_heading.group(1)
            open_subsection = {
                "main_section_number": section_number,
                "subsection_number": sub_heading.group(2),
                "name": sub_heading.group(3),
                "absolute_bbox": abs_bbox,
                "tables": [],
                "content": [],
            }
            continue

        # Ordinary paragraph: goes to the innermost open container, if any.
        entry = {"text": text, "absolute_bbox": abs_bbox}
        if open_subsection:
            open_subsection["content"].append(entry)
        elif open_section:
            open_section["content"].append(entry)

    # Flush whatever is still open at the end of the document.
    close_open_section()

    return main_sections


# sections = extract_sectionwise_text(pdf)
# sections

In [0]:
def localize_tables_in_sections(pdf, sections):
    """
    Record, for every section and subsection, which tables start inside it.

    A table belongs to the heading whose vertical span contains the table's top edge
    (the ``y`` of the first corner of its ``absolute_bbox``). A span runs from the
    heading's own top edge to the top edge of the next heading: the first subsection,
    the next subsection, or the next main section. The last span is open-ended. Both
    bounds are exclusive. Tables on or before the table-of-contents page, and tables
    above the first section, are ignored.

    Each hit is stored as ``"/tables/<index>"``, where ``<index>`` is the table's
    position in ``pdf["tables"]``.

    Args:
        pdf: Mapping with ``"pages"`` and ``"tables"``; every table considered must
            already carry ``["bounding_regions"][0]["absolute_bbox"]``.
        sections: Output of ``extract_sectionwise_text``. It is not modified.

    Returns:
        A deep copy of ``sections`` with the ``"tables"`` lists filled in.
    """
    sections = deepcopy(sections)
    toc_page_num = get_toc_page(pdf)

    # Keep each table's ORIGINAL index next to its top edge, so references stay valid
    # even if the filtered-out tables are not a prefix of pdf["tables"].
    table_tops = [
        (table_idx, table["bounding_regions"][0]["absolute_bbox"][0]['y'])
        for table_idx, table in enumerate(pdf["tables"])
        if table["bounding_regions"][0]["page_number"] > toc_page_num
    ]

    def top_of(heading):
        return heading["absolute_bbox"][0]['y']

    def assign_tables(target, lower_y, upper_y):
        """Append every table with lower_y < top < upper_y (upper_y=None: unbounded)."""
        for table_idx, table_top in table_tops:
            if lower_y < table_top and (upper_y is None or table_top < upper_y):
                target["tables"].append(f"/tables/{table_idx}")

    for position, section in enumerate(sections):
        is_last_section = position == len(sections) - 1
        next_section_y = None if is_last_section else top_of(sections[position + 1])
        subsections = section.get("subsections", [])

        if not subsections:
            # The whole span up to the next main section belongs to this section.
            assign_tables(section, top_of(section), next_section_y)
            continue

        # Between the main heading and its first subsection.
        assign_tables(section, top_of(section), top_of(subsections[0]))
        # Between two consecutive subsections: belongs to the earlier one.
        for earlier, later in zip(subsections, subsections[1:]):
            assign_tables(earlier, top_of(earlier), top_of(later))
        # After the last subsection, up to the next main section (or the end).
        assign_tables(subsections[-1], top_of(subsections[-1]), next_section_y)

    return sections

In [0]:
def dataframe_to_table_str(df):
    """
    Render a DataFrame as a plain-text grid with centred cells.

    The header row is followed by a ``+---+`` rule, and every data row is followed by
    the same rule. Multi-level (tuple) column labels are joined with `` - ``. All
    labels and values are converted with ``str()`` before their width is measured.

    Args:
        df: The DataFrame to render. It may have no rows and/or no columns.

    Returns:
        The rendered table as a single string ending with a newline.
    """
    labels = [
        ' - '.join(str(level) for level in label) if isinstance(label, tuple) else str(label)
        for label in df.columns.tolist()
    ]
    rows = [[str(value) for value in row] for row in df.values.tolist()]

    # Column width = widest of the header label and every value in that column.
    widths = [
        max([len(labels[i])] + [len(row[i]) for row in rows])
        for i in range(len(labels))
    ]

    rule = '+-' + '-+-'.join('-' * width for width in widths) + '-+\n'

    def render(cells):
        return '| ' + ' | '.join(f'{cell:^{width}}' for cell, width in zip(cells, widths)) + ' |\n'

    # Collect the pieces and join once instead of growing a string in a loop.
    pieces = [render(labels), rule]
    for row in rows:
        pieces.append(render(row))
        pieces.append(rule)
    return ''.join(pieces)


def _inline_tables_into_content(pdf, holder):
    """
    Sort ``holder["content"]`` top-to-bottom and, for each table referenced in
    ``holder["tables"]``, replace the run of paragraphs covered by the table with one
    entry holding the rendered table. ``holder`` is modified in place.
    """
    # Reading order: top edge first, then left edge (first corner of the box).
    holder["content"].sort(
        key=lambda item: (item["absolute_bbox"][0]["y"], item["absolute_bbox"][0]["x"])
    )

    for table_ref in holder.get("tables", []):
        table_idx = int(table_ref.split("/")[-1])
        table = pdf["tables"][table_idx]
        table_bbox = table["bounding_regions"][0]["absolute_bbox"]
        # Vertical extent: top of the first cell to the bottom of the last cell.
        table_top = table["cells"][0]["bounding_regions"][0]["absolute_bbox"][0]['y']
        table_bottom = table["cells"][-1]["bounding_regions"][0]["absolute_bbox"][3]['y']
        table_df = TableProcessor(table).to_dataframe()

        content = holder["content"]
        last_index = len(content) - 1

        # First paragraph that lies inside the table's vertical extent.
        start_index = None
        for position, item in enumerate(content):
            item_box = item["absolute_bbox"]
            if table_top <= item_box[0]['y'] and table_bottom > item_box[3]['y']:
                start_index = position
                break
        if start_index is None:
            continue

        # Last paragraph covered by the table. When the run starts on the final
        # paragraph there is nothing left to scan, so the run is that one paragraph.
        end_index = start_index if start_index == last_index else None
        for position in range(start_index + 1, len(content)):
            item_box = content[position]["absolute_bbox"]
            if item_box[0]['y'] > table_bottom:
                # First paragraph below the table: the run ended just before it.
                end_index = position - 1
                break
            if position == last_index and item_box[3]['y'] <= table_bottom:
                # Ran off the end while still inside the table.
                end_index = last_index
        if end_index is None:
            continue

        table_entry = {"text": dataframe_to_table_str(table_df), "absolute_bbox": table_bbox}
        holder["content"] = content[:start_index] + [table_entry] + content[end_index + 1:]


def replace_text_with_table(pdf, sections):
    """
    Replace the paragraphs that duplicate table cells with a rendered text table.

    For every section and subsection, the content is sorted into reading order. For
    each table listed in its ``"tables"``, the consecutive paragraphs lying within the
    table's vertical extent are replaced by a single content entry. That entry's
    ``"text"`` is the table rendered by ``dataframe_to_table_str`` and its
    ``"absolute_bbox"`` is the table's box.

    Args:
        pdf: Mapping with ``"tables"``; tables and their cells must already carry
            ``absolute_bbox`` values.
        sections: Sections with ``"tables"`` references of the form ``"/tables/<index>"``.
            It is not modified.

    Returns:
        A deep copy of ``sections`` with the table text replaced.
    """
    sections = deepcopy(sections)
    for section in sections:
        _inline_tables_into_content(pdf, section)
        for subsection in section.get("subsections") or []:
            _inline_tables_into_content(pdf, subsection)
    return sections



def get_sectionwise_text(sections):
    """
    Flatten structured sections into plain text, keyed by section name.

    Args:
        sections: List of section dicts with ``"name"``, ``"content"`` and
            ``"subsections"``; content entries have a ``"text"`` key.

    Returns:
        ``{section name: {"content": str, "subsections": {subsection name: str}}}``.
        ``"content"`` is empty when the section has no direct content. Otherwise it is
        the section name with U+0333 inserted between its characters, a blank line,
        then one line per content entry. ``"subsections"`` is present only when the
        section has subsections; each value is the subsection name followed by its
        content entries, separated by blank lines.
    """
    rendered = {}
    for section in sections:
        title = section["name"]
        entry = {"content": ""}
        rendered[title] = entry

        if section["content"]:
            parts = ["\u0333".join(title) + "\n\n"]
            parts.extend(item["text"] + "\n" for item in section["content"])
            entry["content"] = "".join(parts)

        if section["subsections"]:
            sub_texts = {}
            entry["subsections"] = sub_texts
            for subsection in section["subsections"]:
                heading = "\n" + f"{subsection['name']}" + "\n\n"
                body = "".join(item["text"] + "\n\n" for item in subsection["content"])
                sub_texts[subsection["name"]] = heading + body
    return rendered



# Llama API

In [0]:
# Llama
class LlamaServer:
    '''
    Client for a hosted LLAMA chat-completion endpoint.
    '''
    def __init__(self, endpoint: str, api_key: str):
        '''
        Store the endpoint URL and the access token.
        Args:
            endpoint: URL of the hosted LLAMA endpoint
            api_key: Access token used to query the LLAMA endpoint
        '''
        self.llama_enpoint = endpoint
        self.llama_key = api_key

    def chat_completion(self, messages, **gen_config):
        '''
        Send a list of chat messages to the hosted LLAMA and return its parsed reply.
        Args:
            messages: A list of messages to query hosted LLAMA
            **gen_config: Generation parameters, sent alongside the messages in the request body
        Returns:
            The content of the first choice, passed through ``_fix_json_response``
            (normally a dict or list; the cleaned text if it could not be parsed).
            ``None`` if the request failed, the server did not answer with HTTP 200,
            or the reply could not be processed. Failures are logged, never raised.
        '''
        try:
            llama_headers = {
                "Content-Type": "application/json",
                # NOTE(review): the standard HTTP header for bearer tokens is
                # "Authorization"; left as-is, confirm what the endpoint expects.
                "Authentication": f"Bearer {self.llama_key}"
            }
            llama_data = {
                "messages": messages,
                **gen_config
            }
            llama_response_obj = requests.post(self.llama_enpoint, headers=llama_headers, data=json.dumps(llama_data))
            if llama_response_obj.status_code != requests.codes.ok:
                logging.warning("Llama3 serving endpoint returned HTTP %s", llama_response_obj.status_code)
                return None
            llama_response = llama_response_obj.json()['choices'][0]['message']['content']
            return self._fix_json_response(llama_response)
        except Exception as e:
            # Deliberately best-effort: callers get None, but the cause is not lost.
            logging.error("An error occurred while querying llama3 serving endpoint: %s", e)
            return None

    def _fix_json_response(self, response: str) -> dict:
        """
        Parse an LLM reply as JSON, repairing common formatting problems.

        Repairs are tried from least to most invasive, so a reply that is valid JSON
        inside markdown fences or surrounding prose is returned unaltered:

        1. Parse the reply as-is.
        2. Strip markdown code fences / a leading ``json`` label and parse the
           outermost ``{...}`` or ``[...]`` span.
        3. Replace curly double quotes and single quotes with ``"``, turn ``a/b`` into
           a decimal (left alone when ``b`` is zero), add one missing bracket, then
           parse.
        4. Flatten newlines and tabs, then parse; finally parse wrapped in ``[...]``.

        Args:
            response (str): The response string from LLAMA.

        Returns:
            The parsed JSON value (usually a dict or list). If every attempt fails, a
            message is printed and the cleaned response text (str) is returned.
        """
        json_span = re.compile(r'\{[\s\S]*\}|\[[\s\S]*\]')

        def extract_span(text):
            found = json_span.search(text)
            return found.group(0) if found else text

        # 1. Untouched.
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            pass

        # 2. Lossless clean-up: only fences and a leading "json" label are removed,
        #    so the word "json", apostrophes and dates inside values survive.
        response = re.sub(r'```(?:json)?', '', response)
        response = re.sub(r'^\s*json\b', '', response)
        try:
            return json.loads(extract_span(response.strip()))
        except json.JSONDecodeError:
            pass

        # 3. Invasive repairs.
        response = response.replace('“', '"').replace('”', '"')
        response = response.replace("'", '"')

        def fraction_to_decimal(found):
            denominator = float(found.group(2))
            if denominator == 0:
                return found.group(0)  # cannot be evaluated; leave the text alone
            return str(float(found.group(1)) / denominator)

        response = re.sub(r'(\d+)/(\d+)', fraction_to_decimal, response)
        response = response.strip()
        cleaned_string = extract_span(response)

        open_curly = cleaned_string.count('{')
        close_curly = cleaned_string.count('}')
        open_square = cleaned_string.count('[')
        close_square = cleaned_string.count(']')

        # Add the single missing bracket of an otherwise complete object/array.
        if open_curly == 1 and close_curly == 0:
            cleaned_string += '}'
        elif close_curly == 1 and open_curly == 0:
            cleaned_string = '{' + cleaned_string
        elif open_square == 1 and close_square == 0:
            cleaned_string += ']'
        elif close_square == 1 and open_square == 0:
            cleaned_string = '[' + cleaned_string

        # No brackets at all: assume bare "key": value pairs.
        if open_curly == 0 and close_curly == 0 and open_square == 0 and close_square == 0:
            cleaned_string = '{' + cleaned_string + '}'

        try:
            return json.loads(cleaned_string)
        except json.JSONDecodeError:
            pass

        # 4. Raw newlines/tabs inside strings are invalid JSON; flatten them.
        cleaned_string = cleaned_string.replace("\n", " ").replace("\t", " ")
        for attempt in (cleaned_string, f"[{cleaned_string}]"):
            try:
                return json.loads(attempt)
            except json.JSONDecodeError:
                continue

        print("Unable to fix JSON response from LLM. The response:\n", response)
        return response

# Chat prompt


In [0]:
# Prompts for attribute extraction
class ChatPrompt:
    '''
    This class contains different prompts to query LLAMA server in order to extract different information.

    Every public ``*_prompt`` method takes the section-wise text (a dict keyed by
    section name), pulls the relevant section text out of it and returns a list of
    three chat messages (system, user, assistant pre-fill).
    '''
    _logger = logging.getLogger(__name__)

    # System prompt shared by most prompts (note: no trailing period).
    _JSON_ONLY_SYSTEM_PROMPT = "You are a helpful AI assistant, who can read some given content and extract attributes from that content in JSON format. You are only capable of responding in JSON format and nothing else"
    # Opening sentence of the user message shared by most prompts.
    _TABLES_INTRO = "Given below is a text about a study that contains tables:"

    def _build_messages(self, system_prompt, intro, text, instructions, prefill="{"):
        '''
        Assembles the three-message chat payload used by every prompt.
        Args:
            system_prompt: content of the system message
            intro: opening sentence of the user message
            text: section text the model should read
            instructions: extraction instructions appended after the text
            prefill: content of the trailing assistant message
        Returns:
            list of three dicts with "role" and "content" keys
        '''
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"{intro}\n\n{text}\n\n{instructions}"},
            {"role": "assistant", "content": prefill},
        ]

    def _collect_section_text(self, section_text, topic_pattern, parent_pattern):
        '''
        Gathers the text of a topic from the section-wise dictionary.
        Args:
            section_text: dict mapping section names to section entries
            topic_pattern: regex identifying the wanted section / subsection name
            parent_pattern: regex identifying the parent section to fall back to
        Returns:
            The concatenated text, or "" when nothing matched.
        '''
        collected = ""
        for key, section in section_text.items():
            # Primary lookup: a top-level section whose name starts with the topic.
            if re.match(topic_pattern, key, re.IGNORECASE) and isinstance(section, dict):
                if "subsections" in section:
                    for sub_text in (section["subsections"] or {}).values():
                        collected += sub_text + "\n\n"
                elif "content" in section:
                    collected += section["content"] + "\n"

            # Fallback: only while nothing has been collected yet, look for the topic
            # among the subsections of the parent section. Non-dict entries are skipped
            # instead of raising AttributeError on `.get`.
            if not collected and re.match(parent_pattern, key, re.IGNORECASE) and isinstance(section, dict):
                for sub_key, sub_text in (section.get("subsections") or {}).items():
                    if re.search(topic_pattern, sub_key, re.IGNORECASE):
                        collected += sub_text + "\n"
        return collected

    def _split_evaluation_and_adverse_event_text(self, section_text):
        '''
        Splits a section's text at the first numbered "adverse events" heading.
        Args:
            section_text: raw text of the section
        Returns:
            (evaluation_text, remaining_text), both stripped. Lines before the heading
            that start with "<", "!" or "docusign" are dropped from evaluation_text.
            When no heading is found, all of the text is treated as evaluation text
            and remaining_text is "".
        '''
        with StringIO(section_text) as f:
            text_lines = f.readlines()

        # Default to "no heading": everything belongs to the evaluation part.
        split_at = len(text_lines)
        for index, line in enumerate(text_lines):
            if re.match(r'^[\d#].*\.\d+', line) and "adverse events" in line.strip().lower():
                split_at = index
                break

        evaluation_lines = [
            line for line in text_lines[:split_at]
            if not (line.startswith(("<", "!")) or line.lower().startswith("docusign"))
        ]
        evaluation_text = "".join(evaluation_lines)
        remaining_text = "".join(text_lines[split_at:])

        return evaluation_text.strip(), remaining_text.strip()

    def _get_adverse_events(self, section_text):
        '''
        Returns the adverse-event text: an "adverse event..." section, otherwise the
        matching subsections of a "study plan..." section.
        '''
        return self._collect_section_text(section_text, r'adverse event', r'study plan')

    def _get_inclusion_criteria(self, section_text):
        '''
        Returns the inclusion-criteria text: an "inclusion criteria..." section, otherwise
        the matching subsections of a "study panel..." section.
        '''
        return self._collect_section_text(section_text, r'inclusion criteria', r'study panel')

    def _get_evaluation(self, section_text):
        '''
        Returns the evaluation text: an "evaluation..." section, otherwise the matching
        subsections of a "study plan..." section.
        '''
        return self._collect_section_text(section_text, r'evaluation', r'study plan')

    def _get_panelist_accountability_text(self, section_text):
        '''
        Returns the panelist text: a "panelist disposition..." section, otherwise the
        matching subsections of a "study panel..." section.
        '''
        return self._collect_section_text(section_text, r'panelist disposition', r'study panel')

    def skin_type_prompt(self, sectionwise_text):
        '''
        Returns a chat prompt to query LLAMA to extract - Skin types (read from the inclusion criteria text)
        '''
        text = self._get_inclusion_criteria(sectionwise_text)
        demographics = '''Extract all unique mentions of skin types values mentioned as a text  and return it as a list under key Skin_Type.
        Ensure the following restrictions when picking skin types:
        1. Skin type is not fitzpatrick type
        2. Skin type is not skin tones
        3. skin type is not test site
        4. skin type is not ethnicity

        If the skin type is not found, please return an empty list.

        Return the answer strictly in following JSON format with following key:
        1. Skin_Type

        If no text is provided, please return both the keys as null.
        '''

        return self._build_messages(
            self._JSON_ONLY_SYSTEM_PROMPT,
            "Given below is a text about a study that contains panelist information:",
            text,
            demographics,
        )

    def instrument_prompt(self, sectionwise_text):
        '''
        Returns a chat prompt to query LLAMA to extract - Instruments used (read from the evaluation text)
        '''
        self._logger.debug("inside instrument prompt")
        text = self._get_evaluation(sectionwise_text)

        demographics = '''Please extract the instrument used from the above text and return it in JSON format with key 'instrument'. If the instrument is not found, please return both the key as null.

        Return the answer strictly in following JSON format:
        {"instrument": list of name of instruments used}

        If no text is provided, please return both the keys as null.'''

        return self._build_messages(self._JSON_ONLY_SYSTEM_PROMPT, self._TABLES_INTRO, text, demographics)

    def fitzpatrick_skin_lip_prompt(self, sectionwise_text):
        '''
        Returns a chat prompt to query LLAMA to extract - Fitzpatrick type, skin tone and lip type

        An attribute is only requested when its trigger word occurs in the panelist
        text; requested attributes are numbered consecutively in the order
        Fitzpatrick, skin, lip.
        '''
        text = self._get_panelist_accountability_text(sectionwise_text)

        demographics = """
        Give me the following information from the above text
        {fp_prompt}
        {skin_prompt}
        {lip_prompt}

        Your JSON should have the following keys:
        {fp_key}
        {skin_key}
        {lip_key}
        """

        # (trigger word, template slot prefix, JSON key, instruction, constraint)
        attribute_specs = [
            ("fitzpatrick", "fp", "Fitzpatrick_Type",
             "Extract all unique mentions of Fitzpatrick skin types mentioned in the respective markdown format content and return it as a list under key Fitzpatrick_Type.",
             "Ensure that the extraction for Fitzpatrick type focuses solely on Fitzpatrick type names and does not include Skin types or any other classifications."),
            ("skin", "skin", "Skin_Tone",
             "Extract all unique mentions of skin tone values mentioned in the respective markdown format content and return it as a list under key Skin_Tone.",
             "Ensure that the extraction for Skin tone focuses solely on skin tone names and does not include Fitzpatrick types, skin type(normal, dry, oily) or any other classifications."),
            ("lip", "lip", "Lip_Type",
             "Find all unique mentions of lip types mentioned on the respective markdown format content and return it as a list under key Lip_Type.",
             "Ensure that the extraction for Lip type focuses solely on Lip type names and does not include Skin types or any other classifications."),
        ]

        lowered_text = text.lower()
        slots = {}
        constraints = []
        counter = 0
        for trigger, slot, json_key, instruction, constraint in attribute_specs:
            if trigger in lowered_text:
                counter += 1
                slots[f"{slot}_key"] = f"{counter}. {json_key}"
                slots[f"{slot}_prompt"] = f"{counter}. {instruction}"
                constraints.append(constraint)
            else:
                # Slots of attributes that are not requested stay as empty lines.
                slots[f"{slot}_key"] = ""
                slots[f"{slot}_prompt"] = ""

        demographics = demographics.format(**slots)
        for constraint in constraints:
            demographics += "\n" + constraint

        return self._build_messages(
            # This prompt's system message carries a trailing period.
            self._JSON_ONLY_SYSTEM_PROMPT + ".",
            "Given below are some tables. Extract the attributes from their respective tables.",
            text,
            demographics,
        )

    def number_enrolled_completed_prompt(self, sectionwise_text):
        '''
        Returns a chat prompt to query LLAMA to extract - Number Enrolled and Number Completed
        '''
        text = self._get_panelist_accountability_text(sectionwise_text)

        demographics = '''From the above text,
        Please extract the total number of panelists who enrolled in the study.
        Also, extract the total number of panelists who completed the study.

        Please return the answer in following JSON format:
        {"Number_Enrolled": total Number of panelists who enrolled in the study, "Number_Completed": total Number of panelists who completed the study}

        if any of the numbers are not found, specify it as null.
        If no text is provided. Return the value for both the numbers as null'''

        return self._build_messages(self._JSON_ONLY_SYSTEM_PROMPT, self._TABLES_INTRO, text, demographics)

    def age_range_prompt(self, sectionwise_text):
        '''
        Returns a chat prompt to query LLAMA to extract - Minimum and maximum ages
        '''
        text = self._get_panelist_accountability_text(sectionwise_text)

        demographics = '''Please extract the age range of panelists from the above text and return it in JSON format as minimum-maximum under the keys 'min_age' and 'max_age'. If the age range is not found, please return both the keys as null.

        Please return the answer in following JSON format:
        {"min_age": minimum age, "max_age": maximum age}

        If no text is provided, please return both the keys as null.'''

        return self._build_messages(self._JSON_ONLY_SYSTEM_PROMPT, self._TABLES_INTRO, text, demographics)

    def ethnicity_prompt(self, sectionwise_text):
        '''
        Returns a chat prompt to query LLAMA to extract - list of ethnicities
        '''
        text = self._get_panelist_accountability_text(sectionwise_text)

        demographics = '''From the above text,
        Please extract all unique mentions of ethnicities (e.g., Asian, Hispanic) from the text and return them as a JSON object with an list under the key 'Ethnicity'
        (provide the ethnicity names as mentioned in the text and exlcude the count of panelists).

        Please return the answer in following JSON format:
         {"Ethnicity": list of Ethnicities}

        Here is an example of the response should look like: {"Ethnicity": ["Hispanic", "Indian", "Black African/American"] ...}

        Please ensure that the extraction focuses solely on ethnicity names and does not include Fitzpatrick skin types, skin tones or any other classifications.

        If no ethnicities are found, please ensure that the JSON output still includes the 'Ethnicity' key with an empty list.
        If no text is provided, please ensure that the JSON output still includes the 'Ethnicity' key with an empty list.'''

        return self._build_messages(self._JSON_ONLY_SYSTEM_PROMPT, self._TABLES_INTRO, text, demographics)

    def adverse_event_attribute_prompt(self, sectionwise_text):
        '''
        Returns a chat prompt to query LLAMA and extract - Number of Adverse events

        Unlike the other prompts, the assistant message is left empty (no "{" pre-fill)
        and a dedicated system message is used.
        '''
        adverse_event_text = self._get_adverse_events(sectionwise_text)
        # Debug level only: the section text can be long and comes straight from the study report.
        self._logger.debug("adverse event text: %s", adverse_event_text)

        inst = '''Please extract the number of Adverse Events in the above text.
        Adverse Events represents the number of  panelists who reported discomfort or irritation or adverse reaction to the test product.

        Please return the answer in following JSON format:
        {"Adverse_Events": count of Adverse Events}.'''

        return self._build_messages(
            "You are a helpful AI assistant, an expert at extracting information accurately from the provided text and tables. You are designed to output JSON only (with the right syntax) and nothing else. ",
            "Given below is a text that contains tables:",
            adverse_event_text,
            inst,
            prefill="",
        )



# Table Attributes

In [0]:
# Process tables
class TableProcessor:
    '''
    Contains methods to process a table obtained from Azure Document Intelligence response
    '''
    def __init__(self, table):
        '''
        Initialize the class with the table obtained from Azure Document Intelligence response
        Args:
            table:  Table to be processed; its "cells", "row_count" and "column_count" entries are read
        '''
        self.cells = table["cells"]
        self.n_rows = table["row_count"]
        self.n_cols = table["column_count"]

    # Convert to dataframe
    def to_dataframe(self):
        '''
        Converts the table to pandas dataframe
        Returns:
            A dataframe whose columns come from the leading header row(s): plain columns
            for one header level, MultiIndex columns for several, and default integer
            columns when no header cell exists. An empty first header is replaced by
            "<rowHeader>".
        '''
        col_levels = self._get_column_levels()
        vals = [cell["content"] if cell else None for cell in self.replicate_cells_spanwise()]
        vals_arr = [[vals[i*self.n_cols + j] for j in range(self.n_cols)] for i in range(self.n_rows)]

        df = pd.DataFrame(vals_arr)

        if col_levels == 1:
            df.columns = df.iloc[0]
            # drop=True discards the old index directly, so a data column that happens
            # to be called "index" can never be removed by mistake.
            df = df.iloc[col_levels:].reset_index(drop=True)
            if not df.columns[0]:
                df.columns = ["<rowHeader>"] + df.columns[1:].tolist()

        if col_levels > 1:
            col_arr = [df.iloc[i].tolist() for i in range(col_levels)]
            for level_labels in col_arr:
                if not level_labels[0]:
                    level_labels[0] = "<rowHeader>"
            df.columns = pd.MultiIndex.from_arrays(col_arr, names = [None]*col_levels)
            df = df.iloc[col_levels:]
            if not df.columns.is_monotonic_increasing:
                df = df.sort_index(axis=1)
            df = df.reset_index(drop=True)
        return df

    #Converts the rows of the table into a list of json objects
    def to_json(self):
        '''
        Converts the table into a list of json objects, each element representing a row in the table
        '''
        df = self.to_dataframe()
        return json.loads(df.to_json(orient="records"))


    def replicate_cells_spanwise(self):
        '''
        Expands the table cells to match ( num_rows*num_columns ) in the table
        Returns:
            A row-major list of length n_rows*n_cols. Every grid position covered by a
            cell (including the whole rectangle of a spanning cell) holds that cell;
            uncovered positions hold None.
        '''
        grid = [None] * (self.n_rows * self.n_cols)
        for cell in self.cells:
            top, left = cell["row_index"], cell["column_index"]
            # A span that is missing, None or 0 counts as a single row/column.
            row_span = int(cell.get("row_span") or 1)
            col_span = int(cell.get("column_span") or 1)

            # Clamp to the table so a span never wraps into the next row or runs off the grid.
            bottom = min(top + max(row_span, 1), self.n_rows)
            right = min(left + max(col_span, 1), self.n_cols)
            for row in range(top, bottom):
                for column in range(left, right):
                    # The row offset is required here: without it every column-span
                    # copy would land in row 0.
                    grid[row*self.n_cols + column] = cell

        return grid

    # Gets the number of rows as column header from given table cells
    def _get_column_levels(self):
        '''
        Function to determine the number of column levels in the table
        Returns:
            0 when the table has no "columnHeader" cell; otherwise the number of
            consecutive header rows starting at row 0, and at least 1 (so a header
            that does not start at row 0 still yields one level).
        '''
        header_rows = sorted({
            cell["row_index"]
            for cell in self.cells
            if cell and cell.get("kind", None) == "columnHeader"
        })
        if not header_rows:
            return 0

        contiguous = 0
        for expected_row, actual_row in enumerate(header_rows):
            if expected_row != actual_row:
                break
            contiguous += 1
        return max(contiguous, 1)


In [0]:
class TableAttributesExtractor:
    '''
    Extracts attribute values from the tables that belong to the RESULTS section of a document.
    '''
    _logger = logging.getLogger(__name__)

    # Column layout of the long-format dataframe built by get_dataframe_from_tables.
    _LONG_COLUMNS = ["Attribute_Name", "Attribute_Value", "Row_Number", "Table_Number", "Section_Name", "Table_Header"]

    def get_results_section(self, sections):
        '''
        Finds the first section named exactly 'RESULTS' or 'RESULT'.
        Returns:
            (section, subsections) where subsections is section.get("subsections"),
            or None when no such section exists.
        '''
        for section in sections:
            if section['name'] in ('RESULTS', 'RESULT'):
                return section, section.get("subsections")
        return None

    def get_tables_to_main_section(self, section, sub_sections):
        '''
        Adds the tables of every subsection to the main section's 'tables' list.

        The section is updated in place and returned. A table reference already
        present in the list is not added again, so calling this repeatedly on the
        same section does not duplicate tables. sub_sections may be None or empty.
        '''
        main_tables = section.setdefault('tables', [])
        for subsection in sub_sections or []:
            for table in subsection.get('tables', []):
                if table not in main_tables:
                    main_tables.append(table)
        return section

    def get_table_header(self, table, pdf):
        '''
        Picks the paragraph to use as the heading of a table.

        Considers the paragraphs that come before the one whose absolute bounding box
        equals the table's first cell (all paragraphs when none matches) and returns
        the content of the one with the smallest vertical gap between its bottom edge
        and the top edge of that cell.
        Returns:
            The paragraph content, or "Not Available" when there is no candidate.
        '''
        if not table.get('cells'):
            return "Not Available"
        first_cell_bbox = table['cells'][0]['bounding_regions'][0]['absolute_bbox']

        preceding_paragraphs = []
        for para in pdf['paragraphs']:
            if para['bounding_regions'][0]['absolute_bbox'] == first_cell_bbox:
                break
            preceding_paragraphs.append(para)
        if not preceding_paragraphs:
            return "Not Available"

        cell_top = min(first_cell_bbox[0]["y"], first_cell_bbox[1]["y"])

        def gap_to_cell(para):
            para_bbox = para['bounding_regions'][0]['absolute_bbox']
            return cell_top - max(para_bbox[2]["y"], para_bbox[3]["y"])

        # min() returns the first paragraph on ties.
        return min(preceding_paragraphs, key=gap_to_cell)['content']

    def get_dataframe_from_tables(self, tables, section, pdf,results_section_columns,df_clinical_mapping):
        '''
        Builds a long-format dataframe (one row per table cell) for the mapped columns.
        Args:
            tables: table references whose last "/"-separated part is the index into pdf["tables"]
            section: section the tables belong to; its 'name' is recorded on every row
            pdf: json object containing tables and paragraphs of text
            results_section_columns: normalised column names that should be extracted
            df_clinical_mapping: dataframe with columns "RESULTS" and "Attribute Mapped Results"
        Returns:
            A dataframe with columns Attribute_Name, Attribute_Value, Row_Number,
            Table_Number, Section_Name and Table_Header (empty when nothing matched).
        Raises:
            KeyError: when a wanted column has no entry in the mapping.
        '''
        results_mapping_dictionary = df_clinical_mapping.set_index("RESULTS")["Attribute Mapped Results"].to_dict()
        records = []
        for table in tables:
            table_number = int(table.split("/")[-1])
            raw_table = pdf["tables"][table_number]
            table_df = TableProcessor(raw_table).to_dataframe()
            # Raw string: "\s" in a plain literal is an invalid escape sequence.
            table_header = re.sub(r"[^a-zA-Z\s]", "", self.get_table_header(raw_table, pdf)).strip()
            self._logger.debug("table header: %s", table_header)

            for position, column in enumerate(table_df.columns):
                # Multi-level (tuple) headers and non-string headers are not mapped.
                if not isinstance(column, str):
                    continue
                normalized = column.lower().strip("*: ")
                if normalized not in results_section_columns:
                    continue
                # Positional access keeps this working when two columns share a name.
                values = table_df.iloc[:, position].tolist()
                records.extend(
                    [results_mapping_dictionary[normalized], value, row_number, table_number, section['name'], table_header]
                    for row_number, value in enumerate(values)
                )
        # Build the frame once instead of concatenating inside the loop.
        return pd.DataFrame(records, columns=self._LONG_COLUMNS)

    def get_results_attributes(self, df):
        '''
        Pivots the long-format dataframe into one wide dataframe per table and stacks them.
        Args:
            df: dataframe produced by get_dataframe_from_tables
        Returns:
            A dataframe with one column per attribute name; empty when df has no rows.
        '''
        if df.empty or "Table_Number" not in df.columns:
            return pd.DataFrame()

        wide_frames = []
        for table_num in df["Table_Number"].unique().tolist():
            table_rows = df[df["Table_Number"] == table_num]
            # NOTE(review): pd.DataFrame raises ValueError if two source columns map to
            # the same attribute, because the lists then differ in length - confirm
            # whether that can happen with the real mapping.
            values_by_attribute = {
                name: table_rows.loc[table_rows["Attribute_Name"] == name, "Attribute_Value"].tolist()
                for name in table_rows["Attribute_Name"].unique().tolist()
            }
            wide_frames.append(pd.DataFrame(values_by_attribute))

        return pd.concat(wide_frames, ignore_index=True)

    def results_tables(self, pdf, sections):
        '''
        Extracts the attributes of all tables in the RESULTS section.

        Relies on the notebook-level names `results_section_columns` and
        `df_clinical_mapping`, which must be defined before this is called.
        Returns:
            The wide attributes dataframe; empty when there is no RESULTS section.
        '''
        found = self.get_results_section(sections)
        if found is None:
            return pd.DataFrame()
        section, sub_sections = found
        section = self.get_tables_to_main_section(section, sub_sections)
        long_df = self.get_dataframe_from_tables(section['tables'], section, pdf, results_section_columns, df_clinical_mapping)
        return self.get_results_attributes(long_df)

In [0]:
class AttributeExtractor:
    '''
    This class contains methods to extract various required attributes.
    '''
    def __init__(self, llm: LlamaServer, chat_prompt: ChatPrompt, gen_config:dict):
        '''
        Initializes the instance with inputs required to extract the atrributes
        Args:
            llm: LLAMA instance
            chat_prompt: contains chat prompts to query the llm
            gen_config: a dictionary of parameters to configure llm generation
        '''
        self.llm = llm
        self.chat_prompt = chat_prompt
        self.gen_config = gen_config


    def results_attributes(self, pdf, sections):
        '''
        Delegates results table extraction to a fresh TableAttributesExtractor
        Args:
            pdf: json object containing tables and paragraphs of text
            sections: sections passed through to TableAttributesExtractor.results_tables
        Returns:
            The dataframe returned by TableAttributesExtractor.results_tables
        '''
        return TableAttributesExtractor().results_tables(pdf, sections)


    def adverse_events(self, sectionwise_text):
        '''
        Takes the sectionwise segregated text as input and returns adverse events count
        Args:
            sectionwise_text: a dictionary with keys as section names and values as text in that section
        Returns:
            adverse_event_count: a dictionary wit keys - "Adverse_Events" and values - count of adverse events OR None
        '''
        messages = self.chat_prompt.adverse_event_attribute_prompt(sectionwise_text)
        # print(messages)
        adverse_event_count = None
        try:
            adverse_event_count = self.llm.chat_completion(messages, **self.gen_config)
        except Exception as e:
            print("Adverse event exception:", e)
            pass
        return adverse_event_count


    def number_enrolled_completed_attr(self, sectionwise_text):
        '''
        Takes the sectionwise segregated text as input and returns number enrolled and completed
        Args:
            sectionwise_text: a dictionary with keys as section names and values as text in that section
        Returns:
            enrolled_completed: a dictionary containing number enrolled and number completed
        '''
        messages = self.chat_prompt.number_enrolled_completed_prompt(sectionwise_text)
        enrolled_completed = None
        try:
            enrolled_completed = self.llm.chat_completion(messages, **self.gen_config)
        except Exception as e:
            print("Number enrolled, completed Exception:", e)
            pass
        return enrolled_completed

    def instrument_attr(self, sectionwise_text):
        '''
        Takes the sectionwise segregated text as input and returns age range
        Args:
            sectionwise_text: a dictionary with keys as section names and values as text in that section
        Returns:
            ages: a dictionary containing minimum and maximum age
        '''
        messages = self.chat_prompt.instrument_prompt(sectionwise_text)
        print(messages)
        instrument = None
        try:
            instrument = self.llm.chat_completion(messages, **self.gen_config)
        except Exception as e:
            print("Instrument Exception:", e)
            pass
        return instrument

    def skin_type_attr(self, sectionwise_text):
        '''
        Takes the sectionwise segregated text as input and returns age range
        Args:
            sectionwise_text: a dictionary with keys as section names and values as text in that section
        Returns:
            ages: a dictionary containing minimum and maximum age
        '''
        messages = self.chat_prompt.skin_type_prompt(sectionwise_text)
        print(messages)
        skin_type = None
        try:
            skin_type = self.llm.chat_completion(messages, **self.gen_config)
        except Exception as e:
            print("skin_type Exception:", e)
            pass
        return skin_type

    def age_range_attr(self, sectionwise_text):
        '''
        Takes the sectionwise segregated text as input and returns age range
        Args:
            sectionwise_text: a dictionary with keys as section names and values as text in that section
        Returns:
            ages: a dictionary containing minimum and maximum age
        '''
        messages = self.chat_prompt.age_range_prompt(sectionwise_text)
        ages = None
        try:
            ages = self.llm.chat_completion(messages, **self.gen_config)
        except Exception as e:
            print("Age Range Exception:", e)
            pass
        return ages


    def ethnicity_attr(self, sectionwise_text):
        '''
        Takes the sectionwise segregated text as input and returns dict of list of ethnicities
        Args:
            sectionwise_text: a dictionary with keys as section names and values as text in that section
        Returns:
            ethnicity: a dictionary containing list of ethnicites under the key "Ethnicity"
        '''
        messages = self.chat_prompt.ethnicity_prompt(sectionwise_text)
        ethnicity = None
        try:
            ethnicity = self.llm.chat_completion(messages, **self.gen_config)
        except Exception as e:
            print("Ethnicity Exception:", e)
            pass
        return ethnicity


    def fitzpatrick_skin_lip_attr(self, sectionwise_text):
        '''
        Takes the sectionwise segregated text as input and returns dict of lists of Fitzpatrick types, skin types and lip types
        Args:
            sectionwise_text: a dictionary with keys as section names and values as text in that section
        Returns:
            fitzpatrick_skin_lip: a dictionary containing lists of Fitzpatrick types, Skin types and Lip types
        '''
        messages = self.chat_prompt.fitzpatrick_skin_lip_prompt(sectionwise_text)
        fitzpatrick_skin_lip = None
        try:
            fitzpatrick_skin_lip = self.llm.chat_completion(messages, **self.gen_config)
        except Exception as e:
            print("fitzpatrick_skin_lip Type Exception:", e)
            pass
        return fitzpatrick_skin_lip

    def instrument_attributes(self, sectionwise_text):
        instrument = self.instrument_attr(sectionwise_text)
        attrs_dict = {}
        # instrument
        if isinstance(instrument, dict):
            attrs_dict["Instrument"] = instrument.get("instrument", [])
        elif isinstance(instrument, list) or isinstance(instrument, str):
            attrs_dict["Instrument"] = instrument[:]
        else:
            attrs_dict["Instrument"] = []

        return attrs_dict

    def demographics_attributes(self, sectionwise_text):
        '''
        Takes the sectionwise segregated text as input and returns all the demographics attributes
        Args:
            sectionwise_text: a dictionary with keys as section names and values as text in that section
        Returns:
            attrs_dict: a dictionary containing all the demographic attributes
        '''
        number_enrolled_completed = self.number_enrolled_completed_attr(sectionwise_text)
        age_range = self.age_range_attr(sectionwise_text)
        ethnicity = self.ethnicity_attr(sectionwise_text)
        fitzpatrick_skin_lip = self.fitzpatrick_skin_lip_attr(sectionwise_text)
        skin_type = self.skin_type_attr(sectionwise_text)

        attrs_dict = {}

        # Number Enrolled Completed
        if isinstance(number_enrolled_completed, dict):
            attrs_dict["Number_Enrolled"] = number_enrolled_completed.get("Number_Enrolled", None)
            attrs_dict["Number_Completed"] = number_enrolled_completed.get("Number_Completed", None)
        elif isinstance(number_enrolled_completed, str):
            num_list = re.findall(r'\b\d{2,3}\b', number_enrolled_completed, re.IGNORECASE)
            num_list = list(map(int, num_list))
            attrs_dict["Number_Enrolled"] = max(num_list)
            attrs_dict["Number_Completed"] = min(num_list)
        elif isinstance(number_enrolled_completed, list):
            num_list = list(map(int, num_list))
            attrs_dict["Number_Enrolled"] = max(num_list)
            attrs_dict["Number_Completed"] = min(num_list)
        else:
            attrs_dict["Number_Enrolled"] = None
            attrs_dict["Number_Completed"] = None

        # Age Range
        if isinstance(age_range, dict):
            min_age = age_range.get("min_age", "")
            max_age = age_range.get("max_age","")
            attrs_dict["Age_Range"] = "{}-{}".format(min_age, max_age)
        elif isinstance(age_range, list):
            num_list = list(map(int, num_list))
            min_age, max_age = min(num_list), max(num_list)
            attrs_dict["Age_Range"] = "{}-{}".format(min_age, max_age)
        elif isinstance(age_range, str):
            num_list = re.findall(r'\b\d{2}\b', age_range, re.IGNORECASE)
            num_list = list(map(int, num_list))
            min_age, max_age = min(num_list), max(num_list)
            attrs_dict["Age_Range"] = "{}-{}".format(min_age, max_age)
        else:
            attrs_dict["Age_Range"] = None

        # Ethnicity
        if isinstance(ethnicity, dict):
            attrs_dict["Ethnicity"] = ethnicity.get("Ethnicity", [])
        elif isinstance(ethnicity, list) or isinstance(ethnicity, str):
            attrs_dict["Ethnicity"] = ethnicity[:]
        else:
            attrs_dict["Ethnicity"] = []



        # skin_type
        if isinstance(skin_type, dict):
            attrs_dict["Skin_Type"] = skin_type.get("Skin_Type", [])
        elif isinstance(skin_type, list) or isinstance(skin_type, str):
            attrs_dict["Skin_Type"] = skin_type[:]
        else:
            attrs_dict["Skin_Type"] = []

        # Fitzpatrick, Skin and Lip
        attrs_dict = {**attrs_dict, "Fitzpatrick_Type": [], "Skin_Tone": [], "Lip_Type": []}
        try:
            if isinstance(fitzpatrick_skin_lip, dict):
                fitz_list = list(set(fitzpatrick_skin_lip.get("Fitzpatrick_Type", [])))
                fitz_list = list( filter(lambda x: bool(re.search(r'[I-VI0-9]', x)),  fitz_list) )
                attrs_dict["Fitzpatrick_Type"] = [re.sub(r'\bFitz(?:patrick)?\b', "Type", fitz_type) for fitz_type in fitz_list]
                attrs_dict["Skin_Tone"] = list(set(fitzpatrick_skin_lip.get("Skin_Tone", [])))
                attrs_dict["Lip_Type"] = list(set(fitzpatrick_skin_lip.get("Lip_Type", [])))
            elif isinstance(fitzpatrick_skin_lip, list):
                fitz_list = list( filter(lambda x: bool(re.search(r'[I-VI0-9]', x)),  fitzpatrick_skin_lip) )
                attrs_dict["Fitzpatrick_Type"] = [re.sub(r'\bFitz(?:patrick)?\b', "Type", fitz_type) for fitz_type in fitz_list]
            elif isinstance(fitzpatrick_skin_lip, str):
                # r'\b(?:Type\s*[IVX]{1,3}|Fitz(?:patrick)?\s*(?:[IVX]{1,3}|[1-6](?:/[1-6])?))\b'
                # fitz_list = list(set(re.findall(r"[I-VI0-6]", fitzpatrick_skin_lip, re.IGNORECASE)))
                # attrs_dict["Fitzpatrick_Type"] = [re.sub(r'\bFitz(?:patrick)?\b', "Type", fitz_type) for fitz_type in fitz_list]
                if bool(re.search(r'\[(.*)(\])?', fitzpatrick_skin_lip)):
                    fitz_list = list(set([fitz.strip() for fitz in re.search(r'\[(.*)(\])?', fitzpatrick_skin_lip).group(1).split(",") if fitz]))
                    fitz_list = [re.sub(r'"', '', fitz) for fitz in fitz_list]
                    attrs_dict["Fitzpatrick_Type"]  = list( filter(lambda x: bool(re.search(r'[I-VI0-9]', x)),  fitz_list) )
            else:
                pass
        except Exception as e:
            print("Exception during fitzpatrick_skin_lip response parsing")
        return attrs_dict



    def table_param_y_coord(self, pdf):
        '''
        Finds the page number and top left y-coordinate of results tables:
        Args:
            pdf: input json containing tables
        Returns:
            lst: a list of tuples
        '''
        table_indices = self._get_results_tables_with_indices(pdf["tables"])
        lst = []
        for ix in table_indices:
            table = pdf["tables"][ix]
            lst.append((None, table["bounding_regions"][0]["page_number"], table["bounding_regions"][0]["polygon"][0]["y"]))
        return lst


    def regex_content_search(self, page_paragraphs, regex_pattern_1, regex_pattern_2, regex_pattern_3, y0,page_num):
        '''
        Searches for heading paragraphs with the given regex patterns (case-insensitive)
        Args:
            page_paragraphs: A list of paragraph objects (from Azure Document Intelligence output json)
            regex_pattern_1: pattern a paragraph must match to be considered at all
            regex_pattern_2: pattern that accepts the paragraph itself as a heading
            regex_pattern_3: pattern applied to the other paragraphs that start at the same
                y-coordinate when regex_pattern_2 did not match
            y0: top left y-coordinate of the table
            page_num: page number of the table
        Returns:
            res_sec: A list of tuples (paragraph string, page number, top left y-coordinate of paragraph)
        '''
        def _first_region(paragraph):
            return paragraph["bounding_regions"][0]

        def _corner_coords(paragraph):
            # x and y coordinates of all polygon corners, used to recognise the same box
            polygon = _first_region(paragraph)["polygon"]
            return [corner['x'] for corner in polygon], [corner['y'] for corner in polygon]

        res_sec = []
        for para in page_paragraphs:
            if not re.search(regex_pattern_1, para["content"], re.IGNORECASE):
                continue
            region = _first_region(para)
            para_y = region["polygon"][0]["y"]
            # Only paragraphs above the table (or on an earlier page) qualify
            if not (para_y <= y0 or region['page_number'] < page_num):
                continue
            if re.search(regex_pattern_2, para["content"], re.IGNORECASE):
                res_sec.append((para["content"], region["page_number"], para_y))
                continue
            # The number and the title may have been split into two paragraphs on the
            # same line: look for a sibling that starts at the same y-coordinate.
            para_coords = _corner_coords(para)
            for para2 in page_paragraphs:
                if _corner_coords(para2) == para_coords:
                    # same bounding box -> this is the paragraph itself
                    continue
                region2 = _first_region(para2)
                other_y = region2["polygon"][0]["y"]
                if other_y == para_y and re.search(regex_pattern_3, para2['content'], re.IGNORECASE):
                    res_sec.append((para2["content"], region2["page_number"], other_y))
        return res_sec


    def get_section_heading_y_coords(self, paragraphs, page_num, y0_table, table_page_num):
        '''
        Gets a list of possible paragraphs which are section or subsection headings of results table
        Args:
            paragraphs: A list of paragrpah objects (from Azure Document Intelligence output json)
            page_num: page number in which to perform the search
            y0_table: top left y-coordinate of the table
            page_num_table: page number of the table
        Returns:
            list of possible headings for the table
        '''
        parameters_paragraphs = [para for para in paragraphs if para["bounding_regions"][0]["page_number"] == page_num]

        sec_flag = False
        site_flag = False
        sub_site_flag = False

        res_sec = self.regex_content_search(parameters_paragraphs, r"\b\d+\.0\b", r"\b\d+\.0\b.*?RESULTS", r"Results", y0_table, table_page_num)
        if len(res_sec)>0:
            sec_flag = True

        sites = self.regex_content_search(parameters_paragraphs, r"\b\d+\.[1-9]\b",
                                    r"\b\d+\.[1-9]\b.*?(skin|dermal|ocular) evaluation",
                                    r"(skin|dermal|ocular) evaluation", y0_table, table_page_num)
        if len(sites)>0:
            site_flag = True

        sub_sites = self.regex_content_search(parameters_paragraphs, r"\b\d+\.[1-9]\.[1-9]\b",
                                        r"\b\d+\.[1-9]\.[1-9]\b.*?[a-z]+",
                                        r"[a-z]+", y0_table, table_page_num)
        if len(sub_sites)>0:
            sub_site_flag = True

        if not (sec_flag or site_flag or sub_site_flag):
            return self.get_section_heading_y_coords(paragraphs, page_num-1, y0_table, table_page_num)
        else:
            if sub_site_flag:
                return sub_sites
            elif site_flag:
                return sites
            else:
                return None


    def sort_page_numbers_and_ordinates(self, arr, parameter_tup):
        """
        Drops the tuples located below the table and sorts the remaining ones by
        page number and y-coordinate (2nd and 3rd value) in ascending order.
        Args:
            arr: list of tuples (content, page number, y-coordinate); not modified
            parameter_tup: tuple (content, page number, y-coordinate) describing the table
        Returns:
            A new sorted list, or None when arr is empty or None
        """
        if not arr:
            return None
        table_page, table_y = parameter_tup[1], parameter_tup[2]
        # Keep a tuple when it sits on an earlier page than the table, or when it does
        # not lie below the table's top edge. Filtering into a new list (instead of
        # popping while iterating) makes sure every element is actually examined.
        kept = [tup for tup in arr if table_page > tup[1] or not (table_y < tup[2])]
        return sorted(kept, key=lambda tup: (tup[1], tup[2]))


    def extract_test_sites(self, pdf):
        '''
        Function to extract the test sites for results tables
        '''
        param_y_coords = self.table_param_y_coord(pdf)
        test_sites = []
        for i, (content, page_num, y0) in enumerate(param_y_coords):
            tup = (content, page_num, y0)
            arr = self.get_section_heading_y_coords(pdf["paragraphs"], page_num, y0, page_num)
            possible_list = self.sort_page_numbers_and_ordinates(arr, tup)
            if possible_list:
                #print(possible_list)
                possible_test_site = possible_list[-1][0]
                site_str = re.sub(r"[^a-zA-Z\s]", "", possible_test_site).strip()
                if bool(re.search(r".*?evaluation", site_str, re.IGNORECASE)):
                    if bool(re.search(r"(skin|dermal|ocular).*?evaluation.*?of", site_str, re.IGNORECASE)):
                        test_site = site_str.split(" ")[-1]
                    else:
                        test_site = re.search(r"(skin|dermal|ocular|eye area|lip).*?evaluation", site_str, re.IGNORECASE).group(1) if bool(re.search(r"(skin|dermal|ocular|eye area|lip).*?evaluation", site_str, re.IGNORECASE)) else None
                        #print(test_site)
                    test_sites.append(test_site)
                else:
                    test_site = site_str
                    test_sites.append(test_site)
        return test_sites



In [None]:
import json
from time import perf_counter
from promptflow.tools.common import render_jinja_template, handle_openai_error, parse_chat, to_bool, \
    validate_functions, process_function_call, post_process_chat_api_response, init_azure_openai_client

# Avoid circular dependencies: Use import 'from promptflow._internal' instead of 'from promptflow'
# since the code here is in promptflow namespace as well
from promptflow._internal import enable_cache, ToolProvider, tool
from promptflow.connections import AzureOpenAIConnection
from promptflow.contracts.types import PromptTemplate


class AzureOpenAI:
    def __init__(self, connection: AzureOpenAIConnection):
        """Build the Azure OpenAI client for `connection` and keep the connection itself."""
        self._client = init_azure_openai_client(connection)
        self.connection = connection

    def calculate_cache_string_for_completion(self, **kwargs,) -> str:
        """Return the JSON string used as cache key for a completion call.

        The key is built from the connection's items plus the call's keyword
        arguments. The API key is left out so the secret never ends up in the
        cache key; connections that carry no "api_key" entry are accepted too.
        """
        d = dict(self.connection)
        # Default of None: a connection without an api_key must not raise KeyError here
        d.pop("api_key", None)
        d.update(kwargs)
        return json.dumps(d)

    @handle_openai_error()
    @enable_cache(calculate_cache_string_for_completion)
    def completion(
        self,
        prompt: PromptTemplate,
        # for AOAI, deployment name is customized by user, not model name.
        deployment_name: str,
        suffix: str = None,
        max_tokens: int = 16,
        temperature: float = 1.0,
        top_p: float = 1.0,
        n: int = 1,
        # stream is a hidden to the end user, it is only supposed to be set by the executor.
        stream: bool = False,
        logprobs: int = None,
        echo: bool = False,
        stop: list = None,
        presence_penalty: float = 0,
        frequency_penalty: float = 0,
        best_of: int = 1,
        logit_bias: dict = {},
        user: str = "",
        **kwargs,
    ) -> str:
        """Render `prompt` with `kwargs` and send it to the completions endpoint.

        Numeric and boolean arguments are coerced before the call because they may
        arrive as strings. Empty `suffix`, `logprobs` and `stop` values are sent as
        None, an empty `logit_bias` as {}.

        Returns the text of the first choice, or, when `stream` is true, a generator
        yielding the text of the first choice of every chunk ("" when a chunk has no
        text).
        """
        rendered_prompt = render_jinja_template(prompt, trim_blocks=True, keep_trailing_newline=True, **kwargs)
        # TODO: remove below type conversion after client can pass json rather than string.
        echo = to_bool(echo)
        stream = to_bool(stream)
        request = {
            "prompt": rendered_prompt,
            "model": deployment_name,
            # empty string suffix should be treated as None.
            "suffix": suffix or None,
            "max_tokens": int(max_tokens),
            "temperature": float(temperature),
            "top_p": float(top_p),
            "n": int(n),
            "stream": stream,
            # empty string is falsy as well; the api only accepts None there.
            "logprobs": int(logprobs) if logprobs else None,
            "echo": echo,
            # fix bug "[] is not valid under any of the given schemas-'stop'"
            "stop": stop or None,
            "presence_penalty": float(presence_penalty),
            "frequency_penalty": float(frequency_penalty),
            "best_of": int(best_of),
            # Logit bias must be a dict if we passed it to openai api.
            "logit_bias": logit_bias or {},
            "user": user,
            "extra_headers": {"ms-azure-ai-promptflow-called-from": "aoai-tool"},
        }
        response = self._client.completions.create(**request)

        if not stream:
            # get first element because prompt is single.
            return response.choices[0].text

        def stream_text():
            for chunk in response:
                if not chunk.choices:
                    continue
                text = getattr(chunk.choices[0], 'text', None)
                yield "" if text is None else text

        # The generator object is returned (instead of yielding here) so that this
        # method itself stays a plain function when stream is False.
        return stream_text()

    @handle_openai_error()
    def chat(
        self,
        chat_str: str, # prompt: PromptTemplate,
        # for AOAI, deployment name is customized by user, not model name.
        deployment_name: str,
        temperature: float = 1.0,
        top_p: float = 1.0,
        n: int = 1,
        # stream is hidden to the end user, it is only supposed to be set by the executor.
        stream: bool = False,
        stop: list = None,
        max_tokens: int = None,
        presence_penalty: float = 0,
        frequency_penalty: float = 0,
        logit_bias: dict = {},
        user: str = "",
        # function_call can be of type str or dict.
        function_call: object = None,
        functions: list = None,
        response_format: object = None,
        seed: int = None,
        # **kwargs,
    ) -> [str, dict]:
        """Parse `chat_str` into messages and send them to the chat completions endpoint.

        `chat_str` is expected to be an already rendered chat prompt (no template
        rendering happens here). Optional request fields (`stop`, `max_tokens`,
        `logit_bias`, `response_format`, `seed`) are only sent when they carry a
        value; a `max_tokens` of "inf" is treated as "not set".

        Returns the completion object from the client as-is (it is not post-processed).
        """
        messages = parse_chat(chat_str)
        # TODO: remove below type conversion after client can pass json rather than string.
        stream = to_bool(stream)
        params = dict(
            model=deployment_name,
            messages=messages,
            temperature=float(temperature),
            top_p=float(top_p),
            n=int(n),
            stream=stream,
            presence_penalty=float(presence_penalty),
            frequency_penalty=float(frequency_penalty),
            user=user,
            extra_headers={"ms-azure-ai-promptflow-called-from": "aoai-tool"},
        )
        if functions is not None:
            validate_functions(functions)
            params["functions"] = functions
            params["function_call"] = process_function_call(function_call)

        # to avoid vision model validation error for empty param values.
        if stop:
            params["stop"] = stop
        token_cap_requested = max_tokens is not None and str(max_tokens).lower() != "inf"
        if token_cap_requested:
            params["max_tokens"] = int(max_tokens)
        for key, value in (("logit_bias", logit_bias), ("response_format", response_format)):
            if value:
                params[key] = value
        if seed is not None:
            params["seed"] = seed

        return self._client.chat.completions.create(**params)


# register_apis(AzureOpenAI)


@tool
def chat(
    connection: AzureOpenAIConnection,
    chat_str: str, # prompt: PromptTemplate,
    deployment_name: str,
    max_tokens: int
) -> str:
    """Run one chat completion with fixed generation settings and report usage.

    Only `max_tokens` is configurable; every other generation parameter is pinned
    below (temperature 0.0, a single non-streamed choice, plain-text response format).

    Returns a dict (despite the `str` annotation) with the keys "response" (content
    of the first choice), "token_stats" (prompt/completion/total token counts) and
    "openai_exec_time" (seconds spent creating the client and calling the service).
    """
    started = perf_counter()
    completion = AzureOpenAI(connection).chat(
        chat_str=chat_str,
        deployment_name=deployment_name,
        temperature=0.0,
        top_p=1.0,
        n=1,
        stream=False,
        stop=None,
        max_tokens=max_tokens,
        presence_penalty=0.0,
        frequency_penalty=0.0,
        logit_bias={},
        user="",
        function_call=None,
        functions=None,
        response_format={"type": "text"},
        seed=None,
    )
    elapsed = perf_counter() - started

    response_text = completion.choices[0].message.content
    usage = completion.usage
    return {
        "response": response_text,
        "token_stats": {
            "prompt_tokens": usage.prompt_tokens,
            "completion_tokens": usage.completion_tokens,
            "total_tokens": usage.total_tokens,
        },
        "openai_exec_time": elapsed,
    }


In [None]:
from promptflow.core import tool
from promptflow.connections import AzureOpenAIConnection
from promptflow.connections import CognitiveSearchConnection
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
import requests
from azure.core.credentials import AzureKeyCredential
import pdb


def get_query_embedding(query, endpoint, api_key, api_version, embedding_model_deployment, timeout=None):
    """Request embeddings for `query` from an Azure OpenAI embeddings deployment.

    Args:
        query: text (or list of texts) sent as the "input" field of the request
        endpoint: base URL of the Azure OpenAI resource; a trailing "/" is tolerated
        api_key: key sent in the "api-key" header
        api_version: value of the "api-version" query parameter
        embedding_model_deployment: name of the embeddings deployment
        timeout: timeout forwarded to requests.post; None (the default) waits
            indefinitely, as before

    Returns:
        A list with one embedding vector per entry of the response's "data" list.

    Raises:
        Exception: when the service does not answer with HTTP 200. The message carries
            the JSON error body, or the raw response text when the body is not JSON.
    """
    request_url = (
        f"{endpoint.rstrip('/')}/openai/deployments/{embedding_model_deployment}"
        f"/embeddings?api-version={api_version}"
    )
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }
    embedding_response = requests.post(request_url, json={'input': query}, headers=headers, timeout=timeout)
    if embedding_response.status_code != 200:
        # Gateways and proxies often answer with HTML or plain text; decoding that as
        # JSON would raise and hide the real failure.
        try:
            detail = embedding_response.json()
        except ValueError:
            detail = embedding_response.text
        raise Exception(f"failed to get embedding: {detail}")
    return [data_value["embedding"] for data_value in embedding_response.json()["data"]]


@tool
def acs_retriever(
    queries: str,
    searchConnection: CognitiveSearchConnection,
    indexName: str,
    topK: int,
    embeddingModelConnection: AzureOpenAIConnection,
    embeddingModelName: str,
    vectorColName: str,
    searchType: str,
    semantic_config_name: str,
    query_type: str,
    is_greeting: int
):
    """Retrieve documents from an Azure AI Search index.

    Args:
        queries: the user question; a "?" is appended when missing
        searchConnection: connection providing "api_base" and "api_key" of the search service
        indexName: name of the index to query
        topK: number of results requested
        embeddingModelConnection: connection providing "api_base", "api_key" and
            "api_version" of the embeddings endpoint
        embeddingModelName: name of the embeddings deployment
        vectorColName: vector field(s) of the index to search
        searchType: one of "filter", "vector", "hybrid", "filter_vector",
            "filter_hybrid" (case-insensitive); None/empty is treated as "vector"
        semantic_config_name: semantic configuration, used for "hybrid" only
        query_type: query type, used for "hybrid" only
        is_greeting: truthy value short-circuits the retrieval

    Returns:
        [] for greetings, otherwise the list of results returned by the search client.

    Raises:
        ValueError: for an unknown searchType.
    """
    if int(is_greeting):
        return []

    if not queries.endswith("?"):
        queries += "?"

    search_client = SearchClient(
        endpoint=searchConnection["api_base"],
        index_name=indexName,
        credential=AzureKeyCredential(searchConnection["api_key"]),
    )

    # A missing search type means plain vector search; normalising before .lower()
    # avoids an AttributeError on None.
    searchType = (searchType or "vector").lower()
    if searchType not in ("filter", "vector", "hybrid", "filter_vector", "filter_hybrid"):
        raise ValueError('Please choose valid searchType from: ["filter", "vector", "hybrid", "filter_vector", "filter_hybrid"]')

    # Filtering is currently disabled: no filter columns are passed to this tool, so
    # the "filter*" search types run without an actual filter expression.
    filter_str = None

    vector_queries = None
    if searchType != "filter":
        queryEmbedding = get_query_embedding(
            query=queries,
            endpoint=embeddingModelConnection["api_base"],
            api_key=embeddingModelConnection["api_key"],
            api_version=embeddingModelConnection["api_version"],
            embedding_model_deployment=embeddingModelName
        )[0]
        # NOTE(review): the keyword is spelled `weights` here; the SDK's documented
        # VectorizedQuery keyword appears to be `weight` - confirm the value is applied.
        vector_queries = [
            VectorizedQuery(vector=queryEmbedding, k_nearest_neighbors=50, fields=vectorColName, weights=0.65)
        ]

    search_kwargs = {
        "search_text": queries if searchType in ("hybrid", "filter_hybrid") else None,
        "vector_queries": vector_queries,
        "filter": filter_str if searchType.startswith("filter") else None,
        "top": topK,
    }
    if searchType == "hybrid":
        # Semantic ranking and field selection are only configured for plain hybrid search
        search_kwargs.update(
            query_type=query_type,
            semantic_configuration_name=semantic_config_name,
            select=['blob_location', 'context'],
        )

    results = search_client.search(**search_kwargs)
    return list(results)


In [None]:
import os
import json
from promptflow.core import tool


# def estimate_tokens(text: str) -> int:
#   return (len(text) + 2) / 3


@tool
def format_retrieved_documents(docs: list, maxTokens: int) -> str:
    """Serialise the retrieved documents into the JSON string handed to the prompt.

    Args:
        docs: retrieved documents; each needs the keys 'blob_location' and "context"
        maxTokens: currently unused (the token-limit check is disabled); kept for
            interface compatibility

    Returns:
        JSON of the form {"retrieved_documents": [{"[doc0]": {"title": ..., "content": ...}}, ...]}.
        The title is the last path segment of 'blob_location' with "%20" replaced by a
        space. An empty input yields an empty list.
    """
    formatted_docs = [
        {
            f"[doc{index}]": {
                "title": doc['blob_location'].split("/")[-1].replace(r"%20", " "),
                "content": doc["context"],
            }
        }
        for index, doc in enumerate(docs)
    ]
    # Serialise once at the end instead of after every appended document
    return json.dumps({"retrieved_documents": formatted_docs})


In [None]:
import json
from promptflow.core import tool


def format_turn(speaker: str, message: str) -> str:
    """Render one conversation turn as "<speaker>:", the message, and a trailing newline."""
    return "{}:\n{}\n".format(speaker, message)


def estimate_tokens(text: str) -> int:
    """Estimate the token count of `text` as one token per three characters, rounded up.

    Integer division keeps the result an int, as the annotation promises.
    """
    return (len(text) + 2) // 3


@tool
def format_conversation(history: list, maxTokens: int) -> str:
    """Format the most recent part of the chat history that fits into `maxTokens`.

    Every history item contributes a "user" turn (item["inputs"]["query"]) followed by
    an "assistant" turn (item["outputs"]["response"]). Turns are added from newest to
    oldest and the walk stops at the first turn that would push the estimated token
    count above `maxTokens`; the result keeps chronological order.
    """
    turns = []
    for item in history:
        turns.append(("user", item["inputs"]["query"]))
        turns.append(("assistant", item["outputs"]["response"]))

    transcript = ""
    for speaker, message in turns[::-1]:
        candidate = format_turn(speaker, message) + transcript
        if estimate_tokens(candidate) > maxTokens:
            break
        transcript = candidate
    return transcript


In [None]:
from promptflow.core import tool
import re
import json


@tool
def add_citations(docs: list, llm_response: str) -> dict:
    """Strip ``[docN]`` citation markers from an LLM answer and list the cited documents.

    Markers of the form ``[doc<N>]`` or ``[doc+<N>]`` are removed from the text.
    Each distinct cited document (deduplicated by its location, in order of first
    appearance) is reported with a display filename and an HTML anchor.

    Args:
        docs: Retrieved documents; every item must provide ``blob_location``.
            ``N`` in a marker is the position of the document in this list.
        llm_response: The model output that may contain citation markers.

    Returns:
        A dict with:
          * ``chat_output``: ``llm_response`` with all citation markers removed.
          * ``documents``: list of ``{"filename": ..., "link": ...}`` entries,
            where ``filename`` is the last ``/``-separated segment of the location
            with ``%20`` replaced by a space and ``link`` is
            ``<a href="<location>"><filename></a>``.

    Raises:
        KeyError: If a document lacks ``blob_location``.
    """
    citation_pattern = re.compile(r'\[doc\+?(\d+)\]')
    doc_urls = [doc['blob_location'] for doc in docs]

    documents = []
    seen_paths = set()
    for match in citation_pattern.finditer(llm_response):
        doc_index = int(match.group(1))
        # Resolve the location *before* any lookup so a citation that points
        # past the end of ``docs`` falls back to '#' instead of raising
        # IndexError.
        filepath = doc_urls[doc_index] if doc_index < len(doc_urls) else "#"
        if filepath in seen_paths:
            continue
        seen_paths.add(filepath)

        filename = filepath.split("/")[-1].replace("%20", " ")
        # NOTE(review): filepath/filename are interpolated into HTML unescaped;
        # confirm blob_location values are trusted.
        documents.append({
            "filename": filename,
            "link": f'<a href="{filepath}">{filename}</a>',
        })

    return {
        # Remove every marker with the same pattern that detected it, so
        # variants such as zero-padded indices are stripped as well.
        "chat_output": citation_pattern.sub("", llm_response),
        "documents": documents,
    }

In [None]:
# system:
# You are an assistant that reformulates questions into standalone questions when necessary.

# - If the user provides a greeting (e.g., "hi", "hello", "how are you?", "how's your day going?", "nice to meet you"), expresses gratitude (e.g., "thank you", "thanks a lot", "appreciate it"), or uses an acknowledgment phrase (e.g., "okay", "sure", "got it", "makes sense"), then:
#     * Ignore the chat history.
#     * Do not reformulate the message or treat it as a query; return it as-is.
# - If the user's follow up question is independent of the chat history, return it as-is.
# - If the user's follow up question depends on prior context, rephrase it into a standalone question that includes the relevant details from the chat history.
# - If the user's follow-up question is a combination of two or more questions, rephrase it so that the standalone question represents all of the questions in the follow-up question, while still observing the points above.
# - Map the following terms or any other synonymous terms (if encountered in the follow up question) to `MET Owner`:
#     - Material Portfolio Owner
#     - MET Leader
#     - MET Owner
#     **Note**: if you find the term `portfolio` at the end of the question then remove it.


# Do not add more details than necessary to the standalone question.
# Your response format should be as below:
# {"rewritten_query": reformulated query here or the greeting, "is_greeting": 1 if the user provided a greeting, else 0}

# ## Chat history:
# {% for item in chat_history %}
# user:
# {{ item.inputs.query}}
# assistant:
# {{ item.outputs.response }}
# {% endfor %}

# ## Follow up Question:
# {{ query }}

# ## Your Response in JSON format: