In [None]:
from openai import AzureOpenAI
import pandas as pd
import numpy as np
import os
import rispy
import time
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason
import vertexai.preview.generative_models as generative_models
from anthropic import AnthropicVertex
from tqdm.notebook import tqdm
import replicate

In [None]:
# Credentials are read from environment variables when they are set; the
# placeholder literals are kept only as demonstration fallbacks.
def set_up_api(service):
    """Initialise the SDK for one LLM provider and return its handle.

    Parameters
    ----------
    service : str
        One of "openai", "anthropic-vertex", "google-vertex" or
        "meta-replicate".

    Returns
    -------
    object or None
        The ``AzureOpenAI`` / ``AnthropicVertex`` client, the Vertex AI
        ``GenerativeModel``, or ``None`` for "meta-replicate" (which only
        needs the token in the environment).

    Raises
    ------
    ValueError
        If ``service`` is not one of the supported names.

    Notes
    -----
    For the two client-based services the client is also published as the
    module-level name ``client``, because ``evaluateLLM`` looks that name up
    as a global.
    """
    global client

    if service == "openai":
        client = AzureOpenAI(
            api_key=os.environ.get("AZURE_OPENAI_API_KEY", "API_KEY"),
            api_version=os.environ.get("OPENAI_API_VERSION", "XXXX-XX-XX"),
            azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "https://X.openai.azure.com/"),
        )
        return client

    if service == "anthropic-vertex":
        client = AnthropicVertex(region="REGION", project_id="PROJECT_ID")
        return client

    if service == "google-vertex":
        # The notebook's imports only bind names from vertexai submodules, so
        # the top-level package has to be imported here before calling init().
        import vertexai
        vertexai.init(project="PROJECT_ID", location="REGION")
        return GenerativeModel("MODEL_NAME")

    if service == "meta-replicate":
        # setdefault keeps a real token that is already exported instead of
        # overwriting it with the placeholder.
        os.environ.setdefault('REPLICATE_API_TOKEN', "REPLICATE_API_TOKEN")
        return None

    raise ValueError(f"Unknown service: {service!r}")


In [None]:
#Parse RIS files and standardise column names
def parse_ris_to_df(file_path):
    """Load a RIS file and return a frame with exactly 'title' and 'abstract'.

    Parameters
    ----------
    file_path : str or path-like
        Location of the RIS file; it is opened in text mode and handed to
        ``rispy.load``.

    Returns
    -------
    pandas.DataFrame
        One row per RIS entry with the columns ``['title', 'abstract']``.
        A column that has no source field in the file is filled with NaN,
        and an empty file yields an empty frame that still has both columns.
    """
    with open(file_path, 'r') as file:
        entries = rispy.load(file)
    df = pd.DataFrame(entries)

    # The two layouts the notebook originally recognised are kept first so
    # their results are unchanged.
    if 'title' in df.columns and 'abstract' in df.columns:
        return df[['title', 'abstract']].copy()
    if 'primary_title' in df.columns and 'notes_abstract' in df.columns:
        df = df[['primary_title', 'notes_abstract']].copy()
        df.columns = ['title', 'abstract']
        return df

    # Fallback for mixed or partial layouts (e.g. 'primary_title' together
    # with 'abstract', or a file in which no entry carries an abstract):
    # take whichever candidate column exists, otherwise fill with NaN.
    title_col = next((c for c in ('title', 'primary_title') if c in df.columns), None)
    abstract_col = next((c for c in ('abstract', 'notes_abstract') if c in df.columns), None)
    standardised = pd.DataFrame(index=df.index)
    standardised['title'] = df[title_col] if title_col is not None else np.nan
    standardised['abstract'] = df[abstract_col] if abstract_col is not None else np.nan
    return standardised

In [None]:
ris_dir = "RIS_DIRECTORY_PATH"
#Forming inclusion list of RIS files
SR_list = ["bellon", "buchan", "clezar", "cutting", "dopper", "ghoraba", "hjetland", "karkou", "lin", "lynch", "malik", "mohamed", "roy", "santos", "setthawong", "sevaux", "singh_glasses", "singh_pemphigoid", "sulewski", "sulistyo", "white", "younis", "zhu"]
# Every review has two files: "<name>.ris" (all screened records) and
# "<name>-includes.ris" (the records that were finally included).
SR_all = [f"{name}.ris" for name in SR_list]
SR_includes = [f"{name}-includes.ris" for name in SR_list]
# Full paths consumed by the loading loops below.
paths_all = [os.path.join(ris_dir, file) for file in SR_all]
paths_includes = [os.path.join(ris_dir, file) for file in SR_includes]

# Report any missing file up front, before parsing starts.
for file, file_path in zip(SR_all + SR_includes, paths_all + paths_includes):
    if not os.path.exists(file_path):
        print(f"{file} is missing")

# review name -> all screened records, with empty bookkeeping columns
review_dict = {}
for review, path in zip(SR_list, paths_all):
    df = parse_ris_to_df(path)
    df['truth'] = ''
    df['decision'] = ''
    df['output'] = ''
    review_dict[review] = df

# review name -> records that were included in the published review
includes_dict = {}
for review, path in zip(SR_list, paths_includes):
    df = parse_ris_to_df(path)
    includes_dict[review] = df

In [None]:
#Dataset curation
#Missing abstract analysis
missing_abstract_percentages = {}
for review, df in review_dict.items():
    # Assign the result back to the column: calling replace(..., inplace=True)
    # on df[col] is a chained in-place update, which does not modify df when
    # pandas Copy-on-Write is active.
    for column in ('abstract', 'decision', 'output'):
        df[column] = df[column].replace("", np.nan)
    total_titles = len(df)
    print(review)
    print(total_titles)
    missing_abstracts = df['abstract'].isnull().sum()
    print(missing_abstracts)
    # A review with no records has no meaningful percentage.
    if total_titles:
        missing_abstract_percentage = (missing_abstracts / total_titles) * 100
    else:
        missing_abstract_percentage = np.nan
    missing_abstract_percentages[review] = missing_abstract_percentage
print(missing_abstract_percentages)

# Number of included records per review, and their overall total in x.
truth_dict = {}
x=0
for review,df in includes_dict.items():
    truth_dict[review] = df.shape[0]
    x+=df.shape[0]
truth_dict

In [None]:
# Systematic review titles, keyed by the short review name used in SR_list
title_dict = {
    "bellon": "Perioperative glycaemic control for people with diabetes undergoing surgery",
    "buchan": "Medically assisted hydration for adults receiving palliative care",
    "clezar": "Pharmacological interventions for asymptomatic carotid stenosis",
    "cutting": "Intracytoplasmic sperm injection versus conventional in vitro fertilisation in couples with males presenting with normal total sperm count and motility",
    "dopper": "High flow nasal cannula for respiratory support in term infants",
    "ghoraba": "Pars plana vitrectomy with internal limiting membrane flap versus pars plana vitrectomy with conventional internal limiting membrane peeling for large macular hole",
    "hjetland": "Vocabulary interventions for second language (L2) learners up to six years of age",
    "karkou": "Dance movement therapy for dementia",
    "lin": "Hyperbaric oxygen therapy for late radiation tissue injury",
    "lynch": "Interventions for the uptake of evidence‐based recommendations in acute stroke settings",
    "malik": "Fibrin‐based haemostatic agents for reducing blood loss in adult liver resection",
    "mohamed": "Prostaglandins for adult liver transplanted recipients",
    "roy": "Interventions for chronic kidney disease in people with sickle cell disease",
    "santos": "Prophylactic anticoagulants for non‐hospitalised people with COVID‐19",
    "setthawong": "Extracorporeal shock wave lithotripsy (ESWL) versus percutaneous nephrolithotomy (PCNL) or retrograde intrarenal surgery (RIRS) for kidney stones",
    "sevaux": "Paracetamol (acetaminophen) or non‐steroidal anti‐inflammatory drugs, alone or combined, for pain relief in acute otitis media in children",
    "singh_glasses": "Blue‐light filtering spectacle lenses for visual performance, sleep, and macular health in adults",
    "singh_pemphigoid": "Interventions for bullous pemphigoid",
    "sulewski": "Topical ophthalmic anesthetics for corneal abrasions",
    "sulistyo": "Enteral tube feeding for amyotrophic lateral sclerosis/motor neuron disease",
    "white": "Oxygenation during the apnoeic phase preceding intubation in adults in prehospital, emergency department, intensive care and operating theatre environments",
    "younis": "Hydrogel dressings for donor sites of split‐thickness skin grafts",
    "zhu": "Expanded polytetrafluoroethylene (ePTFE)‐covered stents versus bare stents for transjugular intrahepatic portosystemic shunt in people with liver cirrhosis",
}
# Inclusion criteria for each systematic review
# Each value lists the four PICOS fields (Population, Intervention, Control,
# Study design), one per line. The explicit "\n" matters: adjacent string
# literals are concatenated as-is, so without it the fields run together
# (e.g. "...perioperativeIntervention: ...") in the prompt.
incl_criteria_dict = {
    'bellon': (
        "Population: Any person with type 1 or type 2 diabetes mellitus, who is perioperative\n"
        "Intervention: perioperative glycaemic control\n"
        "Control: usual care\n"
        "Study design: Randomised control trials"
    ),
    'buchan': (
        "Population: Adults age 18 years and older receiving palliative care (i.e. with a limited prognosis) in any setting (e.g. home, hospice, hospital)\n"
        "Intervention: medically assisted hydration (non-nutritional fluids, via subcutaneous, venous system, or enterally)\n"
        "Control: placebo or standard care\n"
        "Study design: Randomised control trials"
    ),
    'clezar': (
        "Population: all persons with asymptomatic carotid stenosis (narrowing of the internal and/or common carotid artery)\n"
        "Intervention: a single or combination of pharmacological interventions, including anticoagulant, antiplatelet, antihypertensive, glycaemic-lowering and/or lipid-lowering drugs\n"
        "Control: placebo, no treatment, or another pharmacological intervention\n"
        "Study design: all randomised control trials, with parallel or cross-over design"
    ),
    'cutting': (
        "Population: couples with males presenting with normal total sperm count and motility\n"
        "Intervention: intracytoplasmic sperm injection\n"
        "Control: conventional in-vitro fertilisation\n"
        "Study design: Randomised control trials"
    ),
    'dopper': (
        "Population: infants greater or equal to 37 weeks gestational age, up to neonatal period (one month post-natal)\n"
        "Intervention: high flow nasal cannula oxygen therapy (flow rates greater than 2L/min)\n"
        "Control: continuous positive airway pressure, low flow nasal cannula oxygen therapy (flow rates less than 2L/min),\n"
        "Study design: Prospective randomised control trials"
    ),
    'ghoraba': (
        "Population: people with large macular holes, regardless of cause and lens status\n"
        "Intervention: pars plana vitrectomy with inverted internal limiting membrane flap technique\n"
        "Control: pars plana vitrectomy with conventional internal limiting membrane peeling\n"
        "Study design: Randomised control trials"
    ),
    'hjetland': (
        "Population: children aged 5 years and 11 months or younger at pretest, including those with developmental language disorders\n"
        "Intervention: any vocabulary intervention aimed to enhance second language vocabulary skills in any educational setting\n"
        "Control: standard care or inactive control conditions\n"
        "Study design: Randomised control trials, including cluster or individual randomisations"
    ),
    'karkou': (
        "Population: people formally diagnosed as having any type of dementia of any severity, according to ICD-11, or DSM-5 or other diagnostic criteria, in all age groups and settings\n"
        "Intervention: any dance therapy delivered by dance movement therapists with therapeutic intent to groups, individuals, or families\n"
        "Control: no treatment, standard care, other psychological therapies, pharmacological interventions, or any other therapy (including different types of dance movement therapy)\n"
        "Study design: Randomised control trials, including cross-over designs and cluster-RCTs, in any language"
    ),
    'lin': (
        "Population: Any person with late radiation tissue injury (including necrosis) of any tissue type. Also anyone treated with large-dose radiotherapy likely to induce early necrosis.\n"
        "Intervention: any treatment regimens including hyperbaric oxygen therapy\n"
        "Control: no treatment, sham treatment, or treatment regimens without hyperbaric oxygen therapy\n"
        "Study design: Randomised trials, including RCTs and quasi-RCTs"
    ),
    'lynch': (
        "Population: Care provided by any healthcare professional working on a stroke unit, and/or for patients admitted with acute stroke, within 7 days of onset (including mixed diagnostic groups).\n"
        "Intervention: any single or multifaceted intervention aimed at enhancing adherence to evidence-based recommendations. This includes delivery arrangements, financial arrangements, governance arrangements as defined by EPOC taxonomy\n"
        "Control: no intervention/standard care, single active control intervention, multifaceted intervention\n"
        "Study design: Randomised trials or cluster-randomised trial, with at least two intervention and two control sites"
    ),
    'malik': (
        "Population: adults ages 18 years or older undergoing liver resection (regardless of site, extent of resection, and underlying liver pathology)\n"
        "Intervention: any commercial or non-commercial fibrin-based haemostatic agent, regardless of additive (co-interventions allowed)\n"
        "Control: intervention, placebo, non-fibrin-based haemostatic agent\n"
        "Study design: Randomised clinical trials"
    ),
    'mohamed': (
        "Population: Adult participants (aged 18 years or older) undergoing liver transplantation surgery\n"
        "Intervention: prostaglanding E1 or E2 initiated in the perioperative period\n"
        "Control: placebo or standard (routine) care\n"
        "Study design: Randomised clinical trial with parallel group design"
    ),
    'roy': (
        "Population: people with all types of sickle cell disease, of any age and sex\n"
        "Intervention: all interventions for sickle cell disease, including red blood cell transfusions, hydroxyurea, and ACE-inhibitors\n"
        "Control: placebo, standard care, or comparison between active interventions for sickle cell disease\n"
        "Study design: Randomised controlled trial"
    ),
    'santos': (
        "Population: non-hospitalised participants of any sex and age with COVID-19 diagnosis (confirmed by PCR). Include participants who had been previously hospitalised, and participants with previous venous thromboembolism diagnosis (if treatment had finished)\n"
        "Intervention: prophylactic anticoagulants (including co-treatments)\n"
        "Control: placebo, no treatment, pharmacological (active) comparator including a different anticoagulant or regimen, or non-pharmacological comparator\n"
        "Study design: Randomised controlled trial (including parallel, cluster, individual, cross-over)"
    ),
    'setthawong': (
        "Population: people with renal stones of any size at any location in the kidney\n"
        "Intervention: Extracorporeal shock wave lithotripsy (ESWL) for renal stones (with co-treatments)\n"
        "Control: percutaneous nephrolithotomy (PCNL) or retrograde intrarenal surgery (RIRS)\n"
        "Study design: Randomised controlled trial and quasi-RCTs"
    ),
    'sevaux': (
        "Population: children aged from 6 months to 17 years with acute otitis media (also including children with fever or upper respiratory tracct infection)\n"
        "Intervention: trials of paracetamol and NSAIDs administered orally or rectally, either alone or in combination (with additional co-treatments)\n"
        "Control: placebo, or active control (including NSAID or paracetamol monotherapy)\n"
        "Study design: Randomised controlled trial"
    ),
    'singh_glasses': (
        "Population: adult participants (at least 18 years of age)\n"
        "Intervention: blue-light filtering spectacle lenses (including co-treatments)\n"
        "Control: non-blue-light filtering spectacle lenses\n"
        "Study design: Randomised controlled trial"
    ),
    'singh_pemphigoid': (
        "Population: people of any age who have received treatment for a diagnosis of bullous pemphigoid (confirmed by immunofluorescence studies)\n"
        "Intervention: any therapeutic intervention used to treat bullous pemphigoid\n"
        "Control: placebo\n"
        "Study design: Randomised controlled trials, including cluster-RCTs, cross-over RCTs, and multiple-arm trials"
    ),
    'sulewski': (
        "Population: participants of all ages who had corneal abrasions within 48 hours of presentation, of any cause\n"
        "Intervention: topical ophthalmic anaesthetics (amide or ester class), with co-treatments including NSAIDs\n"
        "Control: placebo, non-treatment, or alternative treatment including non-amide and non-ester treatments\n"
        "Study design: Randomised controlled trials"
    ),
    'sulistyo': (
        "Population: participants diagnosed with definite, possible, probable, or probable-laboratory supported amyotrophic lateral sclerosis (according to El Escorial or revised El Escorial criteria)\n"
        "Intervention: placement of enteral tube feeding (NG tube, PEG, PEG-J, surgical gastrostomy, PEJ, surgical jejunostomy, PRG/RIG, and PIG) at any time during course of ALS, with or without continued oral feeding\n"
        "Control: no enteral tube feeding and continued oral intake\n"
        "Study design: Randomised controlled trials, quasi-randomised controlled trials, cross-over trials"
    ),
    'white': (
        "Population: adults aged 18 years or older requiring intubation in prehospital, ED, ICU, and OT environments\n"
        "Intervention: any form of apnoeic oxygenation (including high-flow and low-flow nassual cannulae) prior to\n"
        "Control: no apnoeic oxygenationd prior to intubation\n"
        "Study design: Randomised controlled trial and quasi-RCTs (including not strictly randomised studies)"
    ),
    'younis': (
        "Population: People of any age with one or more donor site wounds, with split-thickness skin grafts\n"
        "Intervention: any hydrogel dressing\n"
        "Control: no dressing, or any other dressing with or without topical agents\n"
        "Study design: Randomised controlled trial, cluster-RCTs, split-body trials"
    ),
    'zhu': (
        "Population: adults (18 years or older) diagnosed with liver cirrhosis either by liver biopsy or typical clinical signs, regardless of aetiology, manifestation, or severity\n"
        "Intervention: Expanded polytetrafluoroethylene-covered stents, including Viatorr stents, Fluency stents, or a combination of both, in TIPS. Co-interventions allowed\n"
        "Control: bare metal stents in TIPS\n"
        "Study design: Randomised controlled trial, including parallel group designs"
    )
}
# Exclusion criteria for each systematic review
# Multi-item lists carry an explicit "\n" between items: adjacent string
# literals are concatenated as-is, so without it the items run together
# (e.g. "...(under 18 years old)2. Participants ...") in the prompt.
excl_criteria_dict = {
    'bellon': (
        "1. Paediatric populations (under 18 years old)\n"
        "2. Participants with interfering co-morbidities\n"
        "3. Emergency surgery & off-pump cardiac surgery\n"
        "4. Study designs other than randomised-control trials"
    ),
    'buchan': (
        "1. Participants having medically assisted hydration as part of a perioperative, chemotherapy, or radiotherapy regimen (or because of chemotherapy/radiotherapy adverse effects)"
    ),
    'clezar': (
        "1. No pharmacological intervention"
    ),
    'cutting': (
        "1. Sibling oocyte studies\n"
        "2. Couples with severe male infertility"
    ),
    'dopper': (
        "1. Quasi-RCTs\n"
        "2. Preterm infants below 37 completed gestational weeks, or infants older than one month postnatal age"
    ),
    'ghoraba': (
        "1. Post-operative follow-up of less than 3 months"
    ),
    'hjetland': (
        "1. Quasi-experimental studies, within-subjects studies\n"
        "2. Studies involving children of 6 years or older at pretest\n"
        "3. Studies involving participants with learning or developmental disorders (other than developmental language disorders)\n"
        "4. active control interventions"
    ),
    'karkou': (
        "No exclusion criteria specified"
    ),
    'lin': (
        "No exclusion criteria specified"
    ),
    'lynch': (
        "1. Any study not of randomised or cluster-randomised trial design\n"
        "2. Any study reporting care on rehabilitation stroke units (accepting patients more than 7 days after onset)\n"
        "3. Any study only investigating the intervention of organised care provided in inpatient stroke units\n"
        "4. Any study not investigating or reporting adherence"
    ),
    'malik': (
        "No exclusion criteria specified"
    ),
    'mohamed': (
        "1 Pseudo-randomised studies (i.e. quasi-randomised studies)"
    ),
    'roy': (
        "1. Exclude cross-over trials"
    ),
    'santos': (
        "1. People receiving treatment for current venous thromboembolism\n"
        "2. Participants with COVID-19 who are currently hospitalised (unless study also investigates a population of non-hospitalised patients)"
    ),
    'setthawong': (
        "1. Studies in children (under 14 years old) and pregnant women"
    ),
    'sevaux': (
        "1. Studies on children with grommets (ventilation tubes or tympanostomy tubes)"
    ),
    'singh_glasses': (
        "1. Studies with first follow-up at more than 3 months"
    ),
    'singh_pemphigoid': (
        "1. Studies involving participants with various dermatoses, if not clear group with bullous pemphigoid"
    ),
    'sulewski': (
        "1. Trials in which participants given topical anaesthetics only once after trauma- or surgery-induced abrasion"
    ),
    'sulistyo': (
        "No exclusion criteria specified"
    ),
    'white': (
        "No exclusion criteria specified"
    ),
    'younis': (
        "1 Quasi-randomised, cross-over, and split-wound trials"
    ),
    'zhu': (
        "1 Quasi-randomised studies and observational studies\n"
        "2 Trials with participants who were diagnosed with hepatocellular carcinoma or other malignant disease before the TIPS procedure"
    )
}

In [None]:
def get_messages(bias_level, SR_title, incl_criteria, excl_criteria, title, abstract):
    """Build the chat messages used to screen one article.

    Parameters
    ----------
    bias_level : str
        Prompt variant: 'none', 'mild', 'moderate', 'heavy', 'extreme' or
        'title'.
    SR_title, incl_criteria, excl_criteria : str
        Title and criteria of the systematic review.
    title, abstract : str
        Title and abstract of the article being screened.

    Returns
    -------
    list of dict
        System messages followed by user messages, each a dict with the keys
        "role" and "content". The 'title' variant omits the criteria from the
        user messages.

    Raises
    ------
    ValueError
        If ``bias_level`` is not one of the supported variants.
    """
    def _tag(role, texts):
        # Wrap each text as a chat message carrying the given role.
        return [{"role": role, "content": text} for text in texts]

    # Sentences shared by several variants.
    intro = f"You are screening articles for inclusion in the systematic review titled '{SR_title}'."
    by_criteria = "Using the inclusion criteria for the systematic review provided below, decide if the screened articles should be included based on their title and abstract."
    answer_format = "Respond only with either 'Include' or 'Exclude'"

    if bias_level == 'extreme':
        # This variant has its own wording for both system and user messages.
        system_texts = [
            f"You are screening articles for inclusion in the systematic review titled '{SR_title}', with heavy preference towards inclusion.",
            "Your primary goal is to include articles unless there are overwhelming reasons to exclude.",
            "Apply the inclusion criteria very loosely and the exclusion criteria extremely strictly. Assume relevance unless proven otherwise beyond any reasonable doubt.",
            "Prioritise maximum sensitivity. Exclude only if the article is unequivocally irrelevant or explicitly violates multiple exclusion criteria.",
            "Respond only with 'Include' or 'Exclude'.",
        ]
        user_texts = [
            f"Systematic Review Title: {SR_title}",
            f"Inclusion Criteria: {incl_criteria}",
            f"Exclusion Criteria: {excl_criteria}",
            f"Article Title: {title}",
            f"Abstract: {abstract}",
        ]
    elif bias_level == 'title':
        # Screening on the review title alone: no criteria are sent.
        system_texts = [
            intro,
            "Using the systematic review title, decide if the screened articles should be included based on the relevance of their title and abstract.",
            "If there is any uncertainty, lean towards including the article",
            answer_format,
        ]
        user_texts = [
            f"Systematic Review Title: {SR_title}",
            f"Article title: {title}",
            f"Article abstract: {abstract}",
        ]
    else:
        # Graded variants differ only by one optional extra instruction.
        if bias_level == 'none':
            nudge = []
        elif bias_level == 'mild':
            nudge = ["Where relevance to the systematic review title is apparent, and the article meets most inclusion criteria, favour inclusion unless clear exclusion criteria are met."]
        elif bias_level == 'moderate':
            nudge = ["If the article seems likely to be relevant to the systematic review's theme, and it does not explicitly meet any exclusion criteria, lean towards inclusion."]
        elif bias_level == 'heavy':
            nudge = ["If there is any uncertainty if an article fulfils the inclusion criteria, but the abstract appears relevant to the title of the systematic review, lean towards including the article, as long as no exclusion criteria are met."]
        else:
            raise ValueError("Invalid bias level specified")
        system_texts = [intro, by_criteria] + nudge + [answer_format]
        user_texts = [
            f"Systematic Review Title: {SR_title}",
            f"Inclusion Criteria (PICOS): {incl_criteria}",
            f"Exclusion Criteria: {excl_criteria}",
            f"Article title: {title}",
            f"Article abstract: {abstract}",
        ]

    return _tag("system", system_texts) + _tag("user", user_texts)

In [None]:
#Combined dataframe generation
def generate_df(sample):
    """Combine the per-review frames into one labelled evaluation frame.

    Reads the module-level ``includes_dict`` and ``review_dict`` without
    modifying them.

    Parameters
    ----------
    sample : bool
        If truthy, build the reduced sample: every included record plus a
        seeded random draw of up to 23 excluded records per review, taken
        only from reviews with more than 32 records. If falsy, build the full
        frame: every included record plus every screened record that has an
        abstract and is not among the review's included titles.

    Returns
    -------
    pandas.DataFrame
        Included rows first (``truth == 'Include'``), then excluded rows
        (``truth == 'Exclude'``), each tagged with its ``review`` name and an
        empty ``decision``; the index is reset to 0..n-1.
    """
    # NOTE(review): the reason the first 32 rows of each review are skipped
    # when sampling is not visible here; the value is kept as-is.
    skip_first = 32
    n_excluded = 23
    seed = 42  # fixed seed for reproducibility

    def _included_titles(review):
        # Titles included for this review; works for a missing or column-less
        # (empty) includes frame, not only for one specific review.
        included = includes_dict.get(review)
        if included is None or 'title' not in included.columns:
            return set()
        return set(included['title'])

    # assign() returns new frames, so the cached includes frames are untouched.
    frames = [
        included.assign(review=review, decision='', truth='Include')
        for review, included in includes_dict.items()
        if not included.empty
    ]

    for review, screened in review_dict.items():
        if sample and screened.shape[0] <= skip_first:
            continue
        pool = screened[screened['abstract'].notnull()]
        if sample:
            pool = pool.iloc[skip_first:]
        # Never label a truly included article as 'Exclude'.
        pool = pool[~pool['title'].isin(_included_titles(review))]
        if pool.empty:
            continue
        if sample:
            # Cap at the pool size so small reviews do not raise.
            pool = pool.sample(n=min(n_excluded, len(pool)), random_state=seed)
            frames.append(pool.assign(review=review, decision='', truth='Exclude'))
        else:
            frames.append(pool.assign(review=review, decision='', output='', truth='Exclude'))

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of growing a frame inside the loops.
    return pd.concat(frames).reset_index(drop=True)

# Function to determine individual review stats
def solo_stats(df, author):
    """Summarise screening performance for a single review.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'adjusted_decision', 'truth' and 'review'.
    author : str
        Review name; only rows whose 'review' equals it are counted.

    Returns
    -------
    str
        Six newline-terminated lines reporting sensitivity, specificity,
        accuracy, precision/PPV, NPV and F1-score. A metric whose denominator
        is zero is reported as "N/A".
    """
    def _rate(hits, total):
        # Proportion, or "N/A" when there is nothing to divide by.
        return hits / total if total > 0 else "N/A"

    # Boolean masks reused by all four confusion-matrix cells.
    for_author = df['review'] == author
    said_include = df['adjusted_decision'] == 'Include'
    said_exclude = df['adjusted_decision'] == 'Exclude'
    is_include = df['truth'] == 'Include'
    is_exclude = df['truth'] == 'Exclude'

    tp = (said_include & is_include & for_author).sum()
    tn = (said_exclude & is_exclude & for_author).sum()
    fp = (said_include & is_exclude & for_author).sum()
    fn = (said_exclude & is_include & for_author).sum()

    sensitivity = _rate(tp, tp + fn)
    specificity = _rate(tn, tn + fp)
    accuracy = _rate(tp + tn, tp + tn + fp + fn)
    precision = _rate(tp, tp + fp)
    npv = _rate(tn, tn + fn)

    # F1 is only defined when both inputs are numeric and not both zero.
    both_numeric = isinstance(precision, float) and isinstance(sensitivity, float)
    if both_numeric and (precision + sensitivity) > 0:
        f1_score = 2 * (precision * sensitivity) / (precision + sensitivity)
    else:
        f1_score = "N/A"

    report = [
        ("Sensitivity", sensitivity),
        ("Specificity", specificity),
        ("Accuracy", accuracy),
        ("Precision/PPV", precision),
        ("NPV", npv),
        ("F1-score", f1_score),
    ]
    return "".join(f"{label} for {author}: {value}\n" for label, value in report)

In [None]:
# Service identifiers that evaluateLLM knows how to call.
_SUPPORTED_SERVICES = ("openai", "anthropic-vertex", "google-vertex", "meta-replicate")


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or the string "N/A" if the denominator is not positive."""
    return numerator / denominator if denominator > 0 else "N/A"


def _query_llm(service, model, messages, temp, max_tok):
    """Send one request to the selected service and return the stripped reply text.

    The "openai" and "anthropic-vertex" branches use the notebook-level
    ``client`` object. Any exception raised by the underlying SDK propagates
    to the caller, which is responsible for retrying.
    """
    if service == "openai":
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tok,
            temperature=temp,
        )
        return response.choices[0].message.content.strip()

    if service == "anthropic-vertex":
        message = client.messages.create(
            max_tokens=max_tok,
            temperature=temp,
            messages=messages,
            model=model,
        )
        return message.content[0].text.strip()

    if service == "google-vertex":
        generation_config = {
            "max_output_tokens": max_tok,
            "temperature": temp,
        }
        harm_categories = (
            SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        )
        # Same threshold for every category: only block high-severity content.
        safety_settings = [
            SafetySetting(
                category=category,
                threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH,
            )
            for category in harm_categories
        ]
        # For this service `model` must be an object exposing generate_content().
        responses = model.generate_content(
            [messages],
            generation_config=generation_config,
            safety_settings=safety_settings,
            stream=False,
        )
        return responses.candidates[0].content.parts[0].text.strip()

    if service == "meta-replicate":
        replicate_input = {
            "prompt": messages,  # takes adjusted prompt
            "temperature": temp,
            "max_tokens": max_tok,
        }
        response = replicate.run(model, input=replicate_input)
        return response[0].strip()

    raise ValueError(f"Unsupported service: {service!r}")


def evaluateLLM(service, model, prompt, output_name, df=None, temp=0.2, max_tok=5):
    """Screen every article in ``df`` with an LLM and report performance.

    For each row the function builds a request with ``get_messages``, queries
    the chosen service (up to 10 attempts, with exponential back-off after
    exceptions), and stores 'Include' or 'Exclude' in the 'decision' column
    depending on which word appears in the reply. Results are written to
    ``Results/{output_name}.xlsx`` and a metrics summary to
    ``Results/{output_name}.txt``.

    Reads these notebook-level names: ``client``, ``df_all``, ``title_dict``,
    ``incl_criteria_dict``, ``excl_criteria_dict``, ``get_messages``,
    ``SR_list`` and ``solo_stats``.

    Args:
        service: One of "openai", "anthropic-vertex", "google-vertex",
            "meta-replicate".
        model: Model argument passed to the service call. For "google-vertex"
            it must be an object with a ``generate_content`` method.
        prompt: Passed through to ``get_messages``.
        output_name: Stem used for the output and backup file names.
        df: DataFrame with 'review', 'title', 'abstract' and 'truth' columns.
            Defaults to a deep copy of ``df_all``. A frame passed explicitly
            is modified in place.
        temp: Sampling temperature sent to the service.
        max_tok: Maximum number of output tokens sent to the service.

    Returns:
        The DataFrame with the 'decision' column filled for every row that
        produced a valid reply.

    Raises:
        ValueError: If ``service`` is not one of the supported identifiers.
    """
    # Fail fast: an unknown service would otherwise burn the whole retry
    # budget on every single row without ever producing an output.
    if service not in _SUPPORTED_SERVICES:
        raise ValueError(
            f"Unsupported service: {service!r}. Expected one of {_SUPPORTED_SERVICES}."
        )

    if df is None:
        df = df_all.copy(deep=True)

    # Guarantee the column exists so the statistics below still work when no
    # row yields a valid decision (or the frame is empty).
    if 'decision' not in df.columns:
        df['decision'] = None

    max_attempts = 10
    max_delay = 3000
    backup_counter = 1

    # Iterate over each article in dataframe
    for index, row in df.iterrows():
        review = row['review']
        SR_title = title_dict[review]
        incl_criteria = incl_criteria_dict[review]
        excl_criteria = excl_criteria_dict[review]
        print(f"Processing row {index}, {row['review']}...")

        # Build the request payload for this article
        messages = get_messages(
            prompt, SR_title, incl_criteria, excl_criteria, row['title'], row['abstract']
        )

        decision = None
        retry_delay = 1

        # Retry loop to call the API
        for attempt in range(max_attempts):
            try:
                output = _query_llm(service, model, messages, temp, max_tok)
            except Exception as e:
                # Exponential back-off
                print(f"Error, retrying in {retry_delay} seconds... {e}")
                time.sleep(retry_delay)
                retry_delay = min(retry_delay * 2, max_delay)
                continue

            lowered = output.lower()
            # 'include' is tested first, matching the original precedence.
            if 'include' in lowered:
                decision = 'Include'
                break
            if 'exclude' in lowered:
                decision = 'Exclude'
                break
            print(f"Invalid output: {output}, retrying...")

        # If successful, save output and decision
        if decision is not None:
            df.at[index, 'decision'] = decision
            print(decision)
        else:
            print("Unable to retrieve response after maximum retries, skipping row.")

        # Backup saving every 10,000 rows (also runs when this row failed, so
        # a single bad row cannot cause a scheduled backup to be skipped).
        # NOTE(review): the backup folder name is hard-coded regardless of
        # the service/model in use -- confirm this is intended.
        if (index + 1) % 10000 == 0:
            backup_file_path = f'Results/GPT4o Full Final/{output_name}_backup_{backup_counter}.xlsx'
            os.makedirs(os.path.dirname(backup_file_path), exist_ok=True)
            df.to_excel(backup_file_path, index=False)
            print(f"Backup saved to {backup_file_path}")
            backup_counter += 1

    # Save final results
    file_path = f'Results/{output_name}.xlsx'
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    df.to_excel(file_path, index=False)

    # Confusion matrix stats
    predicted_include = df['decision'] == 'Include'
    predicted_exclude = df['decision'] == 'Exclude'
    actual_include = df['truth'] == 'Include'
    actual_exclude = df['truth'] == 'Exclude'

    true_positives = (predicted_include & actual_include).sum()
    true_negatives = (predicted_exclude & actual_exclude).sum()
    false_positives = (predicted_include & actual_exclude).sum()
    false_negatives = (predicted_exclude & actual_include).sum()

    # Calculate sensitivity, specificity, accuracy, precision, NPV, and F1-score
    sensitivity = _safe_ratio(true_positives, true_positives + false_negatives)
    specificity = _safe_ratio(true_negatives, true_negatives + false_positives)
    accuracy = _safe_ratio(
        true_positives + true_negatives,
        true_positives + true_negatives + false_positives + false_negatives,
    )
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    npv = _safe_ratio(true_negatives, true_negatives + false_negatives)
    if isinstance(precision, float) and isinstance(sensitivity, float) and (precision + sensitivity) > 0:
        f1_score = 2 * (precision * sensitivity) / (precision + sensitivity)
    else:
        f1_score = "N/A"

    count_lines = [
        f"True Positives: {true_positives}",
        f"True Negatives: {true_negatives}",
        f"False Positives: {false_positives}",
        f"False Negatives: {false_negatives}",
    ]

    print(f"Sensitivity for the entire dataframe: {sensitivity}")
    print(f"Specificity for the entire dataframe: {specificity}")
    print(f"Accuracy for the entire dataframe: {accuracy}")
    print(f"Precision/PPV for the entire dataframe: {precision}")
    print(f"NPV for the entire dataframe: {npv}")
    print(f"F1-score for the entire dataframe: {f1_score}")
    print("")
    for count_line in count_lines:
        print(count_line)
    print("               ")

    lines = [
        f"{output_name} Results:",
        "Performance Metrics for the Entire DataFrame:",
        f"Sensitivity for the entire dataframe: {sensitivity}",
        f"Specificity for the entire dataframe: {specificity}",
        f"Accuracy for the entire dataframe: {accuracy}",
        f"Precision (PPV) for the entire dataframe: {precision}",
        f"NPV for the entire dataframe: {npv}",
        f"F1-score for the entire dataframe: {f1_score}",
        "",
        *count_lines,
        "---------------",
    ]

    # solo_stats reads the 'adjusted_decision' column, which this function
    # never writes. When the frame does not already carry it, fall back to the
    # decisions just collected instead of failing with a KeyError.
    # NOTE(review): confirm that per-review stats should use 'decision' here.
    if 'adjusted_decision' in df.columns:
        per_review_df = df
    else:
        per_review_df = df.assign(adjusted_decision=df['decision'])

    # Calculate sensitivity and specificity for each author
    for author in SR_list:
        lines.append(solo_stats(per_review_df, author))
        lines.append("---------------")

    text_file_path = f'Results/{output_name}.txt'
    with open(text_file_path, 'w') as file:
        file.write("\n".join(lines))

    return df