# Extraction of data parameters from NICE (UK) HTA document for the drug Ivabradine for chronic heart failure

In [1]:
import json
import pandas as pd
import re
import os
import time
import anthropic
from pypdf import PdfReader

In [2]:
# Create the Anthropic API client.
# The key is read from the ANTHROPIC_API_KEY environment variable so that the
# secret does not have to be typed into (and saved with) this notebook. When
# the variable is not set, the empty string is passed as a placeholder.
client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY", ""),
)

In [3]:
# Show complete cell contents and all rows when a dataframe is displayed
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [4]:
# File path for input HTA document
input_path = "../data/HTA-docs/NICE-ivabradine.pdf"

# Output path to store extracted data parameters in csv file format
output_path = '../data/processed/claude'

# Create the output directory, including missing parent directories.
# exist_ok=True makes this a no-op when the directory already exists, so no
# separate existence check (which could race with another process) is needed.
os.makedirs(output_path, exist_ok=True)

In [5]:
# Extract the file name of the input document, with and without extension.
# os.path.basename handles the platform's path separators, and
# os.path.splitext removes only the final extension, so a name such as
# "report.v2.pdf" keeps the stem "report.v2" instead of being cut to "report".
filename_with_extension = os.path.basename(input_path)
filename = os.path.splitext(filename_with_extension)[0]

In [6]:
# Set up function for API call
def get_completion(prompt, *, model="claude-3-opus-20240229", max_tokens=4096,
                   temperature=0.0):
    """Send a single user message through the module-level client and return the reply text.

    Args:
        prompt: Text sent as the content of the single user message.
        model: Model identifier passed to the API.
        max_tokens: Value passed as the ``max_tokens`` request parameter.
        temperature: Value passed as the ``temperature`` request parameter.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    # Only the first content block is read; any further blocks are ignored.
    return message.content[0].text

# Extract data parameters

In [7]:
# Instructions for Claude: the role description, the required JSON keys and
# the fourteen questions. Every key named in a question must match the key
# list exactly, because the model is asked to return those keys verbatim.
instructions = '''
You are an expert at reading Health Technology Assessment (HTA) documents from the European Union. You can 
accurately extract the requested information from an HTA document. You will be asked a series of questions about
the information in an HTA document. Read the document carefully, and answer the questions accurately. 

Provide the results in English, even if the source document is not in English.

On the basis of the provided Health Technology Assessment (HTA) document, answer the following questions. 
If you can't find the answer to a question, return 'None' as the answer for that question.

Return the result in valid JSON object format,
with keys "hta_id", "assessment_type", "internal_identifier", "inn", "brand_name", "assessment_date",
"indication", "final_recommendation", "comparator", "outcome_rea", "outcome_cea", "outcome_bi",
"managed_entry_agreements", "clinical_restrictions".

If there is no information present, print an empty JSON object, like this: {}.

Questions to be answered:

1. HTA ID: Which HTA body is performing the assessment? This answer should be the value corresponding to the key "hta_id".
2. Assessment type: What type of assessment is this - is this a first assessment, a reassessment, 
   or an indication broadening? Answer only with one of these three options - 
   first assessment, reassessment, indication broadening. 
   This answer should be the value corresponding to the key "assessment_type".
3. Internal identifier: What is the HTA document's internal identifier? This answer should be the value corresponding to the key "internal_identifier".
4. INN: What is the international non-proprietary name of the drug(s)? 
   This answer should be the value corresponding to the key "inn".
5. Brand name: What is the brand name of the drug? 
   This answer should be the value corresponding to the key "brand_name".
6. Assessment date: On which date is this recommendation published? 
   This answer should be the value corresponding to the key "assessment_date".
7. Indication: What is the broad medical indication (diagnosis) for which this recommendation is made? 
   This answer should be the value corresponding to the key "indication".
8. Final recommendation: What is the final recommendation for the drug-indication combination(s) 
   that is(are) assessed? This answer should be the value corresponding to the key "final_recommendation".
9. Comparator: What is(are) the INN of the drug(s) that is(are) used as a comparator? 
   This answer should be the value corresponding to the key "comparator". 
10. Outcome REA: What is the conclusion about the relative effectiveness of the assessed health technology? 
    Answer only with 'positive' (if the conclusion is positive) or 'negative' (if the conclusion is negative).
    This answer should be the value corresponding to the key "outcome_rea".
11. Outcome CEA: What is the conclusion about the cost-effectiveness of the assessed health technology?
    Answer only with 'positive' (if the conclusion is positive) or 'negative' (if the conclusion is negative). 
    This answer should be the value corresponding to the key "outcome_cea".
12. Outcome BI: What is the conclusion about the budget impact of the assessed health technology? 
    Answer only with the conclusion, if it was performed, and 'None' otherwise. 
    This answer should be the value corresponding to the key "outcome_bi".
13. Managed entry agreements: Is any OECD-defined managed entry agreement proposed? If so, which class?
    Answer only with the class name if there is a managed entry agreement proposed, and 'None' otherwise. 
    This should be the value corresponding to the key "managed_entry_agreements".
14. Clinical restrictions: Are any clinical restrictions stated in the recommendation? Answer only with the 
    clinical restrictions if any clinical restrictions are stated, and 'None' otherwise. 
    This should be the value corresponding to the key "clinical_restrictions".
    
Return only the JSON object containing the answers to these questions, and no other text, code, or explanation.

Provide the answers in English.
'''

In [8]:
# Read HTA document
reader = PdfReader(input_path)
number_of_pages = len(reader.pages)

# Concatenate the text extracted from every page. Pages are separated by a
# newline so that the last word of one page is not fused with the first word
# of the next page.
text = '\n'.join(page.extract_text() for page in reader.pages)

In [9]:
# Text of HTA document
text

"Ivabradine f or tr eating \nchronic he art failur e \nTechnology appraisal guidance \nPublished: 28 No vember 2012 \nwww .nice.or g.uk/guidance/ta267 \n© NICE 202 4. All right s reserved. Subject t o Notice of right s (https://www .nice.or g.uk/t erms-and-\nconditions#notice-of -right s).Your r esponsi bility \nThe r ecommendations in t his guidance r epresent t he view of NICE, arriv ed at aft er car eful \nconsideration of t he evidence a vailable. When e xercising t heir judgement, healt h \nprofessionals ar e expect ed to tak e this guidance fully int o account, alongside t he \nindividual needs, pr eferences and v alues of t heir patient s. The application of t he \nrecommendations in t his guidance is at t he discr etion of healt h professionals and t heir \nindividual patient s and do not o verride t he responsibility of healt hcare professionals t o \nmake decisions appr opriat e to the cir cumstances of t he individual patient, in consultation \nwith the patient and/or t heir

In [10]:
# Build the full prompt: the instructions followed by the labelled document text
prompt = f'{instructions} HTA document text: {text}'

In [11]:
%%time

# Number of runs
num_runs = 3

# List to store result dataframes
list_df = []

for run in range(num_runs):

    print(f'Starting run {run} ...')

    # Call the API
    response = get_completion(prompt)

    # Post-processing: keep only the span from the first '{' to the last '}',
    # dropping any text that surrounds the JSON object in the response
    result = re.search(r'\{.*\}', response, re.DOTALL)
    if result is None:
        # Fail with the offending response instead of an opaque AttributeError
        raise ValueError(
            f'Run {run}: no JSON object found in the response: {response!r}'
        )
    json_data_string = result.group()

    # Read data into JSON object
    data = json.loads(json_data_string)

    # Output dataframe from this run (a single row, one column per key)
    df = pd.DataFrame([data])

    # Save dataframe to csv file
    df.to_csv(f'{output_path}/{filename}_run_{run}.csv', index=False)

    # Append to list of dataframes
    list_df.append(df)

    print(f'Finished run {run}.\n')

    # Sleep time to prevent exceeding per minute rate limit for tokens.
    # Not needed after the final run, since no further request follows.
    if run < num_runs - 1:
        time.sleep(60)

Starting run 0 ...
Finished run 0.

Starting run 1 ...
Finished run 1.

Starting run 2 ...
Finished run 2.

CPU times: user 63.3 ms, sys: 13.6 ms, total: 76.9 ms
Wall time: 4min 29s


In [12]:
# Stack the per-run dataframes (one row per run, renumbered from 0), then
# transpose so that each parameter is a row and each run is a column
df_concat = pd.concat(list_df, ignore_index=True).transpose()

# Display extracted parameters

In [13]:
# Display the results from all runs
df_concat

Unnamed: 0,0,1,2
hta_id,NICE,NICE,NICE
assessment_type,first assessment,first assessment,first assessment
internal_identifier,TA267,TA267,TA267
inn,ivabradine,ivabradine,ivabradine
brand_name,Procoralan,Procoralan,Procoralan
assessment_date,28 November 2012,28 November 2012,28 November 2012
indication,chronic heart failure,chronic heart failure,chronic heart failure
final_recommendation,"Ivabradine is recommended as an option for treating chronic heart failure for people with New York Heart Association (NYHA) class II to IV stable chronic heart failure with systolic dysfunction and who are in sinus rhythm with a heart rate of 75 beats per minute (bpm) or more and who are given ivabradine in combination with standard therapy including beta-blocker therapy, angiotensin-converting enzyme (ACE) inhibitors and aldosterone antagonists, or when beta-blocker therapy is contraindicated or not tolerated and with a left ventricular ejection fraction of 35% or less.","Ivabradine is recommended as an option for treating chronic heart failure for people with New York Heart Association (NYHA) class II to IV stable chronic heart failure with systolic dysfunction and who are in sinus rhythm with a heart rate of 75 beats per minute (bpm) or more and who are given ivabradine in combination with standard therapy including beta-blocker therapy, angiotensin-converting enzyme (ACE) inhibitors and aldosterone antagonists, or when beta-blocker therapy is contraindicated or not tolerated and with a left ventricular ejection fraction of 35% or less.","Ivabradine is recommended as an option for treating chronic heart failure for people with New York Heart Association (NYHA) class II to IV stable chronic heart failure with systolic dysfunction and who are in sinus rhythm with a heart rate of 75 beats per minute (bpm) or more and who are given ivabradine in combination with standard therapy including beta-blocker therapy, angiotensin-converting enzyme (ACE) inhibitors and aldosterone antagonists, or when beta-blocker therapy is contraindicated or not tolerated and with a left ventricular ejection fraction of 35% or less."
comparator,placebo,placebo,placebo
outcome_rea,positive,positive,positive
