# Extraction of data parameters from HAS (France) HTA document for the drug Ivabradine for chronic heart failure

In [1]:
import json
import pandas as pd
import re
import os
import time
import anthropic
from pypdf import PdfReader

In [2]:
# Read the API key from the ANTHROPIC_API_KEY environment variable instead of
# typing it into the notebook, so the secret is never saved with the file.
# os.environ.get returns None when the variable is not set.
client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)

In [3]:
# Render dataframes in full: no limit on column width or on the number of rows
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [4]:
# File path for input HTA document
input_path = "../data/HTA-docs/HAS-ivabradine.pdf"

# Output path to store extracted data parameters in csv file format
output_path = '../data/processed/claude'

# Create the output directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the gap between checking for the directory and creating it.
os.makedirs(output_path, exist_ok=True)

In [5]:
# Extract the file name and its stem (the name without its final extension).
# os.path.basename honours the platform's path separators, and
# os.path.splitext removes only the last extension, so dots inside the name
# are kept (e.g. "HAS.ivabradine.v2.pdf" -> "HAS.ivabradine.v2").
filename_with_extension = os.path.basename(input_path)
filename = os.path.splitext(filename_with_extension)[0]

In [6]:
# Set up function for API call
def get_completion(prompt, *, model="claude-3-opus-20240229", max_tokens=4096,
                   temperature=0.0):
    """Send one user message through the module-level ``client`` and return the reply text.

    Args:
        prompt: Text sent as the content of the single user message.
        model: Model identifier passed to ``client.messages.create``.
        max_tokens: Value passed as ``max_tokens`` to the API call.
        temperature: Value passed as ``temperature`` to the API call.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    # Only the first content block is returned; any further blocks are ignored
    return message.content[0].text

# Extract data parameters

In [7]:
# Instructions for Claude
# The key named in each numbered question must match the key list exactly,
# because the answer is parsed as a JSON object and stored under those keys.
instructions = '''
You are an expert at reading Health Technology Assessment (HTA) documents from the European Union. You can 
accurately extract the requested information from an HTA document. You will be asked a series of questions about
the information in an HTA document. Read the document carefully, and answer the questions accurately. 

Provide the results in English, even if the source document is not in English.

On the basis of the provided Health Technology Assessment (HTA) document, answer the following questions. 
If you can't find the answer to a question, return 'None' as the answer for that question.

Return the result in valid JSON object format,
with keys "hta_id", "assessment_type", "internal_identifier", "inn", "brand_name", "assessment_date",
"indication", "final_recommendation", "comparator", "outcome_rea", "outcome_cea", "outcome_bi",
"managed_entry_agreements", "clinical_restrictions".

If there is no information present, print an empty JSON object, like this: {}.

Questions to be answered:

1. HTA ID: Which HTA body is performing the assessment? This answer should be the value corresponding to the key "hta_id".
2. Assessment type: What type of assessment is this - is this a first assessment, a reassessment, 
   or an indication broadening? Answer only with one of these three options - 
   first assessment, reassessment, indication broadening. 
   This answer should be the value corresponding to the key "assessment_type".
3. Internal identifier: What is the HTA document's internal identifier? This answer should be the value corresponding to the key "internal_identifier".
4. INN: What is the interational non-propreitary name of the drug(s)? 
   This answer should be the value corresponding to the key "inn".
5. Brand name: What is the brand name of the drug? 
   This answer should be the value corresponding to the key "brand_name".
6. Assessment date: On which date is this recommendation published? 
   This answer should be the value corresponding to the key "assessment_date".
7. Indication: What is the broad medical indication (diagnosis) for which this recommendation is made? 
   This answer should be the value corresponding to the key "indication".
8. Final recommendation: What is the final recommendation for the drug-indication combination(s) 
   that is(are) assessed? This answer should be the value corresponding to the key "final_recommendation".
9. Comparator: What is(are) the INN of the drug(s) that is(are) used as a comparator? 
   This answer should be the value corresponding to the key "comparator". 
10. Outcome REA: What is the conclusion about the relative effectiveness of the assessed health technology? 
    Answer only with 'positive' (if the conclusion is positive) or 'negative' (if the conclusion is negative).
    This answer should be the value corresponding to the key "outcome_rea".
11. Outcome CEA: What is the conclusion about the cost-effectiveness of the assessed health technology?
    Answer only with 'positive' (if the conclusion is positive) or 'negative' (if the conclusion is negative). 
    This answer should be the value corresponding to the key "outcome_cea".
12. Outcome BI: What is the conclusion about the budget impact of the assessed health technology? 
    Answer only with the conclusion, if it was performed, and 'None' otherwise. 
    This answer should be the value corresponding to the key "outcome_bi".
13. Managed entry agreements: Is any OECD-defined managed entry agreement proposed? If so, which class?
    Answer only with the class name if there is a managed entry agreement proposed, and 'None' otherwise. 
    This should be the value corresponding to the key "managed_entry_agreements".
14. Clinical restrictions: Are any clinical restrictions stated in the recommendation? Answer only with the 
    clinical restrictions if any clinical restrictions are stated, and 'None' otherwise. 
    This should be the value corresponding to the key "clinical_restrictions".
    
Return only the JSON object containing the answers to these questions, and no other text, code, or explanation.

Provide the answers in English.
'''

In [8]:
# Open the PDF and join the extracted text of all pages, in page order and
# with no separator between pages, into one string
reader = PdfReader(input_path)
text = ''.join(page.extract_text() for page in reader.pages)
number_of_pages = len(reader.pages)

In [9]:
# Display the text extracted from the HTA document so it can be inspected
text

"Avis 2 suite audition – corrections matérielles 1/12 ² \n \n \n \n \n \n \nCOMMISSION DE LA TRANSPARENCE \n \n \nAVIS  \n \n19 septembre 2012  \n \nL’avis de la Commission de la transparence du 6 jui n 2012 \na fait l’objet d’une audition le 19 septembre 2012 \n \n \nPROCORALAN 5 mg, comprimé pelliculé  \nB/56 (CIP : 371 676-2) \nB/100 (CIP : 567 208-1) \nPROCORALAN 7,5 mg, comprimé pelliculé  \nB/56 (CIP : 371 679-1) \nB/100 (CIP : 567 209-8) \n \nLaboratoires SERVIER \n \nivabradine \n \nCode ATC  : C01EB17 (autre médicament de cardiologie) \n \nListe I \n \nDate de l'AMM  (centralisée) : 25 octobre 2005 \n \nDate de l’extension d’indication chez les patients insuffisants cardiaques  : 09 février 2012 \n \n \nMotif de la demande  : Inscription Sécurité Sociale (B/56 seulement) et  Collectivités (B/56 et \nB/100) dans l’extension d’indication : « Traitement  de l’insuffisance cardiaque chronique : \nL’ivabradine est indiquée dans le traitement de l’i nsuffisance cardiaque chronique d

In [10]:
# Assemble the prompt: the instructions, a label, then the document text
prompt = ''.join([instructions, ' HTA document text: ', text])

In [11]:
%%time

# Number of runs
num_runs = 3

# List to store output JSON objects
list_json = []

# List to store result dataframes
list_df = []

for run in range(num_runs):

    print(f'Starting run {run} ...')

    # Call the API
    response = get_completion(prompt)

    # Post-processing: keep the span from the first '{' to the last '}' so
    # that any text surrounding the JSON object is discarded
    result = re.search(r'\{.*\}', response, re.DOTALL)
    if result is None:
        # re.search returns None when the response holds no braces at all;
        # fail with a clear message instead of an AttributeError on .group()
        raise ValueError(
            f'Run {run}: no JSON object found in the model response: {response!r}'
        )
    json_data_string = result.group()

    # Read data into JSON object
    data = json.loads(json_data_string)

    # Output dataframe from this run (a single row, one column per key)
    df = pd.DataFrame([data])

    # Save dataframe to csv file
    df.to_csv(f'{output_path}/{filename}_run_{run}.csv', index=False)

    # Append to list of JSON objects
    list_json.append(data)

    # Append to list of dataframes
    list_df.append(df)

    print(f'Finished run {run}.\n')

    # Sleep time to prevent exceeding per minute rate limit for tokens
    time.sleep(60)

Starting run 0 ...
Finished run 0.

Starting run 1 ...
Finished run 1.

Starting run 2 ...
Finished run 2.

CPU times: user 54.6 ms, sys: 13.9 ms, total: 68.4 ms
Wall time: 4min 6s


In [12]:
# Concatenate the dataframes from different runs and transpose the result.
# pd.concat accepts the list directly, so no intermediate copy is needed.
df_concat = pd.concat(list_df, ignore_index=True).T

# Display extracted parameters

In [13]:
# Display the results from all runs (one row per extracted field, one column per run)
df_concat

Unnamed: 0,0,1,2
hta_id,Commission de la Transparence,Commission de la Transparence,Commission de la Transparence
assessment_type,indication broadening,indication broadening,indication broadening
internal_identifier,Avis 2 suite audition – corrections matérielles,Avis 2 suite audition – corrections matérielles,Avis 2 suite audition – corrections matérielles
inn,ivabradine,ivabradine,ivabradine
brand_name,PROCORALAN,PROCORALAN,PROCORALAN
assessment_date,19 septembre 2012,19 septembre 2012,19 septembre 2012
indication,"Traitement de l'insuffisance cardiaque chronique de classe NYHA II à IV avec dysfonction systolique, chez les patients en rythme sinusal et dont la fréquence cardiaque est supérieure ou égale à 75 bpm, en association au traitement standard comprenant les bêtabloquants, ou en cas de contre-indication ou d'intolérance aux bêtabloquants","Traitement de l'insuffisance cardiaque chronique de classe NYHA II à IV avec dysfonction systolique, chez les patients en rythme sinusal et dont la fréquence cardiaque est supérieure ou égale à 75 bpm, en association au traitement standard comprenant les bêtabloquants, ou en cas de contre-indication ou d'intolérance aux bêtabloquants","Traitement de l'insuffisance cardiaque chronique de classe NYHA II à IV avec dysfonction systolique, chez les patients en rythme sinusal et dont la fréquence cardiaque est supérieure ou égale à 75 bpm, en association au traitement standard comprenant les bêtabloquants, ou en cas de contre-indication ou d'intolérance aux bêtabloquants"
final_recommendation,Avis favorable à l'inscription sur les listes des spécialités remboursables aux assurés sociaux et des médicaments agréés à l'usage des collectivités et divers services publics dans l'indication,Avis favorable à l'inscription sur les listes des spécialités remboursables aux assurés sociaux et des médicaments agréés à l'usage des collectivités et divers services publics dans l'indication,Avis favorable à l'inscription sur les listes des spécialités remboursables aux assurés sociaux et des médicaments agréés à l'usage des collectivités et divers services publics dans l'indication
comparator,placebo,placebo,placebo
outcome_rea,positive,positive,positive


# Translate non-English output fields

In [14]:
# Instructions for the translation step.
# NOTE: this rebinds `instructions`, replacing the extraction instructions
# assigned earlier in the notebook.
# Each line is its own literal so the trailing spaces stay visible.
instructions = (
    '\n'
    'You will be given a JSON object with information extracted from a Health Technology Assessment document. \n'
    'If the value corresponding to any key is not in English, translate it into English.\n'
    '\n'
    'The output should be this modified JSON object with all values in English. Print only this JSON object, \n'
    'and no other text, code, or explanation.\n'
    '\n'
    'Return the output in valid JSON format.\n'
)

In [15]:
%%time

# Number of runs: one translation per extraction result stored in list_json,
# so the loop cannot index past the end of that list
num_runs = len(list_json)

# List to store result dataframes
list_df_translated = []

for run, extracted in enumerate(list_json):

    print(f'Starting run {run} ...')

    # Create prompt
    prompt = instructions + ' JSON object: ' + json.dumps(extracted)

    # Call the API
    response = get_completion(prompt)

    # Post-processing: keep the span from the first '{' to the last '}' so
    # that any text surrounding the JSON object is discarded
    result = re.search(r'\{.*\}', response, re.DOTALL)
    if result is None:
        # re.search returns None when the response holds no braces at all;
        # fail with a clear message instead of an AttributeError on .group()
        raise ValueError(
            f'Run {run}: no JSON object found in the model response: {response!r}'
        )
    json_data_string = result.group()

    # Read data into JSON object
    data = json.loads(json_data_string)

    # Output dataframe from this run (a single row, one column per key)
    df = pd.DataFrame([data])

    # Save dataframe to csv file
    df.to_csv(f'{output_path}/{filename}_run_{run}_translated.csv', index=False)

    # Append to list of dataframes
    list_df_translated.append(df)

    print(f'Finished run {run}.\n')

Starting run 0 ...
Finished run 0.

Starting run 1 ...
Finished run 1.

Starting run 2 ...
Finished run 2.

CPU times: user 29.1 ms, sys: 6.52 ms, total: 35.6 ms
Wall time: 45.3 s


In [16]:
# Concatenate the translated dataframes from different runs and transpose the
# result. pd.concat accepts the list directly, so no intermediate copy is needed.
df_concat = pd.concat(list_df_translated, ignore_index=True).T

# Display extracted parameters, translated

In [17]:
# Display the translated results from all runs (one row per extracted field, one column per run)
df_concat

Unnamed: 0,0,1,2
hta_id,Transparency Committee,Transparency Committee,Transparency Committee
assessment_type,indication broadening,indication broadening,indication broadening
internal_identifier,Opinion 2 following hearing - material corrections,Opinion 2 following hearing - material corrections,Opinion 2 following hearing - material corrections
inn,ivabradine,ivabradine,ivabradine
brand_name,PROCORALAN,PROCORALAN,PROCORALAN
assessment_date,"September 19, 2012","September 19, 2012","September 19, 2012"
indication,"Treatment of chronic heart failure of NYHA class II to IV with systolic dysfunction, in patients in sinus rhythm and whose heart rate is greater than or equal to 75 bpm, in combination with standard treatment including beta-blockers, or in case of contraindication or intolerance to beta-blockers","Treatment of chronic heart failure of NYHA class II to IV with systolic dysfunction, in patients in sinus rhythm and whose heart rate is greater than or equal to 75 bpm, in combination with standard treatment including beta-blockers, or in case of contraindication or intolerance to beta-blockers","Treatment of chronic heart failure of NYHA class II to IV with systolic dysfunction, in patients in sinus rhythm and whose heart rate is greater than or equal to 75 bpm, in combination with standard treatment including beta-blockers, or in case of contraindication or intolerance to beta-blockers"
final_recommendation,Favorable opinion for the inclusion on the lists of specialties reimbursable to social security insured persons and of medicines approved for use by communities and various public services in the indication,Favorable opinion for the inclusion on the lists of specialties reimbursable to social security insured persons and of medicines approved for use by communities and various public services in the indication,Favorable opinion for the inclusion on the lists of specialties reimbursable to social security insured persons and of medicines approved for use by communities and various public services in the indication
comparator,placebo,placebo,placebo
outcome_rea,positive,positive,positive
