# Import model

In [None]:
import torch
print(torch.cuda.is_available())
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face identifier of the chat model used for every prompt in this notebook
model_name = "BramVanroy/fietje-2-chat"

# Tokenizer that belongs to the model above
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal LM and place it on the GPU when CUDA is available, otherwise on the CPU
model = AutoModelForCausalLM.from_pretrained(model_name).to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

# Import datasets

In [None]:
import pandas as pd

#df_bin_examples = pd.read_csv('df_bin_examples_acsess.csv', sep= '\t')
#df_class_examples = pd.read_csv('df_class_examples_acsess.csv', sep= '\t')
# Tab-separated fold table; its 'fold' column is filtered and parsed by later cells
folds = pd.read_csv('folds.csv', sep='\t')

In [None]:
# Few-shot example tables: one (binary, class) pair of tab-separated files per fold 1..5.
# The generator yields the pairs in fold order, reading the binary file before the class file.
(
    (df_bin_examples_fold1, df_class_examples_fold1),
    (df_bin_examples_fold2, df_class_examples_fold2),
    (df_bin_examples_fold3, df_class_examples_fold3),
    (df_bin_examples_fold4, df_class_examples_fold4),
    (df_bin_examples_fold5, df_class_examples_fold5),
) = (
    (
        pd.read_csv(f'df_bin_examples_acsess_fold{fold_nr}.csv', sep='\t'),
        pd.read_csv(f'df_class_examples_acsess_fold{fold_nr}.csv', sep='\t'),
    )
    for fold_nr in range(1, 6)
)

In [None]:
# Example tables grouped per task; position 0 holds fold 1, position 4 holds fold 5
df_bin_folds = [
    df_bin_examples_fold1,
    df_bin_examples_fold2,
    df_bin_examples_fold3,
    df_bin_examples_fold4,
    df_bin_examples_fold5,
]
df_class_folds = [
    df_class_examples_fold1,
    df_class_examples_fold2,
    df_class_examples_fold3,
    df_class_examples_fold4,
    df_class_examples_fold5,
]

In [None]:
# Keep every row whose 'fold' value is not the literal string 'examples'
five_folds = folds[folds['fold'].ne('examples')]

In [None]:
import re

# Binary task

In [None]:
# Define the prompting function for the binary task (named "Llama", but it uses the fietje-2-chat model loaded above)
def LlamaBin(row, examples_list_folds):
    """Ask the model whether a sentence contains a WHO Performance Status.

    Builds a Dutch prompt (optionally preceded by few-shot examples), lets the
    module-level ``model`` generate until a newline token, and returns only the
    text the model produced after the prompt.

    Args:
        row: Mapping-like record with a ``'sentence'`` string and a ``'fold'``
            string that contains the 1-based fold number as digits.
        examples_list_folds: Sequence indexed by 0-based fold number; each item
            is a list of ``[example_text, answer]`` pairs. An empty list means
            no examples section is added to the prompt.

    Returns:
        str: The decoded continuation, stripped of surrounding whitespace. It
        is an empty string when the model generated nothing before stopping.

    Raises:
        AttributeError: If ``row['fold']`` contains no digits.
    """
    # 'fold' carries a 1-based number; the example lists are indexed from 0.
    fold_index = int(re.search(r'\d+', row['fold']).group()) - 1
    examples_list = examples_list_folds[fold_index]

    if len(examples_list) > 0:
        examples_string = "# Voorbeelden\n" + "".join(
            f"Patiënt {number}: {example[0]}\nAntwoord {number}: {example[1]}\n"
            for number, example in enumerate(examples_list, start=1)
        )
    else:
        examples_string = ""

    # Cap the sentence at 1000 characters to bound the prompt size.
    sentence = row['sentence'][:1000]

    prompt = f"""
    # Vraag
    Geef aan voor de volgende patiëntinformatie of deze een beschrijving van een WHO Performance Status bevat. 
    Een WHO Performance Status beschrijf de mate waarin een patiënt in staat is zelf activiteiten te ondernemen, 
    onder andere op het gebied van zelfzorg, huishoudelijk activiteit en lichaamsbeweging. 
    Geef antwoord '0' wanneer de patiëntinformatie geen Performance Status bevat. Geef antwoord '1' wanneer de patiëntinformatie wél een Performance Status bevat.

    {examples_string}
    
    # Patiëntinformatie 
    {sentence}

    # Antwoord
    Antwoord:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Stop at the first newline so only a single answer line is generated.
        outputs = model.generate(**inputs, max_length=2000, eos_token_id=tokenizer.encode('\n'))

    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation: parsing the full text could hand back
    # prompt text (including the digits in the instructions) whenever the
    # model's answer was empty or not preceded by ": ".
    prompt_length = inputs["input_ids"].shape[-1]
    new_label = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return new_label.strip()




In [None]:
# Results table for the binary task: the metadata columns of every evaluated row.
# .copy() makes it an independent frame, so the "k=<n>" prediction columns that
# are added afterwards are not written into a slice of `five_folds`
# (which would raise pandas' SettingWithCopyWarning).
acsess_bin_results = five_folds[['note_nr', 'sentence',
       'manual_sentence_labels', 'relevance_manual', 'round', 'annotator',
       'set', 'fold']].copy()

#acsess_bin_results = acsess_bin_results[:5]
#five_folds = five_folds[:5]


#acsess_bin_results = acsess_bin_results.groupby('fold').head(3)
#five_folds = five_folds.groupby('fold').head(3)

In [None]:
import re
import numpy as np

def extract_digit(string):
    """Return the first digit character in *string* as an int, or NaN when there is none."""
    found = re.search(r'\d', string)
    return int(found.group()) if found else np.nan


def perform_llama_binary(examples_list_folds, five_folds):
    """Run the binary prompt on every row and collect the parsed answers.

    Each row of *five_folds* is passed to ``LlamaBin`` together with
    *examples_list_folds*; the returned text is reduced to its first digit
    (or NaN) by ``extract_digit``. The result list follows the row order.
    """
    return [
        extract_digit(LlamaBin(record, examples_list_folds))
        for _, record in five_folds.iterrows()
    ]
    
    
# Run the binary task once per number of few-shot examples (k = 0..5) and
# store the predictions of each run in its own "k=<n>" column.
for examples_num in range(6):
    print("K=",examples_num)
    if examples_num == 0:
        # Zero-shot run: every fold gets an empty example list.
        examples_list_folds = [[] for _ in range(5)]
    else:
        # Per fold, keep the example rows whose 'k' equals the current k
        # and turn each into an [example, label] pair.
        examples_list_folds = [
            [
                [example_row['example'], example_row['label']]
                for _, example_row in fold_examples[fold_examples['k'] == examples_num].iterrows()
            ]
            for fold_examples in df_bin_folds
        ]

    new_labels = perform_llama_binary(examples_list_folds, five_folds)
    acsess_bin_results[f"k={examples_num}"] = new_labels

In [None]:
#print(acsess_bin_results)

In [None]:
#acsess_bin_results.to_csv('acsess_bin_results_folds.csv', sep='\t')

# Regression task (WHO Performance Status score 0-5, predicted as a class label)

In [None]:
# Define the prompting function for the score task (named "Llama", but it uses the fietje-2-chat model loaded above)
def LlamaClasses(row, examples_list_folds):
    """Ask the model for the WHO Performance Status score (0-5) of a sentence.

    Builds a Dutch prompt that lists the score definitions (optionally followed
    by few-shot examples), lets the module-level ``model`` generate until a
    newline token, and returns only the text the model produced after the
    prompt.

    Args:
        row: Mapping-like record with a ``'sentence'`` string and a ``'fold'``
            string that contains the 1-based fold number as digits.
        examples_list_folds: Sequence indexed by 0-based fold number; each item
            is a list of ``[example_text, answer]`` pairs. An empty list means
            no examples section is added to the prompt.

    Returns:
        str: The decoded continuation, stripped of surrounding whitespace. It
        is an empty string when the model generated nothing before stopping.

    Raises:
        AttributeError: If ``row['fold']`` contains no digits.
    """
    # 'fold' carries a 1-based number; the example lists are indexed from 0.
    fold_index = int(re.search(r'\d+', row['fold']).group()) - 1
    examples_list = examples_list_folds[fold_index]

    if len(examples_list) > 0:
        examples_string = "# Voorbeelden\n" + "".join(
            f"Patiënt {number}: {example[0]}\nAntwoord {number}: {example[1]}\n"
            for number, example in enumerate(examples_list, start=1)
        )
    else:
        examples_string = ""

    # Cap the sentence at 1000 characters to bound the prompt size.
    sentence = row['sentence'][:1000]

    prompt = f"""
    # Vraag
    Geef een WHO Performance Status score van een patiënt. De waardes zijn als volgt: 
    * 0: Volledig actief, in staat om alle voorziekte prestaties zonder beperking uit te voeren.
    * 1: Beperkt in fysiek inspannende activiteit, maar ambulant en in staat om werk van lichte of sedentaire aard te verrichten, bijv. Licht huiswerk, kantoorwerk.
    * 2: Ambulant en geschikt voor alle zelfzorg, maar niet in staat om enige werkactiviteiten uit te voeren. Tot en met meer dan 50% van de wekelijkse uren.
    * 3: Alleen beperkte zelfverzorging, beperkt tot bed of meer dan 50% wekelijkse uren.
    * 4: Kan geen zelfzorg uitvoeren. Heel beperkt tot bed of stoel.
    * 5: Dood
    
    {examples_string}
    
    # Patiëntinformatie 
    {sentence}

    # Antwoord
    Antwoord:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Stop at the first newline so only a single answer line is generated.
        outputs = model.generate(**inputs, max_length=5000, eos_token_id=tokenizer.encode('\n'))

    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation: the prompt itself is full of ": " and
    # digits (the score list, the sentence), so parsing the full text could
    # return prompt text whenever the model's answer was empty or not
    # preceded by ": ".
    prompt_length = inputs["input_ids"].shape[-1]
    new_label = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return new_label.strip()




In [None]:
# Results table for the score task: metadata columns of the rows whose
# 'relevance_manual' equals 1, renumbered from 0.
acsess_class_results = (
    five_folds[['note_nr', 'sentence',
                'manual_sentence_labels', 'relevance_manual', 'round', 'annotator',
                'set', 'fold']]
    .loc[lambda frame: frame['relevance_manual'] == 1]
    .reset_index(drop=True)
)

# Same row selection on the full frame; these are the rows that get prompted.
five_folds_classes = (
    five_folds
    .loc[lambda frame: frame['relevance_manual'] == 1]
    .reset_index(drop=True)
)

#acsess_class_results = acsess_class_results.groupby('fold').head(3)
#five_folds_classes = five_folds_classes.groupby('fold').head(3)

#print(len(acsess_class_results))
#print(len(five_folds_classes))



In [None]:
# Prints an empty line; this cell produces no other output.
print()

In [None]:
import re
import numpy as np

def extract_digit(string):
    """Pull the first digit out of *string*.

    Returns that digit as an int, or ``np.nan`` when *string* has no digit.
    """
    hit = re.search(r'\d', string)
    if hit is None:
        return np.nan
    return int(hit.group())


def perform_llama_classes(examples_list_folds, five_folds_classes):
    """Run the score prompt on every row and collect the parsed answers.

    Each row of *five_folds_classes* is passed to ``LlamaClasses`` together
    with *examples_list_folds*; the returned text is reduced to its first
    digit (or NaN) by ``extract_digit``. The result list follows the row order.
    """
    return [
        extract_digit(LlamaClasses(record, examples_list_folds))
        for _, record in five_folds_classes.iterrows()
    ]
    
    
# Run the score task once per number of few-shot examples (k = 0..5) and
# store the predictions of each run in its own "k=<n>" column.
for examples_num in range(6):
    print("K=",examples_num)

    if examples_num == 0:
        # Zero-shot run: every fold gets an empty example list.
        examples_list_folds = [[] for _ in range(5)]
    else:
        # Per fold, keep the example rows whose 'k' equals the current k
        # and turn each into an [example, label] pair.
        examples_list_folds = [
            [
                [example_row['example'], example_row['label']]
                for _, example_row in fold_examples[fold_examples['k'] == examples_num].iterrows()
            ]
            for fold_examples in df_class_folds
        ]

    new_labels = perform_llama_classes(examples_list_folds, five_folds_classes)
    print('len results df:', len(acsess_class_results))
    acsess_class_results[f"k={examples_num}"] = new_labels

In [None]:
#print(len(five_folds_classes))

In [None]:
#print(acsess_class_results)

In [None]:
#acsess_bin_results.to_csv('acsess_bin_results_folds_new_examples.csv', sep='\t')
# Write the score-task results table to a tab-separated file
acsess_class_results.to_csv(
    'acsess_class_results_folds_new_examples.csv',
    sep='\t',
)

In [None]:
#acsess_bin_results.to_csv('acsess_bin_results_folds_new_examples.csv', sep='\t')