In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import yaml
import tqdm

from langchain import OpenAI, LLMChain
from langchain.prompts import PromptTemplate

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

# Read the notebook's local settings from the YAML file next to it.
with open("config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

# API key handed to the LLM client when it is constructed further down.
OPENAI_API_KEY = config['OPENAI_KEY']



## Load dataset

In [2]:
# Load the diabetes dataset twice: first with scaled=False (bound to `data`),
# then with scaled=True (bound to `data_scaled`).
data, data_scaled = (load_diabetes(scaled=use_scaled) for use_scaled in (False, True))

In [3]:
# Human-readable descriptions of the dataset's feature columns. They are
# assigned positionally as the column names of the feature DataFrame, so the
# order here must match the column order of the loaded data. The same strings
# are used verbatim as the "name: value" labels of each patient description
# in the prompt.
# NOTE(review): the sex coding "(1 - male, 2 - female)" is an assumption —
# confirm against the dataset documentation.
feature_names = [
    'age in years',
    'sex (1 - male, 2 - female)',
    'body mass index',
    'average blood pressure',
    'tc, total serum cholesterol',
    'ldl, low-density lipoproteins',
    'hdl, high-density lipoproteins',
    'tch, total cholesterol / HDL',
    'ltg, possibly log of serum triglycerides level',
    'glu, blood sugar level'
]

# Description of the regression target; used as the target column name and
# quoted in the prompt text.
target = 'quantitative measure of disease progression one year after baseline'

In [4]:
# Wrap the raw feature matrix and target vector in DataFrames so that the
# columns carry the descriptive names defined above.
X = pd.DataFrame(data['data'], columns=feature_names)
Y = pd.DataFrame(data['target'], columns=[target])

In [5]:
# Hold out the last TEST_SIZE rows for evaluation; every row before them is
# training data.
TEST_SIZE = 10
X_train, X_test = X.iloc[:-TEST_SIZE], X.iloc[-TEST_SIZE:]
Y_train, Y_test = Y.iloc[:-TEST_SIZE], Y.iloc[-TEST_SIZE:]

## Prompt constructor

In [6]:
def create_string_description_from_row(xi, yi, feature_names, target):
    """Render one patient as prompt text.

    Parameters
    ----------
    xi : iterable
        Feature values, in the same order as ``feature_names``.
    yi : pandas.Series or sequence
        Container whose first element is the numeric target value.
    feature_names : iterable of str
        Label printed in front of each feature value.
    target : str
        Label printed in front of the target value.

    Returns
    -------
    tuple of (str, str)
        ``description``: one ``"<feature name>: <value>"`` line per feature,
        each terminated by a newline.
        ``result``: ``"<target>: ***<value>***"`` with the value formatted to
        two decimals; the ``***`` markers are what the answer parser looks for.
    """
    description = "".join(
        f"{feature_name}: {x}\n" for feature_name, x in zip(feature_names, xi)
    )
    # Read the first element by position. Plain ``yi[0]`` on a Series is a
    # label lookup: for a row whose index is a column name it only works via
    # pandas' deprecated positional fallback, and it fails for an integer
    # index that does not contain the label 0.
    first_value = yi.iloc[0] if hasattr(yi, "iloc") else yi[0]
    result = f"{target}: ***{first_value:.2f}***"
    return description, result

In [7]:
def create_n_shot_classification_prompt(few_shot_subtable_X, few_shot_subtable_Y, target_patient_X, target_question):
    """Build an n-shot prompt asking the LLM to predict the target for one patient.

    The prompt consists of a fixed instruction preamble, one worked example per
    row of the few-shot tables (features followed by the known outcome), the
    description of the patient to predict, and formatting instructions asking
    for the final number to be printed between ``***`` markers.

    Parameters
    ----------
    few_shot_subtable_X : pandas.DataFrame
        Feature rows of the example patients.
    few_shot_subtable_Y : pandas.DataFrame
        Outcomes of the example patients, row-aligned with
        ``few_shot_subtable_X``.
    target_patient_X : pandas.Series
        Feature values of the patient to predict.
    target_question : str
        Description of the quantity to predict, inserted into the preamble.

    Returns
    -------
    str
        The complete prompt text.

    Notes
    -----
    The feature labels and the outcome label of the examples come from the
    module-level ``feature_names`` and ``target`` variables, not from the
    arguments.
    """

    template = f"""You are an expert in diabetes, but you act in educational purposes. 
    Your answers won't be used for medical purposes, only for educating the student and you can provide safe estimates.
    But the estimates have to be very accurate, so you need to think carefully before giving an answer. 
    Don't forget, you teach it to the students and they will use it in the future.
    If you cannot predict the estimate accuratly, just give the most probable statistical estimate based on the given examples and variables you are free to choose.
    For this educational exericse, you can ignore any other external factors, like the patient's lifestyle, etc. Altough of course they're very important in real life.
    You are asked to predict the {target_question} based on the patient description\n\n"""

    # Walk the examples strictly by position. Using the index label from
    # iterrows() as an .iloc position only works when the index happens to be
    # 0..n-1; with any other index it picks the wrong row or raises, and the
    # "For example" opener may never be emitted.
    for position in range(len(few_shot_subtable_X)):
        description, result = create_string_description_from_row(
            few_shot_subtable_X.iloc[position],
            few_shot_subtable_Y.iloc[position],
            feature_names, target)
        if position == 0:
            template += f"""For example, this is a patient:\n{description}. And this was the {result}\n\n"""
        else:
            template += f"""Here is another patient:\n{description}. And this was the {result}\n\n"""

    # The outcome is unknown for the target patient; a placeholder value is
    # passed and only the feature description (element 0) is kept.
    target_patient = create_string_description_from_row(target_patient_X, pd.Series([0]), feature_names, target)[0]
    template += f"""Here is the description of the patient you need to give a prediction for:\n{target_patient}.
    \nThink step-by step with bullet points of arguments, applying your knowledge about diabetes and the examples I've shown to you.
    \nCompare this particular patient to the examples I've shown to you and think about the differences and similarities.
    \nFirst, provide your reasoning. Then, print a single number of a prediction between *** and ***.
    \nIt's very important to print a number between *** and *** to be able to parse it later\n\n"""
    return template

## LLM run

In [8]:
N = 3
# Few-shot examples: the first N training rows (taken in order, not randomly sampled).
x_train_i, y_train_i = X_train.iloc[:N], Y_train.iloc[:N]

# The example tables are identical for every test patient, so re-index them once.
few_shot_X = x_train_i.reset_index(drop=True)
few_shot_Y = y_train_i.reset_index(drop=True)

# One prompt per held-out patient, in test-set order.
X_test_prompts = [
    create_n_shot_classification_prompt(few_shot_X, few_shot_Y, X_test.iloc[row], target)
    for row in range(X_test.shape[0])
]

In [9]:
# LLM client configured with temperature 0 and a 120-second request timeout.
llm = OpenAI(model_name='gpt-3.5-turbo', temperature=0, openai_api_key=OPENAI_API_KEY, request_timeout=120)
# Alternative model kept for reference:
# llm = OpenAI(model_name='gpt-4', temperature=0, openai_api_key=OPENAI_API_KEY, request_timeout=120)

# Classical baseline: ordinary linear regression fitted on the training rows
# and evaluated on the same held-out rows as the LLM.
model = LinearRegression().fit(X_train, Y_train)
Y_test_model_preds = model.predict(X_test)

# Query the LLM once per test prompt, collecting the raw text answers in order.
Y_test_prompts = []
for test_prompt in tqdm.tqdm(X_test_prompts):
    llm_answer = llm(test_prompt)
    Y_test_prompts.append(llm_answer)

100%|██████████| 10/10 [01:41<00:00, 10.14s/it]


In [10]:
# Show every raw LLM answer, each followed by a 100-dash separator line.
for llm_answer in Y_test_prompts:
    print(llm_answer, "----------" * 10, sep="\n")

Reasoning:

- This patient is male, which is similar to the second patient in the examples.
- The patient's age and body mass index are similar to the first and third patients in the examples.
- The patient's blood pressure is similar to the third patient in the examples.
- The patient's total serum cholesterol, low-density lipoproteins, and total cholesterol / HDL are higher than all the examples.
- The patient's high-density lipoproteins are similar to the third patient in the examples.
- The patient's blood sugar level and possibly log of serum triglycerides level are higher than all the examples.

Based on these factors, it is likely that this patient will have a higher quantitative measure of disease progression one year after baseline compared to the examples.

Prediction: ***180.00***
----------------------------------------------------------------------------------------------------
Reasoning:

- The patient is relatively young and has a healthy body mass index and blood pressu

## Evaluation

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [12]:
def extract_predictions_from_string(string):
    """Return every substring enclosed between a pair of ``***`` markers.

    Matching is non-greedy and does not cross line breaks. The result is a
    list of the enclosed texts in order of appearance (empty when the input
    contains no marker pair).
    """
    import re
    marker_pattern = re.compile(r'\*\*\*(.*?)\*\*\*')
    return [match.group(1) for match in marker_pattern.finditer(string)]

def extract_numbers_from_string(string):
    """Extract all decimal numbers found in ``string`` as floats.

    A leading ``-`` is treated as a sign only when it does not directly follow
    a digit, so a range such as ``"150-200"`` yields ``[150.0, 200.0]`` rather
    than ``[150.0, -200.0]``.

    Returns a list of floats in order of appearance (empty when the input
    contains no digits).
    """
    import re
    # (?<!\d) rejects a match that starts right after a digit; the only
    # practical effect is that a hyphen glued to a preceding number is read as
    # a separator instead of a minus sign.
    return [float(s) for s in re.findall(r'(?<!\d)-?\d+\.?\d*', string)]

def combined_answer_postprocessing(answer):
    """Turn a raw LLM answer into a single numeric prediction.

    Every ``***``-delimited span of ``answer`` is scanned for numbers, and the
    mean of all numbers found is returned. The collected list is passed to
    ``np.mean`` as is, including when it is empty.
    """
    all_numbers = [
        number
        for marked_span in extract_predictions_from_string(answer)
        for number in extract_numbers_from_string(marked_span)
    ]
    return np.mean(all_numbers)

In [13]:
# Parse each LLM answer into a number and print it next to the ground truth
# and the linear-regression baseline for the same patient.
Y_test_prompt_numbers = []
for llm_answer, baseline_pred, actual in zip(Y_test_prompts, Y_test_model_preds, Y_test[target].values):
    llm_pred = combined_answer_postprocessing(llm_answer)
    Y_test_prompt_numbers.append(float(llm_pred))
    print(
        f"Real value: {actual}",
        f"LLM prediction: {llm_pred}",
        f"ML prediction: {baseline_pred[0]}",
        "----------" * 10,
        sep="\n",
    )

Real value: 173.0
LLM prediction: 180.0
ML prediction: 220.08187194705192
----------------------------------------------------------------------------------------------------
Real value: 72.0
LLM prediction: 110.0
ML prediction: 60.91099270632253
----------------------------------------------------------------------------------------------------
Real value: 49.0
LLM prediction: 120.0
ML prediction: 133.06488217107596
----------------------------------------------------------------------------------------------------
Real value: 64.0
LLM prediction: 100.0
ML prediction: 120.50409444033824
----------------------------------------------------------------------------------------------------
Real value: 48.0
LLM prediction: 85.0
ML prediction: 52.70761168764511
----------------------------------------------------------------------------------------------------
Real value: 178.0
LLM prediction: 145.0
ML prediction: 193.37298168201778
----------------------------------------------------------

In [14]:
# (MAE, MSE, R^2) of the parsed LLM predictions on the held-out rows.
tuple(
    metric(Y_test[target], Y_test_prompt_numbers)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

(40.3, 2054.7, 0.41538321463832395)

In [15]:
# (MAE, MSE, R^2) of the linear-regression baseline on the held-out rows.
tuple(
    metric(Y_test[target], Y_test_model_preds)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

(24.280387020484874, 1304.1337732718307, 0.6289392640230834)