# Import packages and data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from vllm import LLM, SamplingParams
from datetime import datetime
from datetime import date

  from .autonotebook import tqdm as notebook_tqdm




2024-12-10 15:25:59,692	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:

# Load the raw, semicolon-separated export.
# `DF` stays untouched as the raw reference frame; `df_0` is an independent
# working copy. `pd.DataFrame(DF)` would not copy the underlying data, so an
# explicit `.copy()` is used instead.
DF = pd.read_csv('Testdata_empty.csv', sep=';')
df_0 = DF.copy()

# Preview only the first rows instead of dumping the whole frame.
df_0.head()

Unnamed: 0,PatientAccountID,MPINumber,AssessmentID,Source,CollectedDT,Finding,FindingName,FindingDataType,Value,AdditionalInformation


## Data preparation

In [3]:
# strptime layouts used below: `timeformat` for the `CollectedDT` column
# (note the two spaces before the hour) and `birthdayformat` for the `Value`
# of rows whose `Finding` is "Geburtsdatum".
timeformat = "%b %d %Y  %I:%M%p"
birthdayformat = "%m/%Y"


def process_dataframe(df):
    """Return a new frame with parsed timestamps plus `Birthday` and `Age` columns.

    The input frame is not modified. Required columns: `CollectedDT`,
    `Finding`, `Value` and `MPINumber`.

    Steps:
      1. `CollectedDT` is parsed with `timeformat`; unparsable values become NaT.
      2. Rows with `Finding == "Geburtsdatum"` supply one birthday per
         `MPINumber` (parsed with `birthdayformat`).
      3. The birthdays are left-merged onto all rows via `MPINumber`.
      4. `Age` is the number of whole days between `CollectedDT` and
         `Birthday`, floor-divided by 365.25 (a float; NaN when either date
         is missing).

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the same number of rows and the additional columns
        `Birthday` and `Age`.
    """
    # Work on a copy so the caller's frame keeps its original `CollectedDT`.
    result = df.copy()
    result['CollectedDT'] = pd.to_datetime(result['CollectedDT'], format=timeformat, errors='coerce')

    is_birthday_row = result['Finding'] == "Geburtsdatum"
    birthdays = result.loc[is_birthday_row, ['MPINumber', 'Value']].copy()
    birthdays = birthdays.rename(columns={"Value": "Birthday"})
    birthdays['Birthday'] = pd.to_datetime(birthdays['Birthday'], format=birthdayformat, errors='coerce')

    # Keep exactly one usable birthday per patient. Without this, a patient
    # with several distinct birthday rows would multiply every one of their
    # rows in the merge below.
    birthdays = birthdays.dropna(subset=['Birthday']).drop_duplicates(subset='MPINumber')

    merged = pd.merge(result, birthdays, how="left", on="MPINumber")
    merged['Age'] = ((merged['CollectedDT'] - merged['Birthday']).dt.days) // 365.25
    return merged


# Build the working frame from the raw import `DF` so that this cell can be
# re-run safely: every run starts from the unmodified data instead of from a
# `df_0` whose columns were already removed by a previous run.
df_0 = process_dataframe(DF.copy())

# Delete unnecessary columns (`Birthday` was only needed to derive `Age`).
df_0 = df_0.drop(columns=['FindingName', 'FindingDataType', 'PatientAccountID', 'AssessmentID', 'Birthday'])

# Avoid Value Errors due to Na / NaN values
# NOTE(review): this replaces missing entries in every column with 0,
# including unparsed `CollectedDT` values and unknown `Age` - confirm that a
# numeric 0 is the intended placeholder there.
df_0 = df_0.fillna(0)


print(df_0.head())


Empty DataFrame
Columns: [Source, CollectedDT, Finding, Value, AdditionalInformation, MPINumber, Age]
Index: []


In [4]:
# Filter the dataframe for patients with relevant diagnosis/ ICD10-Code

# Choose relevant diagnosis
ICD_10_Code = 'H36'

# Filter the data
# `Value` can hold non-string entries (missing values were filled with 0), so
# it is cast to str before matching; `regex=False` matches the code literally.
is_diagnosis = df_0['Finding'] == 'Diagnose'
has_code = df_0['Value'].astype(str).str.contains(ICD_10_Code, regex=False)
diagnosed_patients = df_0.loc[is_diagnosis & has_code, 'MPINumber'].unique()

# Create new data frame containing only the relevant patients
# (all rows of every patient with at least one matching diagnosis).
df_P = df_0[df_0['MPINumber'].isin(diagnosed_patients)]


# Choose your LLM

In [5]:
# Load the model once; later cells reuse `llm` for the tokenizer and for
# generation. Integer division keeps `max_model_len` an int (8192) - a plain
# `/` would pass the float 8192.0.
llm = LLM("TheBloke/Mistral-7B-Instruct-v0.2-AWQ", max_model_len=32768 // 4, dtype="auto", quantization="awq")

# Alternative models - activate exactly one `llm = ...` line:
# llm = LLM("casperhansen/llama-3-8b-instruct-awq", max_model_len=32768 // 4, dtype="auto", quantization="awq")

# llm = LLM("TheBloke/CodeLlama-7B-Instruct-AWQ", max_model_len=32768 // 4, dtype="half", quantization="awq")

INFO 12-10 15:26:06 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 12-10 15:26:06 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='TheBloke/Mistral-7B-Instruct-v0.2-AWQ', speculative_config=None, tokenizer='TheBloke/Mistral-7B-Instruct-v0.2-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=TheBlo

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.23it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.23it/s]



INFO 12-10 15:27:50 model_runner.py:1077] Loading model weights took 3.8814 GB
INFO 12-10 15:27:52 worker.py:232] Memory profiling results: total_gpu_memory=15.74GiB initial_memory_usage=5.14GiB peak_torch_memory=4.73GiB memory_usage_post_profile=5.19GiB non_torch_memory=1.30GiB kv_cache_size=8.13GiB gpu_memory_utilization=0.90
INFO 12-10 15:27:52 gpu_executor.py:113] # GPU blocks: 4163, # CPU blocks: 2048
INFO 12-10 15:27:52 gpu_executor.py:117] Maximum concurrency for 8192 tokens per request: 8.13x
INFO 12-10 15:27:55 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-10 15:27:55 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 12-

In [6]:
# Get tokenizer for chat template
# The tokenizer is handed to the prompting-template functions, which call its
# `apply_chat_template` method to render a conversation into one prompt string.


# Use this for Mistral and Llama3 (tokenizer taken from the loaded model)

tokenizer = llm.get_tokenizer()


# Use this for CodeLlama (separate tokenizer; uncomment both lines)

#from transformers import AutoTokenizer
#tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf", trust_remote_code=True)

# Questions

## B50

In [7]:
# Question set B: seven natural-language tasks, several of which refer to the
# diagnosis codes B50-B54 and A97. One of these strings is selected as
# `Question` further down and becomes the final user turn of the prompt.
Question_B_1 = "How many patients are there?"
Question_B_2 = "Plot the gender distribution in a pie chart, include percentages."
Question_B_3 = "Plot a bar graph for the number of patients per age. Remember to only count each patient/MPINumber once."
Question_B_4 = "Count the number of patients for each unique diagnosis. Plot the 5 most common diagnoses except B50, B51, B52, B53, B54 and A97 in a bar graph."
Question_B_5 = "Count the number of patients that got diagnosed with both (B50, B51, B52, B53 or B54) and also A97."
Question_B_6 = "Plot the gender distribution of all patients that got diagnosed with B50, B51, B52, B53, B54 in a pie chart, include percentages. Then, repeat for all patients that got diagnosed with A97"
Question_B_7 = "Plot the age of all patients that got diagnosed with B50, B51, B52, B53, B54 in a bar chart. Then repeat for all patients that got diagnosed with A97"

## H36

In [8]:
# Question set H: seven natural-language tasks around the diagnosis code H36
# and HBA1CNV lab results. The threshold lists in H_6/H_7 are written as plain
# numbers (4.5, 5.7, 6.5, 7.5), matching the German and data-science variants;
# the earlier "5.7., 6.5." carried stray periods.
Question_H_1 = "How many patients are there?"
Question_H_2 = "Plot the gender distribution in a pie chart, include percentages."
Question_H_3 = "Plot a bar graph for the number of patients per age. Remember to only count each patient/MPINumber once."
Question_H_4 = "Count the number of patients for each unique diagnosis. Plot the 5 most common diagnoses except H36 in a bar graph. "
Question_H_5 = "How many HBA1CNV lab results does each patient have? Plot the number of patients that had each number of lab results in a bar graph."
Question_H_6 = "Count HBA1CNV lab results for each patient. Plot the measurement values for patients with exactly 5 HBA1CNV lab results in boxplots. Then plot dotted horizontal lines for the values 4.5, 5.7, 6.5 and 7.5."
Question_H_7 = "Count HBA1CNV lab results for each patient. Plot the measurement values for patients with exactly 5 HBA1CNV lab results over time. Then plot dotted horizontal lines for the values 4.5, 5.7, 6.5 and 7.5."

## German

In [9]:
# German translations of the H question set. Corrected against the English
# originals: "für" instead of the English "for" (H_3), the German plural
# "Diagnosen" (H_4), and a boxplot rather than a bar chart in H_6 (the English
# H_6 asks for boxplots).
Question_H_1_German = "Wie viele Patienten gibt es?"
Question_H_2_German = "Zeichne die Geschlechterverteilung in ein Tortendiagramm, beziehe Prozentzahlen mit ein."
Question_H_3_German = "Zeichne ein Balkendiagramm für die Anzahl an Patienten pro Alter. Denke daran jeden Patienten/MPINumber nur einmal zu zählen."
Question_H_4_German = "Zähle die Anzahl der Patienten für jede einzigartige Diagnose. Zeichne die 5 häufigsten Diagnosen ausser H36 in einem Balkendiagramm."
Question_H_5_German = "Wie viele HBA1CNV Laborwerte hat jeder Patient? Zeichne die Anzahl an Patienten die die jeweilige Anzahl an Laborwerte hatten in einem Balkendiagramm."
Question_H_6_German = "Zähle die HBA1CNV Laborwerte für jeden Patient. Zeichne die Messwerte für Patienten mit exakt 5 HBA1CNV Laborwerten in einem Boxplot. Zeichne dann gepunktete horizontale Linien für die Werte 4.5, 5.7, 6.5 und 7.5."
Question_H_7_German = "Zähle die HBA1CNV Laborwerte für jeden Patient. Zeichne die Messwerte für Patienten mit exakt 5 HBA1CNV Laborwerten über die Zeit. Zeichne dann gepunktete horizontale Linien für die Werte 4.5, 5.7, 6.5 und 7.5."

## Reproducibility

### Phrasing - Keyword

In [10]:
# Keyword phrasing of the H question set: each task reduced to a short noun
# phrase, used for the phrasing/reproducibility comparison.
Question_H_1_Keyword = "Number of patients."
Question_H_2_Keyword = "Gender distribution in pie graph."
Question_H_3_Keyword = "Age distribution in bar graph."
Question_H_4_Keyword = "Five most common diagnoses besides H36 in bar graph"
Question_H_5_Keyword = "HBA1CNV lab results per patient"
Question_H_6_Keyword = "HBA1CNV values for patients with 5 HBA1CNV lab results in boxplot."
Question_H_7_Keyword = "HBA1CNV values for patients with 5 HBA1CNV values over time."

###  Phrasing - Short

In [11]:
# Short phrasing of the H question set: one brief imperative sentence per task.
# NOTE(review): H_4_Short asks for a "boxplot" and omits "most common", while
# every other variant of question 4 asks for a bar graph of the five most
# common diagnoses - confirm whether this wording is intended.
Question_H_1_Short = "How many different patients are there?"
Question_H_2_Short = "Plot a gender pie chart with percentages."
Question_H_3_Short = "Plot an age bar graph."
Question_H_4_Short = "Plot a boxplot for the five side diagnoses besides H36."
Question_H_5_Short = "Plot how many patients had how many HBA1CNV lab results."
Question_H_6_Short = "Plot the HBA1CNV values for patients with 5 HBA1CNV lab results in a boxplot."
Question_H_7_Short = "Plot the HBA1CNV values for patients with 5 HBA1CNV lab results over time."

### Phrasing - Data science

In [12]:
# Data-science phrasing of the H question set: explicit, step-wise task
# descriptions. Spelling errors in the prompt text were corrected
# ("occurrences", "measurements", "separate"); the wording is otherwise
# unchanged.
Question_H_1_Data_science = "How many different patients does the dataset contain?"
Question_H_2_Data_science = "Plot the distribution of gender for all patients in a pie chart. Include percentages in the pie chart."
Question_H_3_Data_science = "Plot the age of the patients in a bar graph, one bar for each age with the number of patients with that age as the height."
Question_H_4_Data_science = "How often does each diagnosis occur in the dataset? Plot the number of occurrences for the five most common diagnoses in a bar graph while excluding the diagnosis H36 "
Question_H_5_Data_science = "How often did each patient get a HBA1CNV measurement? Plot the number of measurements in a bar graph, with the number of patients that had that exact number of measurements as the height."
Question_H_6_Data_science = "How often did each patient get a HBA1CNV measurement? Plot the measurement values for every patient with exactly 5 lab results in separate boxplots. Then add dotted horizontal lines at the heights 4.5, 5.7, 6.5 and 7.5 to the graph."
Question_H_7_Data_science = "How often did each patient get a HBA1CNV measurement? Plot the measurement values for every patient with exactly 5 lab results as a line graph over time. Then add dotted horizontal lines at the heights 4.5, 5.7, 6.5 and 7.5 to the graph."

### Phrasing - Medical

In [13]:
# Medical phrasing of the H question set: the tasks worded without column
# names or codes (e.g. "blood sugar" and "Retinopathia diabetica" where the
# other variants say HBA1CNV and H36).
Question_H_1_Medical = "On how many patients does data exist?"
Question_H_2_Medical = "Show me the ratio between male and female patients in a round graph, including percentages."
Question_H_3_Medical = "Show me the age of all patients in a graph."
Question_H_4_Medical = "What were the five most common side diagnoses besides Retinopathia diabetica? Show me their number in a graph."
Question_H_5_Medical = "How often did each patient get their blood sugar measured in the lab? Show me how many patients got how many blood sugar lab results."
Question_H_6_Medical = "Show me the blood sugar values for patients that got exactly five blood sugar lab results."
Question_H_7_Medical = "Show me how the blood sugar values change over time for patients that got exactly five blood sugar lab results."

# Prompting templates

In [14]:
def query_template_zero(df: pd.DataFrame, query: str, tokenizer):
    """Render a zero-shot prompt (no worked examples) for `query`.

    The conversation has three turns: a user turn with the task instructions
    plus `df.head(20)` and `df.columns`, an assistant turn that contains only
    a newline, and a final user turn holding `query`.

    Parameters
    ----------
    df : frame whose first twenty rows and columns are embedded in the prompt.
    query : question placed in the last user turn.
    tokenizer : object providing `apply_chat_template`.

    Returns
    -------
    The value of `tokenizer.apply_chat_template(...)` called with
    `add_generation_prompt=True` and `tokenize=False`.
    """
    # The prompt text contains whitespace-only lines and trailing spaces;
    # they are spelled out explicitly so they stay visible.
    pad = " " * 12
    context = "\n".join([
        "",
        "Your job is to always write python code for the given dataframe `df` using pandas, numpy and matplot library.",
        "Never execute the code.",
        pad,
        "The first twenty rows of the dataframe are:",
        f"{df.head(20)}",
        pad,
        "The columns of the dataframe are:",
        f"{df.columns}",
        "",
        "Every patient has a unique `MPINumber`." + " " * 9,
        "",
    ])

    conversation = [
        dict(role="user", content=context),
        dict(role="assistant", content="\n"),
        dict(role="user", content=query),
    ]
    return tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

In [15]:
def query_template_zero_chain(df: pd.DataFrame, query: str, tokenizer):
    """Render a zero-shot prompt that starts with "Always think step by step.".

    The conversation has three turns: a user turn with the step-by-step
    instruction, the task instructions, `df.head(20)` and `df.columns`; an
    assistant turn that contains only a newline; and a final user turn
    holding `query`.

    Parameters
    ----------
    df : frame whose first twenty rows and columns are embedded in the prompt.
    query : question placed in the last user turn.
    tokenizer : object providing `apply_chat_template`.

    Returns
    -------
    The value of `tokenizer.apply_chat_template(...)` called with
    `add_generation_prompt=True` and `tokenize=False`.
    """
    # Whitespace-only lines and trailing spaces of the prompt are spelled out
    # explicitly so they stay visible.
    pad = " " * 12
    context = "\n".join([
        "",
        "Always think step by step.",
        "Your job is to always write python code for the given dataframe `df` using pandas, numpy and matplot library.",
        "Never execute the code.",
        pad,
        "The first twenty rows of the dataframe are:",
        f"{df.head(20)}",
        pad,
        "The columns of the dataframe are:",
        f"{df.columns}",
        "",
        "Every patient has a unique `MPINumber`." + " " * 9,
        "",
    ])

    conversation = [
        dict(role="user", content=context),
        dict(role="assistant", content="\n"),
        dict(role="user", content=query),
    ]
    return tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

In [16]:
def query_template_one(df: pd.DataFrame, query: str, tokenizer):
    """Render a one-shot prompt (one worked example) for `query`.

    Turns, in order: user context (task instructions, `df.head(20)`,
    `df.columns`), an assistant turn containing only a newline, one example
    user request with its example assistant answer (three fenced code
    snippets), and finally the user turn holding `query`.

    Parameters
    ----------
    df : frame whose first twenty rows and columns are embedded in the prompt.
    query : question placed in the last user turn.
    tokenizer : object providing `apply_chat_template`.

    Returns
    -------
    The value of `tokenizer.apply_chat_template(...)` called with
    `add_generation_prompt=True` and `tokenize=False`.
    """
    # Whitespace-only lines and trailing spaces of the prompt are spelled out
    # explicitly so they stay visible.
    pad = " " * 12
    context = "\n".join([
        "",
        "Your job is to always write python code for the given dataframe `df` using pandas, numpy and matplot library.",
        "Never execute the code.",
        pad,
        "The first twenty rows of the dataframe are:",
        f"{df.head(20)}",
        pad,
        "The columns of the dataframe are:",
        f"{df.columns}",
        "",
        "Every patient has a unique `MPINumber`." + " " * 10,
        "",
    ])

    # Worked example: request ...
    example_request = "\n".join([
        "",
        "Count the number of patients.",
        "Then, count the number of diagnoses. ",
        "Then, count the number of patients with a X00 diagnosis.",
        "",
    ])

    # ... and the answer the model is shown for it.
    example_answer = "\n".join([
        "",
        "```",
        "# Number of patients",
        "print(len(df['MPINumber'].unique()))",
        "``` ",
        "",
        "```",
        "# Count diagnoses",
        "df_diagnoses = df[df['Finding'] == 'Diagnose']",
        "print(len(df_diagnoses))",
        "```",
        "",
        "```",
        "# Count X00 patients",
        "df_X00 = df[(df['Finding'].str.contains('Diagnose')) & (df['Value'].str.contains('X00'))]['MPINumber'].unique()",
        "print(len(df_X00))",
        "```" + pad,
        "",
    ])

    conversation = [
        dict(role="user", content=context),
        dict(role="assistant", content="\n"),
        dict(role="user", content=example_request),
        dict(role="assistant", content=example_answer),
        dict(role="user", content=query),
    ]
    return tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

In [17]:
def query_template_one_chain(df: pd.DataFrame, query: str, tokenizer):
    """Render a one-shot prompt that starts with "Always think step by step.".

    Turns, in order: user context (step-by-step instruction, task
    instructions, `df.head(20)`, `df.columns`), an assistant turn containing
    only a newline, one example user request with its example assistant
    answer (three fenced code snippets), and finally the user turn holding
    `query`.

    Parameters
    ----------
    df : frame whose first twenty rows and columns are embedded in the prompt.
    query : question placed in the last user turn.
    tokenizer : object providing `apply_chat_template`.

    Returns
    -------
    The value of `tokenizer.apply_chat_template(...)` called with
    `add_generation_prompt=True` and `tokenize=False`.
    """
    # Whitespace-only lines and trailing spaces of the prompt are spelled out
    # explicitly so they stay visible.
    pad = " " * 12
    context = "\n".join([
        "",
        "Always think step by step." + pad,
        "Your job is to always write python code for the given dataframe `df` using pandas, numpy and matplot library.",
        "Never execute the code.",
        pad,
        "The first twenty rows of the dataframe are:",
        f"{df.head(20)}",
        pad,
        "The columns of the dataframe are:",
        f"{df.columns}",
        "",
        "Every patient has a unique `MPINumber`." + " " * 10,
        "",
    ])

    # Worked example: request ...
    example_request = "\n".join([
        "",
        "Count the number of patients.",
        "Then, count the number of diagnoses. ",
        "Then, count the number of patients with a X00 diagnosis.",
        "",
    ])

    # ... and the answer the model is shown for it.
    example_answer = "\n".join([
        "",
        "```",
        "# Number of patients",
        "print(len(df['MPINumber'].unique()))",
        "``` ",
        "",
        "```",
        "# Count diagnoses",
        "df_diagnoses = df[df['Finding'] == 'Diagnose']",
        "print(len(df_diagnoses))",
        "```",
        "",
        "```",
        "# Count X00 patients",
        "df_X00 = df[(df['Finding'].str.contains('Diagnose')) & (df['Value'].str.contains('X00'))]['MPINumber'].unique()",
        "print(len(df_X00))",
        "```" + pad,
        "",
    ])

    conversation = [
        dict(role="user", content=context),
        dict(role="assistant", content="\n"),
        dict(role="user", content=example_request),
        dict(role="assistant", content=example_answer),
        dict(role="user", content=query),
    ]
    return tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

In [18]:
def query_template_few(df: pd.DataFrame, query: str, tokenizer):
    """Render a few-shot prompt (three worked examples) for `query`.

    Turns, in order: user context (task instructions, `df.head(20)`,
    `df.columns`), an assistant turn containing only a newline, three example
    request/answer pairs (counting, pie chart, boxplot), and finally the user
    turn holding `query`.

    The example answers are code the model is meant to imitate, so two
    defects in them were corrected:
      * the pie-chart example now counts unique patients in both groups
        (it used to compare a unique-patient count with a row count);
      * the boxplot example copies the filtered slice before assigning to its
        `Value` column, and its comments name the real column `Value`.

    Parameters
    ----------
    df : frame whose first twenty rows and columns are embedded in the prompt.
    query : question placed in the last user turn.
    tokenizer : object providing `apply_chat_template`.

    Returns
    -------
    The value of `tokenizer.apply_chat_template(...)` called with
    `add_generation_prompt=True` and `tokenize=False`.
    """
    llm_chat = [
        # Context: instructions plus a preview of the data.
        {
            "role": "user",
            "content": f"""
Your job is to always write python code for the given dataframe `df` using pandas, numpy and matplot library.
Never execute the code.
            
The first twenty rows of the dataframe are:
{df.head(20)}
            
The columns of the dataframe are:
{df.columns}

Every patient has a unique `MPINumber`.          
"""
        },

        # Assistant turn holding only a newline, placed between the context
        # and the first example request.
        {
            "role": "assistant",
            "content": """
"""
        },

        # Example 1: counting patients and diagnoses.
        {
            "role": "user",
            "content": """
Count the number of patients.
Then, count the number of diagnoses. 
Then, count the number of patients with a X00 diagnosis.
"""
        },

        {
            "role": "assistant",
            "content": """
```
# Number of patients
print(len(df['MPINumber'].unique()))
``` 

```
# Count diagnoses
df_diagnoses = df[df['Finding'] == 'Diagnose']
print(len(df_diagnoses))
```

```
# Count X00 patients
df_X00 = df[(df['Finding'].str.contains('Diagnose')) & (df['Value'].str.contains('X00'))]['MPINumber'].unique()
print(len(df_X00))
```            
"""
        },

        # Example 2: pie chart of patients with / without a lab result.
        {
            "role": "user",
            "content": """
Plot patients with a Y123 lab result against patients without one in a pie chart. Include percentages and a legend.       
"""
        },

        {
            "role": "assistant",
            "content": """ 
```
# Unique patients with and without a 'Y123' lab result
df_Y123 = df[(df['Source'].str.contains('Labor')) & (df['Finding'].str.contains('Y123'))]['MPINumber'].unique()
df_no_Y123 = df[~df['MPINumber'].isin(df_Y123)]['MPINumber'].unique()

# Count number of patients in each group
num_Y123 = len(df_Y123)
num_no_Y123 = len(df_no_Y123)

# Plot pie chart
import matplotlib.pyplot as plt

labels = ['Patients with Y123 lab result', 'Patients without Y123 lab result']
sizes = [num_Y123, num_no_Y123]

plt.pie(sizes, labels=labels, startangle=90, autopct='%1.1f%%')
plt.axis('equal')
plt.title('Patients with and without Y123 lab result')
plt.legend(title='Patients', loc='upper right')
plt.show()
``` 
"""
        },

        # Example 3: top-5 patients by number of lab results plus a boxplot.
        {
            "role": "user",
            "content": """
Which 5 patients had the most Y123 lab result? Plot the values for the patient with the most results in a boxplot.     
"""
        },

        {
            "role": "assistant",
            "content": """
```
# Filter for patients with y123 lab result
df_Y123 = df[(df['Source'].str.contains('Labor')) & (df['Finding'].str.contains('Y123'))]

# Group by 'MPINumber' and count the occurrences of 'Y123' lab results
df_Y123_counts = df_Y123.groupby('MPINumber').size().reset_index(name='Count')

# Sort by count in descending order and select the top 5 patients
top_5_Y123 = df_Y123_counts.sort_values(by='Count', ascending=False).head(5)

# Display the top 5 patients with the most Y123 lab results
print("Top 5 patients with the most Y123 lab results:")
print(top_5_Y123)


# Extract the MPINumber of the patient with the most Y123 lab results
top_patient = top_5_Y123.iloc[0]['MPINumber']

# Filter the dataframe for this patient's Y123 lab results (copy, so the column can be modified safely)
df_top_patient_Y123 = df_Y123[df_Y123['MPINumber'] == top_patient].copy()

# Ensure 'Value' column contains only numeric data
df_top_patient_Y123['Value'] = pd.to_numeric(df_top_patient_Y123['Value'], errors='coerce')

# Drop rows with NaN values in 'Value' column
df_top_patient_Y123 = df_top_patient_Y123.dropna(subset=['Value'])


# Plot the lab values for the patient with the most Y123 lab results
plt.figure(figsize=(10, 6))
plt.boxplot(df_top_patient_Y123['Value'])
plt.xlabel('Lab Value')
plt.title(f'Boxplot of Lab Values for Patient {top_patient}')
plt.show()
```
"""
        },

        # The actual question.
        {
            "role": "user",
            "content": query
        }
    ]

    return tokenizer.apply_chat_template(llm_chat, add_generation_prompt=True, tokenize=False)

In [19]:
def query_template_few_chain(df: pd.DataFrame, query: str, tokenizer):
    """Render a few-shot prompt that starts with "Always think step by step.".

    Turns, in order: user context (step-by-step instruction, task
    instructions, `df.head(20)`, `df.columns`), an assistant turn containing
    only a newline, three example request/answer pairs (counting, pie chart,
    boxplot), and finally the user turn holding `query`.

    The example answers are code the model is meant to imitate, so two
    defects in them were corrected:
      * the pie-chart example now counts unique patients in both groups
        (it used to compare a unique-patient count with a row count);
      * the boxplot example copies the filtered slice before assigning to its
        `Value` column, and its comments name the real column `Value`.

    Parameters
    ----------
    df : frame whose first twenty rows and columns are embedded in the prompt.
    query : question placed in the last user turn.
    tokenizer : object providing `apply_chat_template`.

    Returns
    -------
    The value of `tokenizer.apply_chat_template(...)` called with
    `add_generation_prompt=True` and `tokenize=False`.
    """
    llm_chat = [
        # Context: instructions plus a preview of the data.
        {
            "role": "user",
            "content": f"""
Always think step by step.
Your job is to always write python code for the given dataframe `df` using pandas, numpy and matplot library.
Never execute the code.
            
The first twenty rows of the dataframe are:
{df.head(20)}
            
The columns of the dataframe are:
{df.columns}

Every patient has a unique `MPINumber`.          
"""
        },

        # Assistant turn holding only a newline, placed between the context
        # and the first example request.
        {
            "role": "assistant",
            "content": """
"""
        },

        # Example 1: counting patients and diagnoses.
        {
            "role": "user",
            "content": """
Count the number of patients.
Then, count the number of diagnoses. 
Then, count the number of patients with a X00 diagnosis.
"""
        },

        {
            "role": "assistant",
            "content": """
```
# Number of patients
print(len(df['MPINumber'].unique()))
``` 

```
# Count diagnoses
df_diagnoses = df[df['Finding'] == 'Diagnose']
print(len(df_diagnoses))
```

```
# Count X00 patients
df_X00 = df[(df['Finding'].str.contains('Diagnose')) & (df['Value'].str.contains('X00'))]['MPINumber'].unique()
print(len(df_X00))
```            
"""
        },

        # Example 2: pie chart of patients with / without a lab result.
        {
            "role": "user",
            "content": """
Plot patients with a Y123 lab result against patients without one in a pie chart. Include percentages and a legend.       
"""
        },

        {
            "role": "assistant",
            "content": """ 
```
# Unique patients with and without a 'Y123' lab result
df_Y123 = df[(df['Source'].str.contains('Labor')) & (df['Finding'].str.contains('Y123'))]['MPINumber'].unique()
df_no_Y123 = df[~df['MPINumber'].isin(df_Y123)]['MPINumber'].unique()

# Count number of patients in each group
num_Y123 = len(df_Y123)
num_no_Y123 = len(df_no_Y123)

# Plot pie chart
import matplotlib.pyplot as plt

labels = ['Patients with Y123 lab result', 'Patients without Y123 lab result']
sizes = [num_Y123, num_no_Y123]

plt.pie(sizes, labels=labels, startangle=90, autopct='%1.1f%%')
plt.axis('equal')
plt.title('Patients with and without Y123 lab result')
plt.legend(title='Patients', loc='upper right')
plt.show()
``` 
"""
        },

        # Example 3: top-5 patients by number of lab results plus a boxplot.
        {
            "role": "user",
            "content": """
Which 5 patients had the most Y123 lab result? Plot the values for the patient with the most results in a boxplot.     
"""
        },

        {
            "role": "assistant",
            "content": """
```
# Filter for patients with y123 lab result
df_Y123 = df[(df['Source'].str.contains('Labor')) & (df['Finding'].str.contains('Y123'))]

# Group by 'MPINumber' and count the occurrences of 'Y123' lab results
df_Y123_counts = df_Y123.groupby('MPINumber').size().reset_index(name='Count')

# Sort by count in descending order and select the top 5 patients
top_5_Y123 = df_Y123_counts.sort_values(by='Count', ascending=False).head(5)

# Display the top 5 patients with the most Y123 lab results
print("Top 5 patients with the most Y123 lab results:")
print(top_5_Y123)


# Extract the MPINumber of the patient with the most Y123 lab results
top_patient = top_5_Y123.iloc[0]['MPINumber']

# Filter the dataframe for this patient's Y123 lab results (copy, so the column can be modified safely)
df_top_patient_Y123 = df_Y123[df_Y123['MPINumber'] == top_patient].copy()

# Ensure 'Value' column contains only numeric data
df_top_patient_Y123['Value'] = pd.to_numeric(df_top_patient_Y123['Value'], errors='coerce')

# Drop rows with NaN values in 'Value' column
df_top_patient_Y123 = df_top_patient_Y123.dropna(subset=['Value'])


# Plot the lab values for the patient with the most Y123 lab results
plt.figure(figsize=(10, 6))
plt.boxplot(df_top_patient_Y123['Value'])
plt.xlabel('Lab Value')
plt.title(f'Boxplot of Lab Values for Patient {top_patient}')
plt.show()
```
"""
        },

        # The actual question.
        {
            "role": "user",
            "content": query
        }
    ]

    return tokenizer.apply_chat_template(llm_chat, add_generation_prompt=True, tokenize=False)

# Queries - Testing


In [20]:
# Sanity check of the filtered cohort: show only the first rows so the whole
# patient table is not written into the notebook output.
df_P.head()

Unnamed: 0,Source,CollectedDT,Finding,Value,AdditionalInformation,MPINumber,Age


### Choosing dataframe

In [21]:
# Choose your dataframe for testing
# `df` is the name the prompt templates tell the model to use; it is a copy,
# so code run against `df` does not change `df_P`.
df = df_P.copy()


### Backups in case the generated code modifies the data frame

In [22]:
# Snapshot of `df` taken before any generated code is run against it.
dfbackup = df.copy()

In [23]:
# Restore `df` from the snapshot; re-run this cell after generated code has modified `df`.
df = dfbackup.copy()

# Start testing 


### Choose your prompting template and your question

In [24]:
# Prompting strategy: one of the `query_template_*` functions defined above.
Template = query_template_zero

# Question to ask: one of the `Question_*` strings defined above.
Question = Question_H_1

In [25]:
# Decoding settings, passed to `SamplingParams` in the generation cell below.
Presence_penalty = 0.8
# NOTE(review): per the vLLM `SamplingParams` documentation, repetition_penalty
# values below 1 encourage repeating tokens (values above 1 discourage it) -
# confirm that 0.8 is intended alongside a positive presence penalty.
Repetition_penalty = 0.8
Temperature = 0.1
Top_p = 0.1
# Upper bound on the number of generated tokens per answer.
Max_tokens = 1024

In [26]:
# Render the prompt for the selected template and question.
query = Template(df, Question, tokenizer)

# Decoding settings taken from the constants defined in the cell above.
sampling_params = SamplingParams(
    presence_penalty=Presence_penalty,
    repetition_penalty=Repetition_penalty,
    temperature=Temperature,
    top_p=Top_p,
    max_tokens=Max_tokens,
)
llm_outputs = llm.generate(query, sampling_params)

# Print the first completion of every returned request between two markers.
for out in llm_outputs:
    print('Output:')
    print(out.outputs[0].text)
    print('#########')

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it, est. speed input: 88.96 toks/s, output: 57.00 toks/s]

Output:
 To find the number of patients, you can use the `shape` property of a pandas DataFrame, which returns a tuple of the number of rows and columns:

```python
# Get the number of rows in the DataFrame
num_patients = df.shape[0]
```

So the code to find the number of patients in the given DataFrame `df` is:

```python
num_patients = df.shape[0]
```
#########





## Code Testing - Space to test generated code

In [27]:
# Scratch cell: placeholder statement, to be replaced by the generated code under test.
print("hello")

hello
