In [17]:
import os
import pandas as pd
import torch
from datasets import load_dataset
# from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

# from vllm import LLM, SamplingParams
import numpy as np

import json
import tqdm
from time import sleep

## Model Preparation

In [2]:
model_path = "/data/project/Qwen2.5-7B-Instruct-1M"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


## Data Preparation

In [3]:
df_raw = pd.read_excel("../Dataset/MergedDataset_231207_ForElly_Excel.xlsx")

# Remove "." in the feature column
row_num, column_num = df_raw.shape
for rowID in range(row_num):
    for columnID in range(column_num):
        if "." == df_raw.iloc[rowID, columnID]:
            df_raw.iloc[rowID, columnID] = np.nan

# prepare the feature list
featureList = []
for item in df_raw.columns:
    if item.startswith("Item"):
        featureList.append(item)

patientList = []
roundList = []
for idx, row in df_raw.iterrows():
    patient = int(str(row["IAPTus_Num"]).split("_")[0])
    if "_" in str(row["IAPTus_Num"]):
        Round = int(str(row["IAPTus_Num"]).split("_")[1])
    else:
        Round = 0
    patientList.append(patient)
    roundList.append(Round)
df_raw["patient"] = patientList
df_raw["round"] = roundList

In [4]:
df_referral = pd.read_json("../Dataset/ReferralLetterSummary.json", lines=True)

In [5]:
df = pd.merge(df_raw, df_referral, on=["patient","round"])

In [6]:
df = df[df["RecoveryDesc"] == df["RecoveryDesc"]]
df = df[(df["RecoveryDesc"].isin(["At recovery", "Not at recovery"]))]
df.reset_index().drop("index", axis=1)
print(df.shape)

labelList = []
for idx, row in df.iterrows():
    if (row["ReliableChangeDesc"] == "Reliable improvement") & (row["ReliableRecoveryDesc"] == "Reliable recovery") & (row["RecoveryDesc"] == "At recovery"):
        labelList.append(1)
    else:
        labelList.append(0)
df["labels"] = np.array(labelList, dtype=float)
labelList = ["labels"]

(43, 285)


In [7]:
df = df[["IAPTus_Num_x","patient","round","text", "summarization", "labels"]]
df = df.reset_index().drop("index", axis=1)

In [8]:
df

Unnamed: 0,IAPTus_Num_x,patient,round,text,summarization,labels
0,24804,24804,0,ASSESSMENT APPOINTMENT:\n\nStudent confirmed t...,"### Patient Background:\nNAME, a university st...",1.0
1,24813,24813,0,Assessment appointment\n\nRisk Assessment\n\nC...,"### Patient Background:\nNAME, a university st...",0.0
2,24816,24816,0,Student agrees with “Information for Students”...,### Patient Background:\nNAME is a university ...,0.0
3,24822_1,24822,1,ASSESSMENT APPOINTMENT:\n\nStudent confirmed t...,"### Summary of Patient Background, Symptoms, I...",0.0
4,24822_2,24822,2,Current thoughts/plans/intent to self-harm\n\n...,"**Patient Background:**\nNAME, a university st...",0.0
5,24828,24828,0,Assessment appointment\n\nExploration of CORE ...,"### Patient Background, Symptoms, Impacts, and...",0.0
6,24837,24837,0,Assessment appointment\n\nExploration of MDS s...,"### Summary of Patient Background, Symptoms, I...",1.0
7,24840,24840,0,[<50 min assessment included confidentiality l...,**Patient Background:**\nNAME has a history of...,1.0
8,24861,24861,0,Exploration of MDS scores and risk\nPHQ9 -16\n...,### Patient Background:\n- **Name**: NAME\n- *...,1.0
9,24864,24864,0,Teams AA Appointmen\n\n\nStudent agrees with “...,### Patient Background:\nNAME is a university ...,0.0


## Prompt Preparation

In [9]:
# Recovery Prediction 0.46511627906976744
prompt_template = (
    "Based on the content of the referral letter, "
    "please predict whether the patient will recover after receiving treatment "
    "(their symptoms reduced by more than a specific amount). \n\n"
    "Please just use 0/1 to indicate if the patient recover or not recover. \n\n"
    "Here are four demonstrations:\n{demoAll}"
    "Now please make the prediction based on the following referral letter.\n\n"
    "Referral Letter:\n"
    "{rf}\n\n"
    "Prediction:\n"
)

In [10]:
def genDemo(df):
    demo_template = (
        "Referral Letter:\n"
        "{demo}\n\n"
        "Prediction:\n"
        "{demo_label}\n\n"
    )
    demoList = []
    df = df.reset_index().drop("index", axis=1)
    for idx, row in df.iterrows():
        thisDemo = demo_template.format(
            demo = row["summarization"],
            demo_label = int(row["labels"])
        )
        demoList.append(thisDemo)
    return demoList

In [11]:
def genPrompt(df, rf):
    demoList = genDemo(df)
    prompt = prompt_template.format(
        demoAll="".join(demoList),
        rf = rf
    )
    return prompt

In [13]:
df_sample = df.sample(n=4)
demoList = genDemo(df_sample)
print("".join(demoList))

Referral Letter:
### Summary of Referral Letter

**Patient Background:**
- Diagnosed with depression and generalized anxiety disorder.
- Experienced grief after the passing of two relatives from the same condition.
- Started university but struggled with anxiety and panic attacks.
- Has a history of self-harming behaviors, including cutting and making oneself vomit.
- Currently living in a flat where she does not socialize much with other residents.

**Symptoms:**
- Frequent panic attacks and increased intensity over the past two months.
- Self-reported "shaky and twitchy" state post-panic attacks.
- Thoughts of self-harm (e.g., breaking fingers, cutting) to release pressure.
- Current coping mechanisms include speaking to boyfriend, using Propanol, and speaking to friends and family.

**Impacts:**
- Daily life is described as "debilitating."
- Sleep issues, eating disorders (recently stopped vomiting), and difficulty maintaining motivation for studies.
- Struggles with completing assi

In [14]:
print(genPrompt(df_sample, df.iloc[0]["summarization"]))

Based on the content of the referral letter, please predict whether the patient will recover after receiving treatment (their symptoms reduced by more than a specific amount). 

Please just use 0/1 to indicate if the patient recover or not recover. 

Here are four demonstrations:
Referral Letter:
### Summary of Referral Letter

**Patient Background:**
- Diagnosed with depression and generalized anxiety disorder.
- Experienced grief after the passing of two relatives from the same condition.
- Started university but struggled with anxiety and panic attacks.
- Has a history of self-harming behaviors, including cutting and making oneself vomit.
- Currently living in a flat where she does not socialize much with other residents.

**Symptoms:**
- Frequent panic attacks and increased intensity over the past two months.
- Self-reported "shaky and twitchy" state post-panic attacks.
- Thoughts of self-harm (e.g., breaking fingers, cutting) to release pressure.
- Current coping mechanisms includ

## Inference

In [24]:
responseList = []
for idx, row in tqdm.tqdm(df.iterrows(), total = df.shape[0]):
    df_sample = df.sample(4)
    while idx in df_sample.index:
        df_sample = df.sample(4)
    prompt = genPrompt(df_sample, row["summarization"])
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    # print(prompt)
    # sleep(10)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=8
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    responseList.append(response)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 43/43 [00:46<00:00,  1.09s/it]


In [25]:
responseList

['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0']

In [26]:
df["labels"]

0     1.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     1.0
7     1.0
8     1.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
16    0.0
17    0.0
18    0.0
19    1.0
20    1.0
21    1.0
22    0.0
23    0.0
24    0.0
25    1.0
26    1.0
27    0.0
28    1.0
29    0.0
30    1.0
31    1.0
32    1.0
33    0.0
34    1.0
35    1.0
36    1.0
37    1.0
38    1.0
39    1.0
40    1.0
41    0.0
42    1.0
Name: labels, dtype: float64

In [33]:
accuracy_score([int(x) for x in df["labels"].tolist()], [int(x) for x in responseList])

0.46511627906976744

# Useless codes, just for testing

In [21]:
prompt_template = (
    "Based on the content of the referral letter, "
    "please predict whether the patient will recover after receiving treatment "
    "(their symptoms reduced by more than a specific amount). \n\n"
    "Please just use 0/1 to indicate if the patient recover or not recover. \n\n"
    "Here are four demonstrations:\n{demoAll}"
    "Now please make the prediction based on the following referral letter.\n\n"
    "Referral Letter:\n"
    "{rf}\n\n"
    "Prediction:\n"
)

In [10]:
df_sample = df.sample(1)[["summarization", "labels"]]

In [11]:
prompt = prompt_template.format(
    demo = df_sample.to_dict()["summarization"][df_sample.index.tolist()[0]], 
    demo_label=int(df_sample.to_dict()["labels"][df_sample.index.tolist()[0]]),
    rf = df.iloc[0]["summarization"]
)
print(prompt)

Based on the content of the referral letter, please predict whether the patient will recover after receiving treatment (their symptoms reduced by more than a specific amount). 

Please just use 0/1 to indicate if the patient recover or not recover. 

Here is a demonstration:
Referral Letter:
### Patient Background:
- **Name:** NAME
- **Age:** Early 20s
- **Medical History:** Diagnosed with depression, generalised anxiety disorder, and possible compulsion disorder.
- **Family History:** Loss of two relatives to the same condition, which has significantly impacted her.
- **Current Status:** Living in a flat with limited interaction with flatmates; attending university but struggling with anxiety and sleep issues.

### Symptoms:
- **Panic Attacks:** Increasing in frequency.
- **Self-Harm Thoughts:** Including pressing fingers and toes against surfaces, cutting, and throwing up (recently stopped).
- **Overthinking:** Feels distant from people, self-conscious, and struggles with acceptance.

In [13]:
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# print(messages)

text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=64
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

1


In [21]:
df_sample = df.sample(1)[["summarization", "labels"]]

In [25]:
df_sample.index.tolist()[0]

38

In [28]:
df_sample.to_dict()["summarization"][df_sample.index.tolist()[0]]

'### Patient Background:\n- **Age**: Likely early 20s.\n- **Medical History**: Diagnosed with depression, generalized anxiety disorder, and possible compulsive disorder related to anxiety.\n- **Family History**: Loss of two relatives to a condition (likely severe).\n- **Recent Events**: Started vomiting compulsively 3 months ago, which she has since stopped.\n\n### Symptoms:\n- **Panic Attacks**: Increasing in frequency.\n- **Self-Harm Thoughts**: Including pressing hard against surfaces, cutting thoughts, and harming fingers and toes.\n- **Vomiting**: Compulsive behavior triggered by physical illness.\n- **Sleep Issues**: Difficulty sleeping until late afternoon, sometimes skipping meals.\n- **Social Isolation**: Avoided socializing during university, leading to anxiety and stress.\n\n### Impacts:\n- **Daily Life**: Debilitating; struggles with motivation and completing tasks.\n- **Studies**: Missed university due to anxiety and sleep issues; struggling with motivation and procrastina

In [37]:
int(df_sample.to_dict()["labels"][df_sample.index.tolist()[0]])

1

In [31]:
df.iloc[0]["summarization"]

'### Patient Background:\nNAME, a university student, has experienced anxiety and depression since Year 9, with episodes intensifying over time. She has a history of suicidal thoughts, most recently in September/October 2022, and is currently dealing with panic attacks, self-harming thoughts, and a compulsive behavior of making herself vomit, which she has since discontinued.\n\n### Symptoms:\n- **Panic Attacks:** Increasing in frequency.\n- **Self-Harming Thoughts:** To break fingers/toes or cut herself, though she hasn’t acted on these thoughts.\n- **Vomiting Compulsion:** Recently stopped, but triggered by physical illness and ongoing psychological distress.\n- **Generalized Anxiety Disorder (GAD):** Diagnosed.\n- **Depression:** Diagnosed.\n- **Compulsion Disorder:** Possible, related to anxiety post-panic attacks.\n\n### Impacts:\n- **Daily Life:** Debilitating, with sleep issues, low motivation, and difficulty managing studies.\n- **Studies:** Missed university due to anxiety and

In [36]:
int(df.iloc[0]["labels"])

1

In [2]:
model_path = "/data/project/Qwen2.5-7B-Instruct-1M"

In [3]:
model_path = "/data/project/Qwen2.5-7B-Instruct-1M"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [None]:
prompt_template = (
    "Please summarize this referral letter and point out the patient's backgrounds, symptoms, impacts, and goals. "
    "Your reply should be within 512 characters. \n{}")

RLList = df["text"].tolist()
responseList = []
for item in tqdm.tqdm(RLList):
    prompt = prompt_template.format(row["text"])

    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    # print(messages)

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    responseList.append(response)