# This script is used to shorten the referral letter(generate summarization of referral letter using QWen2.5-7B)

In [1]:
import os
import pandas as pd
# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
# sklearn.metrics.accuracy_score
# import xgboost as xgb
import numpy as np
import itertools 
import tqdm
from time import sleep
import random
# from docx import Document
import docx2txt

## Data Preparation

In [2]:
df_raw = pd.read_excel("../Dataset/MergedDataset_231207_ForElly_Excel.xlsx")

# Remove "." in the feature column
row_num, column_num = df_raw.shape
for rowID in range(row_num):
    for columnID in range(column_num):
        if "." == df_raw.iloc[rowID, columnID]:
            df_raw.iloc[rowID, columnID] = np.nan

# prepare the feature list
featureList = []
for item in df_raw.columns:
    if item.startswith("Item"):
        featureList.append(item)

In [3]:
patientList = []
roundList = []
for idx, row in df_raw.iterrows():
    patient = str(row["IAPTus_Num"]).split("_")[0]
    if "_" in str(row["IAPTus_Num"]):
        Round = int(str(row["IAPTus_Num"]).split("_")[1])
    else:
        Round = 0
    patientList.append(patient)
    roundList.append(Round)
df_raw["patient"] = patientList
df_raw["round"] = roundList

In [4]:
df_raw

Unnamed: 0.1,Unnamed: 0,IAPTus_Num,Referral Date,Age_ReferralRequest_ReceivedDate,EthnicDescGroupCode,EthnicCategoryGroupShortCode,GenderIdentity,SexualOrientationDesc,EndDesc,EndDescGroupShort,...,Item218,Item219,Item220,Item221,Item222,Item223,Item224,Item225,patient,round
0,1,24475,2018-09-08,5.099020,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,,,,,,,,,24475,0
1,2,24476_1,2019-04-10,4.358899,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,,,,,,,,,24476,1
2,3,24476_2,2021-03-28,4.582576,1.0,1.0,2,,Termition of treatment earlier than Care Profe...,Seen and treated,...,1,1,1,0,0,0,0,0,24476,2
3,4,24479,2019-04-16,4.898979,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,,,,,,,,,24479,0
4,5,24482,2018-09-07,0.000000,2.0,2.0,2,,Termition of treatment earlier than Care Profe...,Seen and treated,...,,,,,,,,,24482,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,724,24706,NaT,0.000000,,,,,,,...,,,,,,,,,24706,0
724,725,24721,NaT,0.000000,,,,,,,...,,,,,,,,,24721,0
725,727,24744,NaT,0.000000,,,,,,,...,,,,,,,,,24744,0
726,730,24658,NaT,0.000000,,,,,,,...,,,,,,,,,24658,0


## Referral Letter Preparation. Here I first prepare a ID-path mapping, then use docx2txt to extract all text content from docx file

In [5]:
RFLetterList = []
for file in os.listdir("../Dataset/ReferralLetter/Anonymous data/"):
    if file.endswith("docx"):
        #print(file)
        patient = file.split("_referral")[0]
        itemList = file.split("_referral")[-1].split(".docx")
        if itemList[0] == "":
            RFRound = 0
        else:
            RFRound = int(itemList[0])

        path = os.path.join("/data/project/myworkspace/LOD", "Dataset/ReferralLetter/Anonymous data/", file)
        dic = {}
        dic["patient"] = patient
        dic["round"] = RFRound
        dic["path"] = path
        RFLetterList.append(dic)
print(len(RFLetterList))

47


In [6]:
df_referal = pd.DataFrame(RFLetterList)

In [7]:
df_referal

Unnamed: 0,patient,round,path
0,24885,2,/data/project/myworkspace/LOD/Dataset/Referral...
1,24864,0,/data/project/myworkspace/LOD/Dataset/Referral...
2,24804,0,/data/project/myworkspace/LOD/Dataset/Referral...
3,25866,0,/data/project/myworkspace/LOD/Dataset/Referral...
4,25009,0,/data/project/myworkspace/LOD/Dataset/Referral...
5,25093,0,/data/project/myworkspace/LOD/Dataset/Referral...
6,24840,0,/data/project/myworkspace/LOD/Dataset/Referral...
7,24837,0,/data/project/myworkspace/LOD/Dataset/Referral...
8,25884,0,/data/project/myworkspace/LOD/Dataset/Referral...
9,25114,0,/data/project/myworkspace/LOD/Dataset/Referral...


## Merge these two Dataframes

In [8]:
df = pd.merge(df_raw, df_referal, on=["patient", "round"])

In [9]:
df = df[df["RecoveryDesc"] == df["RecoveryDesc"]]
df = df[(df["RecoveryDesc"].isin(["At recovery", "Not at recovery"]))]
df.reset_index().drop("index", axis=1)
print(df.shape)

labelList = []
for idx, row in df.iterrows():
    if (row["ReliableChangeDesc"] == "Reliable improvement") & (row["ReliableRecoveryDesc"] == "Reliable recovery") & (row["RecoveryDesc"] == "At recovery"):
        labelList.append(1)
    else:
        labelList.append(0)
df["label"] = labelList
labelList = ["label"]

(43, 282)


In [10]:
df

Unnamed: 0.1,Unnamed: 0,IAPTus_Num,Referral Date,Age_ReferralRequest_ReceivedDate,EthnicDescGroupCode,EthnicCategoryGroupShortCode,GenderIdentity,SexualOrientationDesc,EndDesc,EndDescGroupShort,...,Item220,Item221,Item222,Item223,Item224,Item225,patient,round,path,label
0,115,24804,2020-05-10,4.472136,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,1,0,0,0,0,0,24804,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
1,118,24813,2020-05-11,4.582576,1.0,1.0,1,,Termition of treatment earlier than Care Profe...,Seen and treated,...,1,1,0,1,0,0,24813,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
2,119,24816,2020-05-19,4.472136,1.0,1.0,1,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,0,0,0,0,0,24816,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
3,121,24822_1,2020-05-24,4.582576,6.0,3.0,2,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,1,0,0,0,0,24822,1,/data/project/myworkspace/LOD/Dataset/Referral...,0
4,122,24822_2,2021-04-19,4.690416,6.0,3.0,2,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,1,0,0,0,0,24822,2,/data/project/myworkspace/LOD/Dataset/Referral...,0
5,124,24828,2020-05-25,5.0,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,1,0,1,0,0,24828,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
6,127,24837,2020-05-25,4.472136,1.0,1.0,1,,Mutually agreed completion of treatment,Seen and treated,...,1,1,0,0,0,0,24837,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
7,128,24840,2020-05-25,4.582576,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,1,0,0,24840,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
8,136,24861,2020-06-04,4.472136,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,1,0,0,0,0,24861,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
9,137,24864,2020-06-01,4.358899,1.0,1.0,2,,Termition of treatment earlier than Care Profe...,Seen and treated,...,0,0,0,0,0,0,24864,0,/data/project/myworkspace/LOD/Dataset/Referral...,0


## Extract the docx's content

In [11]:
contentList = []
for idx, row in df.iterrows():
    text = docx2txt.process(row["path"])
    contentList.append(text)
df["text"] = contentList

## Analyse the content length

In [12]:
# Analysis the length
lengthList = []
for idx, row in df.iterrows():
    lengthList.append(len(row["text"]))
print(lengthList)
print(np.mean(lengthList))
print(np.std(lengthList))

[4945, 2652, 5458, 4711, 4108, 5354, 3680, 5255, 3544, 9600, 4514, 4870, 2270, 7096, 4570, 2510, 4984, 3683, 3860, 2046, 2898, 4318, 3580, 5287, 2510, 5167, 2475, 4123, 5228, 5015, 2889, 3988, 5459, 3124, 3653, 3459, 2587, 3296, 5642, 4437, 2158, 5312, 8437]
4296.558139534884
1555.4418695054123


## The content is too long, so I will use Qwen to generate summarizations of the referral letter

In [14]:
## The content is too long, so I will use Qwen to generate summarizations of the referral letter
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "/data/project/Qwen2.5-7B-Instruct-1M"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [26]:
prompt_template = (
    "Please summarize this referral letter and point out the patient's backgrounds, symptoms, impacts, and goals. "
    "Your reply should be within 512 characters. \n{}")

RLList = df["text"].tolist()
responseList = []
for item in tqdm.tqdm(RLList):
    prompt = prompt_template.format(row["text"])

    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    # print(messages)

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    responseList.append(response)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 43/43 [1:17:38<00:00, 108.33s/it]


In [27]:
print(responseList[2])

### Patient Background:
NAME is a university student experiencing significant anxiety, depression, and panic attacks. She has a history of self-harming behaviors, including cutting and making herself vomit. She is currently taking sertraline and propanol for her conditions. NAME has a supportive family, including her father, brother, and his partner (a NHS counselor). She has a close-knit group of friends in the North East England.

### Symptoms:
- Panic attacks increasing in frequency.
- Depression and generalized anxiety disorder.
- Compulsion to release pressure through physical harm (e.g., pressing fingers against surfaces).
- Suicidal thoughts in September/October 2022, no current suicidal intent.
- History of making herself vomit, which she has recently stopped.
- Difficulty sleeping, often staying up until late afternoon.
- Struggles with motivation and completing assignments.

### Impacts:
- Daily life is described as "debilitating."
- Sleep issues, eating problems, and reduced

In [28]:
print(responseList[18])

### Patient Background:
NAME is a university student diagnosed with depression and Generalized Anxiety Disorder (GAD). They experienced an increase in panic attacks and developed compulsive behaviors, such as making themselves vomit, which they recently discontinued. NAME has a history of bereavement affecting their mental health and has been managing anxiety since Year 9. Medications include sertraline and propanol, prescribed by their GP.

### Symptoms and Impacts:
NAME reports panic attacks, increased frequency of depression, and a growing compulsion to self-harm, particularly during and after panic attacks. This has led to daily life being "debilitating," with sleep disturbances, difficulty waking up, and reduced motivation for studies. They have missed university due to anxiety and have been using unhealthy coping mechanisms like pressing on objects painfully. NAME seeks therapy to explore coping strategies and manage current emotions without relying on past experiences.

### Goal

In [29]:
df["summarization"] = responseList

In [30]:
df

Unnamed: 0.1,Unnamed: 0,IAPTus_Num,Referral Date,Age_ReferralRequest_ReceivedDate,EthnicDescGroupCode,EthnicCategoryGroupShortCode,GenderIdentity,SexualOrientationDesc,EndDesc,EndDescGroupShort,...,Item222,Item223,Item224,Item225,patient,round,path,label,text,summarization
0,115,24804,2020-05-10,4.472136,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,24804,0,/data/project/myworkspace/LOD/Dataset/Referral...,1,ASSESSMENT APPOINTMENT:\n\nStudent confirmed t...,"### Patient Background:\nNAME, a university st..."
1,118,24813,2020-05-11,4.582576,1.0,1.0,1,,Termition of treatment earlier than Care Profe...,Seen and treated,...,0,1,0,0,24813,0,/data/project/myworkspace/LOD/Dataset/Referral...,0,Assessment appointment\n\nRisk Assessment\n\nC...,"### Patient Background:\nNAME, a university st..."
2,119,24816,2020-05-19,4.472136,1.0,1.0,1,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,0,0,0,24816,0,/data/project/myworkspace/LOD/Dataset/Referral...,0,Student agrees with “Information for Students”...,### Patient Background:\nNAME is a university ...
3,121,24822_1,2020-05-24,4.582576,6.0,3.0,2,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,0,0,0,24822,1,/data/project/myworkspace/LOD/Dataset/Referral...,0,ASSESSMENT APPOINTMENT:\n\nStudent confirmed t...,"### Summary of Patient Background, Symptoms, I..."
4,122,24822_2,2021-04-19,4.690416,6.0,3.0,2,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,0,0,0,24822,2,/data/project/myworkspace/LOD/Dataset/Referral...,0,Current thoughts/plans/intent to self-harm\n\n...,"**Patient Background:**\nNAME, a university st..."
5,124,24828,2020-05-25,5.0,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,1,0,0,24828,0,/data/project/myworkspace/LOD/Dataset/Referral...,0,Assessment appointment\n\nExploration of CORE ...,"### Patient Background, Symptoms, Impacts, and..."
6,127,24837,2020-05-25,4.472136,1.0,1.0,1,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,24837,0,/data/project/myworkspace/LOD/Dataset/Referral...,1,Assessment appointment\n\nExploration of MDS s...,"### Summary of Patient Background, Symptoms, I..."
7,128,24840,2020-05-25,4.582576,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,1,0,0,24840,0,/data/project/myworkspace/LOD/Dataset/Referral...,1,[<50 min assessment included confidentiality l...,**Patient Background:**\nNAME has a history of...
8,136,24861,2020-06-04,4.472136,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,24861,0,/data/project/myworkspace/LOD/Dataset/Referral...,1,Exploration of MDS scores and risk\nPHQ9 -16\n...,### Patient Background:\n- **Name**: NAME\n- *...
9,137,24864,2020-06-01,4.358899,1.0,1.0,2,,Termition of treatment earlier than Care Profe...,Seen and treated,...,0,0,0,0,24864,0,/data/project/myworkspace/LOD/Dataset/Referral...,0,Teams AA Appointmen\n\n\nStudent agrees with “...,### Patient Background:\nNAME is a university ...


In [31]:
df[["IAPTus_Num", "patient", "round", "path", "text", "summarization"]].to_json("../Dataset/ReferralLetterSummary.json", orient="records", lines=True)

## Useless code, just for testing

In [18]:
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [19]:
response

"Certainly! A **large language model** (LLM) is an advanced type of artificial intelligence (AI) model specifically designed to understand and generate human-like text based on the input it receives. These models are trained on vast amounts of text data from the internet, books, websites, and other sources, allowing them to learn patterns, structures, and nuances in language.\n\nHere are some key characteristics of LLMs:\n\n1. **Size and Complexity**: Large language models typically consist of billions or even hundreds of billions of parameters, which enable them to capture intricate relationships between words and phrases.\n\n2. **Contextual Understanding**: They can understand the context of a sentence or passage and generate responses that are coherent and relevant.\n\n3. **Multitasking**: LLMs are capable of performing various language-related tasks, such as translation, summarization, question answering, and more, without needing task-specific training.\n\n4. **Generalization**: D

In [14]:
prompt = (
    "Please summarize this referral letter and point out the patient's backgrounds, symptoms, impacts, and goals. "
    "Your reply should be within 512 characters. \n{}")

In [15]:
prompt

"Please summarize this referral letter and point out the patient's backgrounds, symptoms, impacts, and goals. Your reply should be within 512 characters. \n{}"

In [16]:
promptList = []
for idx, row in df.iterrows():
    promptList.append(prompt.format(row["text"]))

In [17]:
print(promptList[0])

Please summarize this referral letter and point out the patient's backgrounds, symptoms, impacts, and goals. Your reply should be within 512 characters. 
ASSESSMENT APPOINTMENT:

Student confirmed their location prior to session beginning? Yes
Student agreed with “Information for Students” and confidentiality statement? Yes

Exploration of MDS scores and risk:
PHQ-9 = 9
GAD-7 = 15
Risk (question 9 in PHQ-9) = 0

Risk Assessment:
Current thoughts/plans/intent to self-harm: None disclosed when explored
Current thoughts/plans/intent of suicide: None disclosed when explored
Previous history of self-harm or suicide attempts: None disclosed when explored
Current risk to/from others: None disclosed when explored
Current or historical risk of self-neglect: None disclosed when explored

Presenting Problem:
NAME reported struggling with what she described as health/death anxiety, which is being exacerbated by the Covid-19 pandemic. She stated that she has always had worries about family members/

In [22]:
for idx, row in df.iterrows():
    print(idx)

0
1
2
3
4
5
6
7
8
9
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43


In [20]:
tqdm

<module 'tqdm' from '/data/project/anaconda3_test/envs/Redemption/lib/python3.11/site-packages/tqdm/__init__.py'>

In [27]:
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": promptList[0]}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [28]:
print(response)

**Patient Background:**
NAME, a student, grew up in the South of England. Her parents divorced when she was one, and she moved to the countryside with her neurotic and OCD-like mother. She has a challenging relationship with her father, whom she fears might "leave" her due to his past actions. She attended a girls' boarding school, where she made friends and felt more comfortable. She has a boyfriend of 2.5 years and lives with two other girls.

**Symptoms:**
NAME struggles with health/death anxiety exacerbated by the pandemic. She experiences 'compulsive checking' to gain control and has a fear of being left behind. She suffers from poor sleep and listens to podcasts to avoid intrusive thoughts. She has been diagnosed with Generalized Anxiety Disorder (GAD) and is taking Sertraline and Propranolol.

**Impacts:**
Her anxiety leads to poor sleep, oversleeping, missing lectures, and needing to catch up on missed work. She feels embarrassed to discuss her issues with her personal tutor an

In [29]:
print(len(response))

1515


In [None]:
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [14]:
# Keep 10% data as the test data
train, test = train_test_split(df, test_size=0.25)

In [15]:
train

Unnamed: 0.1,Unnamed: 0,IAPTus_Num,Referral Date,Age_ReferralRequest_ReceivedDate,EthnicDescGroupCode,EthnicCategoryGroupShortCode,GenderIdentity,SexualOrientationDesc,EndDesc,EndDescGroupShort,...,Item220,Item221,Item222,Item223,Item224,Item225,patient,round,path,label
28,228,25114,2020-12-02,5.0,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,1,0,1,0,0,25114,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
20,190,25009,2020-08-23,4.358899,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,0,0,25009,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
4,122,24822_2,2021-04-19,4.690416,6.0,3.0,2,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,1,0,0,0,0,24822,2,/data/project/myworkspace/LOD/Dataset/Referral...,0
24,215,25078,2020-10-30,4.472136,1.0,1.0,2,,Termition of treatment earlier than Care Profe...,Seen and treated,...,1,0,0,0,0,0,25078,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
29,233,25132,2020-12-16,4.358899,3.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,0,0,25132,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
13,156,24918,2020-07-12,4.582576,1.0,1.0,1,,Termition of treatment earlier than Care Profe...,Seen and treated,...,1,0,0,0,0,0,24918,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
35,467,25845,2022-05-03,4.358899,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,0,0,25845,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
12,147,24891,2020-06-15,4.690416,1.0,1.0,2,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,0,0,0,0,0,24891,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
33,454,25812,2022-05-01,4.472136,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,1,0,25812,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
15,172,24954,2020-08-05,4.690416,11.0,5.0,2,,Termition of treatment earlier than Care Profe...,Seen and treated,...,1,1,0,0,0,0,24954,0,/data/project/myworkspace/LOD/Dataset/Referral...,0


In [16]:
test

Unnamed: 0.1,Unnamed: 0,IAPTus_Num,Referral Date,Age_ReferralRequest_ReceivedDate,EthnicDescGroupCode,EthnicCategoryGroupShortCode,GenderIdentity,SexualOrientationDesc,EndDesc,EndDescGroupShort,...,Item220,Item221,Item222,Item223,Item224,Item225,patient,round,path,label
6,127,24837,2020-05-25,4.472136,1.0,1.0,1,,Mutually agreed completion of treatment,Seen and treated,...,1,1,0,0,0,0,24837,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
26,220,25093,2020-11-18,4.472136,15.0,5.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,1,0,0,0,0,25093,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
2,119,24816,2020-05-19,4.472136,1.0,1.0,1,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,0,0,0,0,0,24816,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
11,145,24885_2,2020-09-16,4.472136,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,0,0,24885,2,/data/project/myworkspace/LOD/Dataset/Referral...,0
14,169,24948,2020-08-04,4.358899,1.0,1.0,2,,Not suitable for IAPT service - no action take...,Seen but not treated,...,0,1,0,0,0,0,24948,0,/data/project/myworkspace/LOD/Dataset/Referral...,0
19,185,24991_1,2020-08-12,5.0,10.0,6.0,2,,Mutually agreed completion of treatment,Seen and treated,...,1,1,0,1,1,0,24991,1,/data/project/myworkspace/LOD/Dataset/Referral...,0
39,495,25926,2022-05-24,4.795832,1.0,1.0,1,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,0,0,0,25926,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
7,128,24840,2020-05-25,4.582576,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,1,0,0,24840,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
32,428,25734,2022-04-07,4.795832,1.0,1.0,1,,Mutually agreed completion of treatment,Seen and treated,...,0,0,0,1,0,0,25734,0,/data/project/myworkspace/LOD/Dataset/Referral...,1
40,501,25941,2022-05-24,4.898979,1.0,1.0,2,,Mutually agreed completion of treatment,Seen and treated,...,1,0,0,0,0,0,25941,0,/data/project/myworkspace/LOD/Dataset/Referral...,1


In [2]:
RFLetterList = []
for file in os.listdir("../Dataset/ReferralLetter/"):
    if file.endswith("docx"):
        # print(file)
        patient = file.split("_referral")[0]
        itemList = file.split("_referral")[-1].split(".docx")
        if itemList[0] == "":
            RFRound = 0
        else:
            RFRound = int(itemList[0])

        path = os.path.join("/data/project/myworkspace/LOD", "Dataset/ReferralLetter", file)
        dic = {}
        dic["patient"] = patient
        dic["round"] = RFRound
        dic["path"] = path
        RFLetterList.append(dic)
print(len(RFLetterList))

47


In [4]:
RFLetterList = []
for file in os.listdir("../Dataset/ReferralLetter/Anonymous data/"):
    if file.endswith("docx"):
        #print(file)
        patient = file.split("_referral")[0]
        itemList = file.split("_referral")[-1].split(".docx")
        if itemList[0] == "":
            RFRound = 0
        else:
            RFRound = int(itemList[0])

        path = os.path.join("/data/project/myworkspace/LOD", "Dataset/ReferralLetter/Anonymous data/", file)
        dic = {}
        dic["patient"] = patient
        dic["round"] = RFRound
        dic["path"] = path
        RFLetterList.append(dic)
print(len(RFLetterList))

47


In [5]:
len(RFLetterList)

47

In [29]:
list(set([x["patient"]+"_"+str(x["round"]) for x in RFLetterList]))

['25926_0',
 '24948_0',
 '25884_0',
 '25866_0',
 '25869_0',
 '25236_0',
 '25063_0',
 '25096_0',
 '25941_0',
 '24984_0',
 '25087_0',
 '25060_0',
 '26058_0',
 '26013_0',
 '25009_0',
 '24813_0',
 '25734_0',
 '24885_2',
 '24493_0',
 '24861_0',
 '24954_0',
 '25132_0',
 '25093_0',
 '24957_0',
 '24804_0',
 '24843_0',
 '25435_0',
 '24960_0',
 '25078_0',
 '25677_0',
 '25114_0',
 '24864_0',
 '24828_0',
 '24816_0',
 '25845_0',
 '24822_2',
 '25833_0',
 '25812_0',
 '25027_0',
 '26196_0',
 '24822_1',
 '24840_0',
 '24837_0',
 '24891_0',
 '24885_1',
 '24918_0',
 '24991_1']

In [30]:
len(list(set([x["patient"]+"_"+str(x["round"]) for x in RFLetterList])))

47

In [5]:
sorted(RFLetterList, key=lambda x: x["patient"])

[{'patient': '24493',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24493_referral.docx'},
 {'patient': '24493',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/Anonymous data/24493_referral.docx'},
 {'patient': '24804',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24804_referral.docx'},
 {'patient': '24804',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/Anonymous data/24804_referral.docx'},
 {'patient': '24813',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24813_referral.docx'},
 {'patient': '24813',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/Anonymous data/24813_referral.docx'},
 {'patient': '24816',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24816_referral.docx'},
 {'patient': '24816',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralL

In [25]:
RFLetterList

[{'patient': '24885',
  'round': 2,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24885_referral2.docx'},
 {'patient': '24864',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24864_referral.docx'},
 {'patient': '24804',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24804_referral.docx'},
 {'patient': '25866',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/25866_referral.docx'},
 {'patient': '25009',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/25009_referral.docx'},
 {'patient': '25093',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/25093_referral.docx'},
 {'patient': '24840',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24840_referral.docx'},
 {'patient': '24837',
  'round': 0,
  'path': '/data/project/myworkspace/LOD/Dataset/ReferralLetter/24837_referral.docx'},
 {'patient': '2

In [15]:
"24885_referral2.docx".split("_referral")[-1].split(".docx")

['2', '']

In [16]:
"24885_referral.docx".split("_referral")[-1].split(".docx")

['', '']

In [23]:
ll /data/project/myworkspace/LOD/Dataset/ReferralLetter/24885_referral1.docx

-rwxr-xr-x 1 litian 20379 Apr  3 17:38 [0m[01;32m/data/project/myworkspace/LOD/Dataset/ReferralLetter/24885_referral1.docx[0m[K*


In [None]:
from docx import Document

In [7]:
from pypdf import PdfReader

reader = PdfReader("../Dataset/ReferralLetter/24948_referral.pdf")
for page in reader.pages:
    text = page.extract_text()
    print(text)

ways to contact us 
Tel:  0191 282 6600
Email:  tnu-tr.PCMH@nhs.net
Website: www.talkinghelpsnewcastle.org
 
Talking Helps Newcastle is an alliance between the services below information is shared across the alliance (please
see our website for further information) 
Office address:  3rd Floor, Newcroft House, Market Street East, Newcastle.  NE1 6ND
   
 
 
3 February 2021
Email: Jeanette.Walker-Guy@newcastle.ac.uk
Dear Jeanette Walker-Guy, 
Please find below a copy of my assessment notes from Jasmine Aherne's assessment on
27/01/2021. 
Jasmine said she is a student at Newcastle University, she said at present she is living in Burnley with her parents due to
lockdown and she plans to return to Newcastle after this lockdown has ended.
Main Problem: Jasmine said her main problem is anxiety, she also reported symptoms of depression and issues with
eating. Jasmine explained she has been aware of her anxiety since the age of 11. She described herself as being 'an
anxious child.' Said during 

In [8]:
doc = Document("../Dataset/ReferralLetter/24948_referral.docx")
for para in doc.paragraphs:
    print(para.text)






3 February 2021
Email: Jeanette.Walker-Guy@newcastle.ac.uk


Dear Jeanette Walker-Guy,
ways to contact us
Tel: 0191 282 6600
Email:  tnu-tr.PCMH@nhs.net Website: www.talkinghelpsnewcastle.org


Please find below a copy of my assessment notes from Jasmine Aherne's assessment on 27/01/2021.
Jasmine said she is a student at Newcastle University, she said at present she is living in Burnley with her parents due to lockdown and she plans to return to Newcastle after this lockdown has ended.
Main Problem: Jasmine said her main problem is anxiety, she also reported symptoms of depression and issues with eating. Jasmine explained she has been aware of her anxiety since the age of 11. She described herself as being 'an anxious child.' Said during school she was shy and kept herself to herself. She was unable to identify a specific trigger for her anxiety.
Jasmine explained over the past year her symptoms of anxiety have got much worse which has led to her experiencing symptoms of depressio

In [2]:
doc = Document("../Dataset/ReferralLetter/24493_referral.docx")
for para in doc.paragraphs:
    print(para.text)

Psychology Training Clinic

From:	Psychology Training Clinic
Sent:	29 April 2021 14:08
To:	Jade Beeching
Subject:	FW: 190181910/24420

Hi Jade
We will contact the client and offer her an assessment. Kind regards
Karen

From: Jade Beeching <Jade.Beeching@newcastle.ac.uk>
Sent: 28 April 2021 12:22
To: Psychology Training Clinic <psychology.clinic@newcastle.ac.uk>; Kate Aitchison
<kate.aitchison@newcastle.ac.uk>
Subject: 190181910 Hello,
I’ve recently received contact from the above student, Emily Hawkins. She has asked whether it would be possible to please receive some additional support following an increase in OCD symptoms.
She explained that she has experienced a decline in her mental health and although she is better able to manage the physical symptoms, the cognitive cycles have had a significant social impact. Emily reported that she often experiences panic and negative thought cycles which have affected her friendships.

Risk ‐ increased intrusive thoughts around self‐harm. Emily

In [3]:
doc = Document("../Dataset/ReferralLetter/24804_referral.docx")
for para in doc.paragraphs:
    print(para.text)

ASSESSMENT APPOINTMENT:

Date: 06/11/20
Student name: Olivia Jowett
Student’s preferred name: Oli
Student number: 180028320
Therapist: Alison Marley

Student confirmed their location prior to session beginning? Yes
Student agreed with “Information for Students” and confidentiality statement? Yes

Exploration of MDS scores and risk:
PHQ-9 = 9
GAD-7 = 15
Risk (question 9 in PHQ-9) = 0

Risk Assessment:
Current thoughts/plans/intent to self-harm: None disclosed when explored
Current thoughts/plans/intent of suicide: None disclosed when explored
Previous history of self-harm or suicide attempts: None disclosed when explored
Current risk to/from others: None disclosed when explored
Current or historical risk of self-neglect: None disclosed when explored

Presenting Problem:
Oli reported struggling with what she described as health/death anxiety, which is being exacerbated by the Covid-19 pandemic. She stated that she has always had worries about family members/friends becoming ill or dying 

In [5]:
doc = Document("../Dataset/ReferralLetter/Anonymous data/25236_referral.docx")
for para in doc.paragraphs:
    print(para.text)

Current thoughts/plans/intent to self-harm
Some thoughts and feelings of SH but NAME hasn’t acted upon these thoughts - she protects herself from telling herself to not do it. NAME feels it isn’t an option. 
Current thoughts/plans/intent of suicide
Less thoughts with this than the self-harm. NAME doesn’t want to die she just doesn’t want to live anymore with these feelings. NAME wants her feelings to end but not her life. No active plan currently. NAME stated 'I just wouldn't kill myself because I am scared and have protective factors in place'. 
Previous history of self-harm or suicide attempts
History of self harm but no history of suicide attempts 
Current risk to/from others
NAME is fearful of harm from others but rationally NAME knows they would not harm her. 
Current or historical risk of self-neglect
N/A 
Presenting Problem
NAME described a long term history of anxiety and low mood. She often worries about things outside of her control or future things, finding she blows things 