In [1]:
import pandas as pd

file_path = "../dataset/Radiologists Notes for Lumbar Spine MRI Dataset/Radiologists Report.xlsx"

### Overall view of the dataset

Two columns
- Patient ID: Patient ID
- Clinician's Notes: Patient note - may be the coditions to be query on

Data:
- There is null data for "Clinician's Notes" column - This is the removed studies mentioned in the paper --> We will remove them too

Conclusion:
- Patient ID can be extracted to be the node property
- Clinician's Notes needs more investigation before the start of implementation

In [2]:
df = pd.read_excel(file_path)

patient_id, patient_note = df.columns

# df[patient_note] = df[patient_note].astype(str)

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575 entries, 0 to 574
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Patient ID         575 non-null    int64 
 1   Clinician's Notes  515 non-null    object
dtypes: int64(1), object(1)
memory usage: 9.1+ KB
None
   Patient ID                                  Clinician's Notes
0           1  L4-5: degenerative annular disc bulge is noted...
1           2  No evidence of disc herniation.\nNo significan...
2           3  LSS MRI\nFeatures of muscle spasm.\nsmall cent...
3           4  Feature of muscle spasm.\nDiffuse disc bulges ...
4           5  LSS MRI :\nFeature of muscle spasm.\nDiffuse d...


In [3]:
df = df[~df[patient_note].apply(lambda x: not isinstance(x, str))]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 515 entries, 0 to 574
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Patient ID         515 non-null    int64 
 1   Clinician's Notes  515 non-null    object
dtypes: int64(1), object(1)
memory usage: 12.1+ KB
None


### Investigation into Clinician's Notes

The number of symptoms varies from patient to patient mostly focusing on the range from 1 to 5 symptoms per patient, while the highest record witnesses 12 symptoms for a patient  

In [4]:
note_quantity = df[patient_note].apply(lambda x: len([z for z in x.split("\n") if z.strip() != ""]))

print(note_quantity.describe())

for q in note_quantity.unique():
    print(f"Number of notes with {q} lines: {df[note_quantity == q].shape[0]}")

count    515.000000
mean       3.355340
std        1.458887
min        1.000000
25%        2.000000
50%        3.000000
75%        4.000000
max       10.000000
Name: Clinician's Notes, dtype: float64
Number of notes with 1 lines: 42
Number of notes with 2 lines: 97
Number of notes with 4 lines: 124
Number of notes with 3 lines: 164
Number of notes with 5 lines: 53
Number of notes with 6 lines: 18
Number of notes with 7 lines: 11
Number of notes with 10 lines: 2
Number of notes with 9 lines: 2
Number of notes with 8 lines: 2


In [18]:
import re

def clean(text):
    text = re.sub(r'^[^A-Za-z]+|[^A-Za-z]+$', '', text)
    text = text.strip()
    if not text.isupper():
        text = text.lower()
    return text

note_description = df[patient_note].apply(lambda x: [clean(z) for z in x.split("\n") if z.strip() != ""])
unique_value = set()

def update_value(value_list):
    unique_value.update(value_list)
    
note_description.apply(update_value)


print(f"Total unique notes: {len(unique_value)}")


unique_value_count = dict.fromkeys(unique_value, 0)

def update_count(value_list):
    for v in value_list:
        unique_value_count[v] += 1

note_description.apply(update_count)

count = 0
for key, value in unique_value_count.items():
    if value > 10:
        print(f"Note: {key} - Count: {value}")
        count += 1
        
print(f"Total notes with more than 10 occurrences: {count}\n\n")


for key, value in unique_value_count.items():
    if key.isupper():
        print(f"Unique Note: {key}")


Total unique notes: 845
Note: feature of muscle spasm - Count: 175
Note: no significant thecal sac or nerve root compression - Count: 11
Note: the spinal canal is still adequate - Count: 12
Note: no evidence of disc herniation - Count: 48
Note: LSS MRI - Count: 303
Note: mild disc bulge noted at l4-l5 level - Count: 11
Note: features of muscle spasm - Count: 20
Note: lumbosacral mri - Count: 49
Note: no significant thecal sac or nerve root compression noted - Count: 62
Note: adequate spinal canal - Count: 47
Total notes with more than 10 occurrences: 10


Unique Note: MRI OF THE L. SPINE
Unique Note: L5-S1, L4-L
Unique Note: MRI LSS
Unique Note: L3-L4, L4-L
Unique Note: MRI OF THE LUMBOSACRAL SPINE
Unique Note: NB
Unique Note: C.SPINE MRI
Unique Note: L.SS MRI
Unique Note: D SPINE MRI
Unique Note: LSS MRI L
Unique Note: LSS  MRI
Unique Note: MRI OF THE C. SPINE
Unique Note: MRI OF THE LUMBAR SPINE
Unique Note: SI JOINTS MRI
Unique Note: L4-L
Unique Note: LSS MRI
Unique Note: C SPINE AN