# Import Libraries

In [38]:
import spacy
import os
import json
import pandas as pd
import pickle
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.http import MediaIoBaseDownload
from bs4 import BeautifulSoup
import requests
import random
from spacy.training import Example
from spacy.util import minibatch, compounding
import io
import re
import datetime

# Data Collection

--------

In [39]:
# if csv exists, load it
if os.path.exists('../../clean_data/medications.csv'):
    medications = pd.read_csv('../../clean_data/medications.csv')
# otherwise, scrape the data
else:
    url = "https://healthy.kaiserpermanente.org/health-wellness/drug-encyclopedia."

    medications = []

    # iterate from 'a' to 'z'
    for letter in range(97, 123):
        url = url + chr(letter)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        for li in soup.select(".drug-column-4"):
            medications.append(li.text)
        
        url = "https://healthy.kaiserpermanente.org/health-wellness/drug-encyclopedia."

    # split by new line
    medications = [medication.split('\n') for medication in medications]
    # flatten and remove empty strings
    medications = [medication for sublist in medications for medication in sublist if medication != '']
    medications = pd.DataFrame(medications, columns=['medication'])
    medications.to_csv('../clean_data/medications.csv', index=False)


print(medications)

        medication
0    acetaminophen
1        acyclovir
2         Adderall
3        albuterol
4      alendronate
..             ...
375    Zithromycin
376         Zoloft
377       Zolpidem
378        Zovirax
379         Zyrtec

[380 rows x 1 columns]


380 medication names to randomize NER data

In [40]:
# Define the scope of the application
SCOPES = ['https://www.googleapis.com/auth/drive']
# Function to authenticate and create the service
def create_service():
    creds = None
    # The file token.pickle stores the user's access and refresh tokens.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    
    service = build('drive', 'v3', credentials=creds)
    return service

# Function to list files in a given folder ID
def list_files_in_folder(service, folder_id):
    results = service.files().list(q=f"'{folder_id}' in parents", fields="nextPageToken, files(id, name)").execute()
    items = results.get('files', [])
    return items

# Function to download a file
def download_or_export_file(service, file_id, file_name, mime_type):
    try:
        # Check if the file is a Google Doc by its MIME type
        if mime_type.startswith('application/vnd.google-apps.'):
            # Define export MIME type for Google Docs (e.g., 'application/pdf' for Google Docs)
            if mime_type == 'application/vnd.google-apps.document':
                export_mime_type = 'application/pdf'
                file_name += '.pdf'  # Append appropriate file extension
            elif mime_type == 'application/vnd.google-apps.spreadsheet':
                export_mime_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
                file_name += '.xlsx'  # Append appropriate file extension
            elif mime_type == 'application/vnd.google-apps.presentation':
                export_mime_type = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
                file_name += '.pptx'  # Append appropriate file extension
            else:
                # Default to PDF for other Google Apps documents
                export_mime_type = 'application/pdf'
                file_name += '.pdf'
            
            request = service.files().export_media(fileId=file_id, mimeType=export_mime_type)
        else:
            # For binary files, use the get_media method
            request = service.files().get_media(fileId=file_id)
        
        # Perform the download or export
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print(f"Download {int(status.progress() * 100)}%.")
        
        # Write the file's contents to a local file
        with open(file_name, 'wb') as f:
            f.write(fh.getbuffer())
        print(f"File '{file_name}' downloaded successfully.")
    
    except Exception as e:
        print(f"An error occurred: {e}")

def find_folders_by_name(service, folder_name):
    """Find folders by name and return their IDs."""
    query = f"mimeType='application/vnd.google-apps.folder' and name='{folder_name}'"
    response = service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
    return response.get('files', [])

def find_subfolder_id(service, parent_folder_id, subfolder_name):
    """Find a specific subfolder within a parent folder."""
    query = f"'{parent_folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and name='{subfolder_name}'"
    response = service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
    files = response.get('files', [])
    if files:
        return files[0]['id']  # Return the ID of the first matching subfolder
    return None

def download_txt_files_from_folder(service, folder_id):
    """Download all .txt files from a specified folder."""
    query = f"'{folder_id}' in parents and mimeType='text/plain'"
    response = service.files().list(q=query, spaces='drive', fields='files(id, name, mimeType)').execute()
    files = response.get('files', [])
    for file in files:
        print(f"Downloading/exporting {file['name']}...")
        download_or_export_file(service, file['id'], file['name'], file['mimeType'])

Toy diagnosis data. Might look into a better source, but this will work for now

In [41]:
# again, if csv exists, load it
if os.path.exists('../../clean_data/diagnoses.csv'):
    diagnoses = pd.read_csv('../../clean_data/diagnoses.csv')
# otherwise, download it from Google Drive
else:
    service = create_service()  # Assume this is implemented as shown before
    top_level_folder_names = ['Base-Game', 'Mod-Diagnoses']

    for folder_name in top_level_folder_names:
        folders = find_folders_by_name(service, folder_name)
        for folder in folders:
            dept_diagnoses_folder_id = find_subfolder_id(service, folder['id'], 'Dept-Diagnoses')
            if dept_diagnoses_folder_id:
                download_txt_files_from_folder(service, dept_diagnoses_folder_id)
    
    diagnoses = []
    # loop through all files and extract the text following '##' (diagnosis names)
    for file in os.listdir('../../raw_data/diagnoses'):
        with open(f'../../raw_data/diagnoses/{file}', 'r') as f:
            for line in f:
                if '##' in line:
                    diagnoses.append(line.split('##')[1].strip())
    
    diagnoses = pd.DataFrame(diagnoses, columns=['diagnosis']).to_csv('../../clean_data/diagnoses.csv', index=False)

print(diagnoses)

                                 diagnosis
0                  Acute Myeloid Leukaemia
1     Adrenocortical Carcinoma (Localised)
2    Adrenocortical Carcinoma (Metastatic)
3      Adrenocortical Carcinoma (Regional)
4                             ALL (B Cell)
..                                     ...
629                           Typhus Fever
630                           Valley Fever
631                        West Nile Fever
632                           Yellow Fever
633                             Zika Fever

[634 rows x 1 columns]


634 diagnoses to randomize NER training data

Some dosages are nonsensical, but will hopefully allow the model to generalize well

In [42]:
# randomly generate dosage data
dosages = []
units = ['mg', 'g', 'mL', 'L']
concat_every = 5
frequency = ['twice daily', 'once daily', 'as needed']

for hour in range(48):
    frequency.append(f'every {hour} hours')

for i in range(300):
    dosage = str(random.choice(range(5, 1001, 5))) + f' {random.choice(units)}'
    if i % concat_every == 0:
        dosage += ' ' + random.choice(frequency)
    dosages.append(dosage)

print(dosages)

['865 g twice daily', '720 L', '95 mL', '545 mg', '595 g', '400 mg every 15 hours', '110 mg', '390 mL', '225 g', '450 g', '40 mL every 1 hours', '105 mL', '90 mL', '375 mL', '735 g', '20 L every 20 hours', '235 mg', '920 mL', '800 g', '310 mg', '965 L every 17 hours', '650 mg', '30 L', '105 mL', '920 mg', '655 mg every 6 hours', '810 g', '345 mL', '505 g', '780 mL', '130 mL every 39 hours', '305 mg', '155 mL', '895 mL', '695 mg', '785 L every 32 hours', '20 mL', '880 g', '55 L', '335 L', '775 L every 23 hours', '725 mL', '475 L', '665 mL', '360 g', '370 mL every 10 hours', '290 g', '265 L', '485 mg', '665 mL', '540 L every 22 hours', '480 L', '780 L', '140 g', '685 g', '500 g every 34 hours', '265 mL', '860 mg', '105 g', '190 mL', '295 L every 7 hours', '295 mg', '385 L', '400 g', '330 mg', '785 L every 18 hours', '280 mg', '85 mg', '265 mg', '125 mg', '995 L every 32 hours', '375 g', '230 L', '30 g', '615 mL', '175 L every 41 hours', '715 L', '235 g', '915 mg', '460 mg', '110 g every 

Tests

In [43]:
# if csv exists, load it
if os.path.exists('../../clean_data/tests.csv'):
    tests = pd.read_csv('../../clean_data/tests.csv')
# otherwise, scrape the data
else:
    url = "https://medlineplus.gov/lab-tests/"

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    tests = []
    # select all uorderdered lists with class 'withident breaklist'
    for item in soup.select(".withident.breaklist"):
        tests.append(item.text)

    tests = pd.DataFrame(tests, columns=['test'])
    tests['test'] = tests['test'].str.split('\n')
    tests = tests.explode('test')
    tests = tests[tests['test'] != '']
    tests.to_csv('../../clean_data/tests.csv', index=False)

Symptoms

In [44]:
# if csv exists, load it
if os.path.exists('../../clean_data/symptoms.csv'):
    symptoms = pd.read_csv('../../clean_data/symptoms.csv')
else:
    symptoms = []
    pattern = r"\+\s(.+?)\s\(\d+% of cases \| .+?\)"

    # loop through all files and extract the text following '##' (diagnosis names)
    for file in os.listdir('../../raw_data/diagnoses'):
        with open(f'../../raw_data/diagnoses/{file}', 'r') as f:
            for line in f:
                match = re.search(pattern, line)
                if match:
                    symptoms.append(match.group(1))

    symptoms = pd.DataFrame(symptoms, columns=['symptom'])
    # make unique
    symptoms = symptoms.drop_duplicates()
    symptoms.to_csv('../../clean_data/symptoms.csv', index=False)

Dates

In [45]:
dates = []

# generate dates in the format "Month Day"
for i in range(1, 13):
    for j in range(1, 29):
        # year is arbitrary
        date = datetime.date(2021, i, j)
        full_date = f"{date.strftime('%B')} {date.strftime('%d')}"
        dates.append(full_date)

print(dates)

['January 01', 'January 02', 'January 03', 'January 04', 'January 05', 'January 06', 'January 07', 'January 08', 'January 09', 'January 10', 'January 11', 'January 12', 'January 13', 'January 14', 'January 15', 'January 16', 'January 17', 'January 18', 'January 19', 'January 20', 'January 21', 'January 22', 'January 23', 'January 24', 'January 25', 'January 26', 'January 27', 'January 28', 'February 01', 'February 02', 'February 03', 'February 04', 'February 05', 'February 06', 'February 07', 'February 08', 'February 09', 'February 10', 'February 11', 'February 12', 'February 13', 'February 14', 'February 15', 'February 16', 'February 17', 'February 18', 'February 19', 'February 20', 'February 21', 'February 22', 'February 23', 'February 24', 'February 25', 'February 26', 'February 27', 'February 28', 'March 01', 'March 02', 'March 03', 'March 04', 'March 05', 'March 06', 'March 07', 'March 08', 'March 09', 'March 10', 'March 11', 'March 12', 'March 13', 'March 14', 'March 15', 'March 

Times

In [46]:
times = []
# generate times in the format "Hour AM/PM"
for i in range(1, 13):
    for j in ["AM", "PM"]:
        time = f"{i} {j}"
        times.append(time)

Body parts

Hard-coding for now. Will change

In [47]:
anatomies = ["left femur", "right knee", "abdominal region", "left lung", "right lung", "left kidney", "right kidney",
             "left eye", "right eye", "left ear", "right ear", "left hand", "right hand", "left foot", "right foot",
             "left arm", "right arm", "left leg", "right leg", "left shoulder", "right shoulder", "left hip", "right hip",
             "left elbow", "right elbow", "left wrist", "right wrist", "left ankle", "right ankle", "left toe", "right toe",
             "left finger", "right finger", "left thumb", "right thumb", "left nostril", "right nostril", "left cheek", "right cheek",
             "left temple", "right temple", "left jaw", "right jaw", "left chin", "right chin", "left neck", "right neck", "left collarbone",
             "right collarbone", "left rib", "right rib", "left hip bone", "right hip bone", "left thigh", "right thigh", "left calf",
             "right calf", "left shin", "right shin", "left heel", "right heel", "left sole", "right sole", "left toe", "right toe",
             "left finger", "right finger", "left thumb", "right thumb", "left palm", "right palm", "left wrist", "right wrist", "left forearm",
             "right forearm", "left bicep", "right bicep", "left tricep", "right tricep", "left shoulder", "right shoulder", "left chest", "right chest",
             "left breast", "right breast", "left nipple", "right nipple", "left rib", "right rib", "left abdomen", "right abdomen", "left hip",
             "right hip", "left groin", "right groin", "left thigh", "right thigh", "left knee", "right knee", "left shin", "right shin", "left calf",
             "right calf", "left ankle", "right ankle", "left foot", "right foot", "left toe", "right toe", "left finger", "right finger", "left thumb",
             "right thumb", "left hand", "right hand", "left wrist", "right wrist", "left forearm", "right forearm", "left elbow", "right elbow",
             "left upper arm", "right upper arm", "left shoulder", "heart", "liver", "stomach", "intestines", "pancreas", "spleen", "bladder", "esophagus"]

len(anatomies)

132

In [48]:
# make unique
anatomies = list(set(anatomies))
len(anatomies)

92

-------------------

# Data Processing

---------

ToDo: Need to combine logic of next two cells to only generate a train_data.csv instead of two files

In [49]:
# Function to generate a random date
def generate_example():
    output = {}
    diagnosis = random.choice(diagnoses['diagnosis'].values)
    medication = random.choice(medications['medication'].values)
    dosage = random.choice(dosages)
    test_name = random.choice(tests['test'].values)
    symptom = random.choice(symptoms['symptom'].values)
    body_part = random.choice(anatomies)

    choices = [diagnosis, medication, dosage, test_name, symptom, body_part]
    choice_map = ['diagnosis', 'medication', 'dosage', 'test_name', 'symptom', 'body_part']
    entities = []

    text_elements = [
        f"The patient was diagnosed with {diagnosis} last year.",
        f"He has been prescribed {medication} {dosage}.",
        f"{test_name} measurements indicate {diagnosis}.",
        f"The {test_name} revealed a {diagnosis} in the {body_part}.",
        f"Patient presents with {symptom}.",
        f"Prescribe {dosage} of {medication} for pain relief.",
        f"The {test_name} shows normal {body_part} function.",
        f"She mentioned an allergy to {medication}.",
        f"Examine the {symptom} in the patient's {body_part}.",
    ]
    text = random.choice(text_elements)
    for index, choice in enumerate(choices):
        if text.find(choice) != -1:
            entities.append({"start": text.find(choice), "end": text.find(choice) + len(choice), "label": f"{choice_map[index]}"})
    
    output["text"] = text
    output["entities"] = entities
    return output

# Generate 1000 examples
train_data = [generate_example() for _ in range(1000)]

# Save the examples to a JSON file
file_path = '../../clean_data/train_data.json'
with open(file_path, 'w') as file:
    json.dump(train_data, file, indent=2)

In [50]:
# Load JSON data
with open('../../clean_data/train_data.json', 'r') as file:
    data = json.load(file)

# Convert the JSON data to the desired format
formatted_data = []
for item in data:
    text = item['text']
    entities = []
    for entity in item['entities']:
        start = entity['start']
        end = entity['end']
        label = entity['label']
        entities.append((start, end, label.upper()))  # Convert label to uppercase as shown in the example
    formatted_data.append((text, {"entities": entities}))

# Convert to DataFrame for easy CSV saving
df = pd.DataFrame(formatted_data, columns=['Text', 'Entities'])

# Save to CSV
df.to_csv('../../clean_data/formatted_train_data.csv', index=False)

In [51]:
entity_labels = set()
for text, annotations in formatted_data:
    for entity in annotations['entities']:
        entity_labels.add(entity[2])

print(entity_labels)

{'TEST_NAME', 'BODY_PART', 'MEDICATION', 'SYMPTOM', 'DOSAGE', 'DIAGNOSIS'}


---

In [52]:
retrain = True # Set to True to retrain the model

if os.path.exists('../../clean_data/models/ner_model') and not retrain:
    nlp = spacy.load('../../clean_data/models/ner_model')
else:
    # Load a blank model
    # nlp = spacy.blank('en')

    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm') # use a pre-trained model

    # Add the NER pipeline if not already present
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe('ner')
    else:
        ner = nlp.get_pipe('ner')

    # Add entity labels to the model
    for entity in entity_labels:
        ner.add_label(entity)

    optimizer = nlp.resume_training()
    for itn in range(10):  # Number of training iterations
        random.shuffle(formatted_data)
        losses = {}
        for batch in minibatch(formatted_data, size=compounding(4.0, 32.0, 1.001)):
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
        print("Losses", losses)

    # Save the model
    nlp.to_disk('../../clean_data/models/ner_model')

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 21.5 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 2055.0547043412544}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 832.7350446841641}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 394.5145583335001}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 260.49499817842656}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 173.4652899405811}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 147.779243923613}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 90.77088567459491}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 93.06171735056289}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 82.61153124124401}
Losses {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 162.6401452881953}


Will need more rigorous testing/improvement im sure, but fine for now

In [53]:
# Example text
text = "Patient was administered 500mg of Ibuprofen."

# Process the text
doc = nlp(text)

# Iterate over the predicted entities
for ent in doc.ents:
    print(ent.text, ent.label_)

500mg DOSAGE
Ibuprofen MEDICATION
