In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd /content/drive/MyDrive/HRP/

/content/drive/MyDrive/HRP


In [3]:
!ls

extracted_pharmacy_notes.csv  unique_pharmacy_notes.csv


In [4]:
!pip install gpt4all[cuda]

Collecting gpt4all[cuda]
  Downloading gpt4all-2.8.2-py3-none-manylinux1_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-runtime-cu11 (from gpt4all[cuda])
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cublas-cu11 (from gpt4all[cuda])
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading gpt4all-2.8.2-py3-none-manylinux1_x86_64.whl (121.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl (417.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl (875 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m50.5 MB/s[0m et

In [5]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from gpt4all import GPT4All
from pathlib import Path
from tqdm import tqdm

In [6]:
model_dir = Path(r"./")
model_dir.mkdir(exist_ok=True)

model = GPT4All(
    model_name="Meta-Llama-3-8B-Instruct.Q4_0.gguf",
    model_path=model_dir,
    allow_download=True,
    n_threads=4,
    device="cuda", # lets use that gpu
    ngl=100
)

Downloading: 100%|██████████| 4.66G/4.66G [01:52<00:00, 41.3MiB/s]
Verifying: 100%|██████████| 4.66G/4.66G [00:29<00:00, 160MiB/s]


In [7]:
def make_prompt(text):
  prompt = (
      "You are a medical expert. Extract dosages for all drugs: "
      "Return ONLY the following list [(drug,dosage,unit,frequency)] for each drug return nothing else at all"
      f"{text}\nAnswer:"
  )
  return prompt

def extract_with_gpt4all(row, max_tokens=256):
    prompt = make_prompt(row["note_text"])

    response = model.generate(prompt, n_predict=max_tokens)
    return response.strip()

In [8]:
csv_path = "unique_pharmacy_notes.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,note_row_id,note_text
0,314570,"""TITLE: PHARMACY\n SEDATION\n Mr. [**Known l..."
1,314572,"""TITLE: PHARMACY\n SEDATION\n ASSESSMENT:\n ..."
2,314582,"""PHARMACY - VANCOMYCIN\n ASSESSMENT:\n Mr...."
3,314707,"""PHARMACY\n VANCO DOSING IN CRRT\n ASSESSMEN..."
4,314739,"""Pharmacy Note\n TRANSITIONING and WEANING O..."


In [9]:
tqdm.pandas(desc="Extracting with GPT4All")
df["extraction"] = df.progress_apply(extract_with_gpt4all, axis=1)

df.to_csv("extracted_pharmacy_notes.csv", index=False)

Extracting with GPT4All: 100%|██████████| 103/103 [25:19<00:00, 14.75s/it]


In [10]:
df.head()

Unnamed: 0,note_row_id,note_text,extraction
0,314570,"""TITLE: PHARMACY\n SEDATION\n Mr. [**Known l...","[(fentanyl,50 mcg/hr,hourly), (midazolam,2 mg/..."
1,314572,"""TITLE: PHARMACY\n SEDATION\n ASSESSMENT:\n ...","[(drug, dosage, unit, frequency)]\n[(fentanyl,..."
2,314582,"""PHARMACY - VANCOMYCIN\n ASSESSMENT:\n Mr....","[(vancomycin,1000 mg,q48h)] Note that there is..."
3,314707,"""PHARMACY\n VANCO DOSING IN CRRT\n ASSESSMEN...","[(Vancomycin,1 gram,PRN,"""")]\n([(Vancomycin,1 ..."
4,314739,"""Pharmacy Note\n TRANSITIONING and WEANING O...","[(methadone,20mg,q6h),(lorazepam,2mg,four hour..."
