<a href="https://colab.research.google.com/github/Phani-Raj-Goud/MIMIC-Data-Extraction/blob/main/mimic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#MIMIC Dataset ICD Code extraction

In [1]:
!pip install chromadb sentence-transformers

Collecting chromadb
  Downloading chromadb-1.3.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.1-py3-none-any.whl.metadata (2.5 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [2]:
import kagglehub
import nltk
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions

### Download Dataset containing patient discharge notes

In [3]:
path1 = kagglehub.dataset_download("mehrnooshazizi/mimic-iv-dataset")
print("Path to dataset files:", path1)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mehrnooshazizi/mimic-iv-dataset?dataset_version_number=1...


100%|██████████| 4.02M/4.02M [00:00<00:00, 138MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/mehrnooshazizi/mimic-iv-dataset/versions/1





### Download Dataset containing MIMIC - III ICD 9 codes

In [4]:
path2 = kagglehub.dataset_download("bilal1907/mimic-iii-10k")
print("Path to dataset files:", path2)

Downloading from https://www.kaggle.com/api/v1/datasets/download/bilal1907/mimic-iii-10k?dataset_version_number=1...


100%|██████████| 1.12G/1.12G [00:13<00:00, 88.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/bilal1907/mimic-iii-10k/versions/1


### Load ICD codes for diagnoses

In [5]:
icd_dignoses_df = pd.read_csv(path2+'/MIMIC -III (10000 patients)/D_ICD_DIAGNOSES/D_ICD_DIAGNOSES.csv')

In [6]:
icd_dignoses_df.head()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."


In [7]:
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
DB_PATH = "./chromadb_icd_codes"
COLLECTION_NAME = "icd_codes_collection"
TOP_K_RESULTS = 1
MAX_BATCH_SIZE = 5000 # Max batch size for chromadb is 5461

### Fucntion to chunck ICD codes into processable batch sizes

In [8]:
def chunk_list(data_list, batch_size):
    for i in range(0, len(data_list), batch_size):
        yield data_list[i:i + batch_size]

### Function to create chromaDB vector database consisting of ICD 9 code descriptions

In [9]:
def setup_chroma_db_and_load_data(df):
    print(f"Initializing ChromaDB with model: {EMBEDDING_MODEL}")

    # Initialize the embedding function
    hf_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_MODEL
    )

    # Initialize ChromaDB client (local storage)
    client = chromadb.PersistentClient(path=DB_PATH)

    # Create a collection
    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=hf_ef
    )

    print(f"Loaded {len(df)} records from data source.")


    documents = df['LONG_TITLE'].tolist()
    metadatas = [{'icd9_code': code} for code in df['ICD9_CODE'].tolist()]
    ids = [f"doc_{i}" for i in range(len(df))]

    doc_chunks = list(chunk_list(documents, MAX_BATCH_SIZE))
    meta_chunks = list(chunk_list(metadatas, MAX_BATCH_SIZE))
    id_chunks = list(chunk_list(ids, MAX_BATCH_SIZE))

    total_chunks = len(doc_chunks)
    print(f"Splitting data into {total_chunks} batches of size up to {MAX_BATCH_SIZE}.")

    for i in range(total_chunks):
        print(f"  -> Processing batch {i + 1}/{total_chunks} (Size: {len(doc_chunks[i])})")

        # Add the current batch of documents, metadata, and IDs
        collection.add(
            documents=doc_chunks[i],
            metadatas=meta_chunks[i],
            ids=id_chunks[i]
        )

    print(f"\nSuccessfully added ALL data to ChromaDB collection: {COLLECTION_NAME}")
    return collection

### Function to perform rag

In [10]:
def find_relevant_codes(collection, excerpt):

    # The query method embeds the excerpt' and compares its vector against all vectors in the collection.
    results = collection.query(
        query_texts=[excerpt],
        n_results=TOP_K_RESULTS,
        include=['metadatas', 'distances', 'documents']
    )
    relevant_codes = []

    if results and 'metadatas' in results and results['metadatas']:
        for i in range(len(results['metadatas'][0])):
            metadata = results['metadatas'][0][i]
            distance = results['distances'][0][i]
            description = results['documents'][0][i]

            relevant_codes.append({
                'icd9_code': metadata['icd9_code'],
                'similarity_score': round(1 - distance, 4), # 1 - distance gives a score closer to 1 for high similarity
                'original_description': description
            })

    return relevant_codes

### Create ICD9 code vector database

In [11]:
icd9_codes_collection = setup_chroma_db_and_load_data(icd_dignoses_df)

Initializing ChromaDB with model: all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded 14567 records from data source.
Splitting data into 3 batches of size up to 5000.
  -> Processing batch 1/3 (Size: 5000)
  -> Processing batch 2/3 (Size: 5000)
  -> Processing batch 3/3 (Size: 4567)

Successfully added ALL data to ChromaDB collection: icd_codes_collection


### Fucntion to break text into sentences

In [12]:
def segment_text_into_sentences(text):
    # Ensure the required tokenizer data is downloaded
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        nltk.download('punkt_tab')

    # Use sent_tokenize to split the text
    sentences = nltk.sent_tokenize(text)

    return sentences

### Take the top 5 text description of patient discharge data for ICD code extraction

In [13]:
discharge_notes_df = pd.read_csv(path1+"/mimic_iv_summarization_test_dataset_shortened.csv")
excerpts = discharge_notes_df['text'].head(5).tolist()

### Extract IDC codes using

In [14]:
icd9_code_dict = {}
icd9_code_list = []
for excerpt in excerpts:
  icd9_code_set = set()
  for sentence in segment_text_into_sentences(excerpt):
    recommended_codes = find_relevant_codes(icd9_codes_collection, sentence)[0]
    if recommended_codes['similarity_score'] < 0.5:
      continue
    icd9_code_dict[recommended_codes['icd9_code']] = recommended_codes['original_description']
    icd9_code_set.add(recommended_codes['icd9_code'])
  icd9_code_list.append(list(icd9_code_set))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### Display extracted ICD codes for the original text

In [15]:
for i, excerpt in enumerate(excerpts):
  print('-------------------------------------------------------------------Original text---------------------------------------------------------')
  print(excerpt)
  print('---------------------------------------------------------------Extracted ICD codes-------------------------------------------------------')
  for icd9_code in icd9_code_list[i]:
    print(f"{icd9_code} : {icd9_code_dict[icd9_code]}" )

-------------------------------------------------------------------Original text---------------------------------------------------------
 
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___
 
Chief Complaint:
Worsening ABD distension and pain 
 
Major Surgical or Invasive Procedure:
Paracentesis

 
History of Present Illness:
___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, 
bioplar, PTSD, presented from OSH ED with worsening abd 
distension over past week.  
Pt reports self-discontinuing lasix and spirnolactone ___ weeks 
ago, because she feels like "they don't do anything" and that 
she "doesn't want to put more chemicals in her." She does not 
follow Na-restricted diets. In the past week, she notes that she 
has been having worsening abd distension and discomfort. She 
denies ___ edem