In [2]:
#Please execute this code first. The output of this step serves as the input for the subsequent code cells below.
#medications / rxnorm downloaded list
import pandas as pd
import re
from multiprocessing import Pool, cpu_count

# Read the downloaded RxNorm medication list from disk
rxnorm_data_path = 'rxnorm_medications_without_TTY_filter.csv'
rxnorm_data = pd.read_csv(rxnorm_data_path)

# Collect the lowercased medication names in a set so that membership tests
# against it are constant-time
rxnorm_set = {name for name in rxnorm_data['Medication Name'].str.lower()}

# Function to extract phrases of up to five words
def extract_phrases(text):
    """Return every run of one to five consecutive words found in ``text``.

    Words are the ``\\w+`` tokens of ``text``; each phrase is those tokens
    joined by single spaces. The result is a set, so a phrase that occurs
    several times is reported once.
    """
    tokens = re.findall(r'\b\w+\b', text)
    token_count = len(tokens)
    return {
        ' '.join(tokens[start:start + size])
        for start in range(token_count)
        for size in range(1, 6)
        # Skip windows that would run past the last token
        if start + size <= token_count
    }

# Function to process each row and find matching phrases
def _rxnorm_code_lookup():
    """Return ``{lowercased medication name: RxNorm ID}`` for ``rxnorm_data``.

    The mapping is built on first use and cached on the function object, so
    the table is lowercased and scanned once per process instead of once per
    matched phrase. It is rebuilt if ``rxnorm_data`` is rebound to another
    frame (for example when the loading cell is re-run).
    """
    cached = getattr(_rxnorm_code_lookup, '_cache', None)
    if cached is None or cached[0] is not rxnorm_data:
        names = rxnorm_data['Medication Name'].str.lower().values
        codes = rxnorm_data['RxNorm ID'].values
        lookup = {}
        for name, code in zip(names, codes):
            # setdefault keeps the first row of a duplicated name, which is
            # the row a `.values[0]` selection on the filtered frame returns.
            lookup.setdefault(name, code)
        cached = (rxnorm_data, lookup)
        _rxnorm_code_lookup._cache = cached
    return cached[1]


def process_row(row):
    """Expand one input row into one tuple per candidate medication phrase.

    Args:
        row: Mapping/Series with the keys ``discharge_medications``,
            ``subject_id``, ``diagnosis`` and ``intent``.

    Returns:
        A list of ``(subject_id, diagnosis, phrase, rxnorm_code, intent)``
        tuples, one for every phrase produced by ``extract_phrases`` on the
        stripped, lowercased medication text. ``rxnorm_code`` is the RxNorm ID
        of the phrase when it equals a lowercased name in ``rxnorm_data`` and
        ``''`` otherwise. Returns ``[]`` when the medication text is missing.
        Phrases are emitted in sorted order so the output is identical from
        run to run (plain set iteration order for strings is not).
    """
    if pd.isna(row['discharge_medications']):
        return []
    medication_block = row['discharge_medications'].strip().lower()
    code_by_name = _rxnorm_code_lookup()
    return [
        (row['subject_id'], row['diagnosis'], phrase, code_by_name.get(phrase, ''), row['intent'])
        for phrase in sorted(extract_phrases(medication_block))
    ]

# Function to apply processing to a chunk of data
def process_chunk(chunk):
    """Run ``process_row`` on every row of ``chunk``.

    Returns a list with one entry per row, each entry being whatever
    ``process_row`` returned for that row.
    """
    per_row_results = chunk.apply(func=process_row, axis=1)
    return per_row_results.tolist()

# Load the input CSV file
file_path = 'Structured_format_output_file.csv'
data = pd.read_csv(file_path)

# Split data into contiguous chunks for parallel processing. Stepping through
# the frame in strides of chunk_size yields at most num_chunks chunks and never
# an empty one, so a worker is never handed a frame with no rows (which would
# only work through pandas' special handling of apply() on an empty frame).
num_chunks = cpu_count()
chunk_size = len(data) // num_chunks + 1
chunks = [data.iloc[start:start + chunk_size] for start in range(0, len(data), chunk_size)]

# Use multiprocessing to process chunks in parallel.
# NOTE(review): the workers must be able to resolve process_chunk; with the
# "spawn" start method, functions defined in a notebook cell may not be
# importable by child processes - confirm on the target platform.
with Pool(num_chunks) as pool:
    results = pool.map(process_chunk, chunks)

# Flatten the nesting: chunks -> rows -> per-phrase tuples
processed_data = [item for sublist in results for subsublist in sublist for item in subsublist]

# Create DataFrame from the processed data
processed_df = pd.DataFrame(processed_data, columns=['subject_id', 'diagnosis', 'medication_name', 'rxnorm_code', 'intent'])

# Save the new dataframe to a CSV file
output_file_path = 'discharge_medications_with_rxnorm_codes_part1.csv'
processed_df.to_csv(output_file_path, index=False)

print(f"File saved to {output_file_path}")


File saved to discharge_medications_with_rxnorm_codes_wholeblock_draft2.xlsx


In [8]:
#For the remaining medications, which had no match in the downloaded RxNorm list, the RxNorm API is used.
#The output of the code above serves as the input for this code.

import pandas as pd
import aiohttp
import asyncio
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

# Apply the nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load the dataset. medication_name is read as text so that a purely numeric
# phrase is not parsed as a number (which would give it no string length).
dataset_path = 'discharge_medications_with_rxnorm_codes_part1.csv'
data = pd.read_csv(dataset_path, dtype={'medication_name': str})

# Keep only medication names of at least 3 characters. A missing name has a
# NaN length, which compares False here, so it is routed to the discarded rows
# instead of silently disappearing from both frames.
name_length = data['medication_name'].str.len()
is_usable = name_length >= 3
discarded_data = data[~is_usable]
# .copy() so later column assignments act on an independent frame, not a slice
data = data[is_usable].copy()

# Save discarded rows to a separate file
discarded_output_path = 'discharged_discarded_medications.csv'
discarded_data.to_csv(discarded_output_path, index=False)

# Asynchronous function to validate medication name and get RxNorm code using RxNorm API
rxnorm_cache = {}

async def fetch_rxnorm_code(name, session):
    """Look up the RxNorm ID for ``name`` through the RxNav ``rxcui.json`` endpoint.

    Args:
        name: Medication name to look up.
        session: Object exposing an aiohttp-style ``get(url, params=...)``
            that returns an async context manager yielding the response.

    Returns:
        ``(name, rxnorm_code)``. ``rxnorm_code`` is the first entry of
        ``idGroup.rxnormId`` in the JSON reply, or ``None`` when the reply
        carries no ID, the HTTP status is not 200, or the request raised.

    Answered lookups (including "no ID found") are stored in ``rxnorm_cache``.
    Failed requests are not cached, so a transport error or a non-200 status
    is never recorded as "this name has no code".
    """
    if name in rxnorm_cache:
        return name, rxnorm_cache[name]

    url = "https://rxnav.nlm.nih.gov/REST/rxcui.json"
    try:
        # Pass the name as a query parameter so the client percent-encodes it
        # rather than splicing raw text into the URL.
        async with session.get(url, params={'name': name}) as response:
            status = response.status
            payload = await response.json() if status == 200 else None
    except Exception as e:
        # Best-effort lookup: report the failure and carry on with the rest
        print(f"Error fetching {name}: {e}")
        return name, None

    if status != 200:
        print(f"Error fetching {name}: HTTP status {status}")
        return name, None

    id_group = payload.get('idGroup') if isinstance(payload, dict) else None
    rxnorm_ids = id_group.get('rxnormId') if isinstance(id_group, dict) else None
    # A missing or empty ID list means the service has no match for this name
    rxnorm_code = rxnorm_ids[0] if rxnorm_ids else None
    rxnorm_cache[name] = rxnorm_code
    return name, rxnorm_code

async def get_rxnorm_codes(medications):
    """Look up every name in ``medications`` concurrently over one client session.

    Returns the list of results gathered from ``fetch_rxnorm_code``, one per
    input name, while showing a progress bar labelled "Fetching RxNorm codes".
    """
    async with aiohttp.ClientSession() as session:
        lookups = []
        for medication in medications:
            lookups.append(fetch_rxnorm_code(medication, session))
        gathered = await tqdm_asyncio.gather(*lookups, desc="Fetching RxNorm codes")
    return gathered

def _normalise_code(value):
    """Return a code read back from the part-1 CSV as a string, or None if blank.

    After the CSV round-trip a blank code is NaN and a numeric ID may have been
    parsed as a float (e.g. 1191.0); both are normalised here.
    """
    if pd.isna(value) or value == '':
        return None
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)

# Codes already assigned from the downloaded RxNorm list (None where there is none)
existing_codes = data['rxnorm_code'].map(_normalise_code)

# Only names that still lack a code need an API lookup; take the unique ones
# to reduce API calls further
unique_medications = data.loc[existing_codes.isna(), 'medication_name'].unique()

# Fetch RxNorm codes for unique medications
# NOTE(review): each batch issues up to batch_size requests concurrently -
# check this against the API's published rate limits.
batch_size = 500 # Adjust batch size for optimal performance
rxnorm_codes = []

async def main():
    """Fetch codes for ``unique_medications`` batch by batch into ``rxnorm_codes``."""
    for i in range(0, len(unique_medications), batch_size):
        batch_medications = unique_medications[i:i + batch_size]
        batch_rxnorm_codes = await get_rxnorm_codes(batch_medications)
        rxnorm_codes.extend(batch_rxnorm_codes)

# Run the async main function
asyncio.run(main())

# Convert to a dictionary for quick lookup
rxnorm_codes_dict = {name: code for name, code in rxnorm_codes if code is not None}

# Keep the locally matched code where one exists and fall back to the API
# result otherwise, so a failed or empty API reply can never erase a code that
# was already found. Rows with neither get ''.
api_codes = data['medication_name'].map(rxnorm_codes_dict)
data = data.assign(rxnorm_code=existing_codes.where(existing_codes.notna(), api_codes).fillna(''))

# Regroup the rows by subject_id.
# NOTE(review): all_subject_ids is taken from `data` itself, so this merge
# cannot add subjects that have no rows in `data`; if subjects without
# medications must appear in the output, the IDs have to come from the
# original input file.
all_subject_ids = data['subject_id'].unique()
final_df = pd.merge(pd.DataFrame(all_subject_ids, columns=['subject_id']), data, on='subject_id', how='left')

# Save the output to a new CSV file
output_path = 'discharge_medications_with_rxnorm_codes_part2.csv'
final_df.to_csv(output_path, index=False)

print(f"Output file saved to {output_path}")
print(f"Discarded rows saved to {discarded_output_path}")


Fetching RxNorm codes: 100%|██████████| 500/500 [00:03<00:00, 132.84it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 180.74it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 177.16it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 169.85it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 178.82it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 175.79it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:03<00:00, 144.29it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 179.91it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 180.93it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 169.03it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 176.11it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 170.32it/s]
Fetching RxNorm codes: 100%|██████████| 500/500 [00:02<00:00, 180.82it/s]
Fetching RxNorm codes: 100%|██████████

Output file saved to discharge_meds_output_with_rxnorm_codes_via_api_wholeblock_v1.xlsx
Discarded rows saved to discharged_discarded_medications_v1.csv


In [6]:
#The file generated by the second code cell above serves as the input for the code below.
#It filters out rows whose medication name is contained in another medication name of the same subject.
import pandas as pd

# Read the CSV produced by the previous cell
file_path = 'discharge_medications_with_rxnorm_codes_part2.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Function to filter out rows where medication name is a subtext of another medication name
def filter_medications(group):
    """Drop rows whose medication name is contained in another name of the group.

    Args:
        group: DataFrame with a ``medication_name`` column.

    Returns:
        The rows of ``group`` whose name is not a proper substring of a
        different name in the same group. Rows whose name is not a string
        (e.g. a missing value) are returned unchanged.

    The comparison is a character-level ``in`` test on the unique names, so
    repeated names are compared once instead of once per row.
    NOTE(review): because the test is character-level, a short name is also
    dropped when it only occurs inside a longer word of another name -
    confirm that this is intended.
    """
    names = [name for name in group['medication_name'].unique() if isinstance(name, str)]
    contained = {
        name
        for name in names
        if any(name != other_name and name in other_name for other_name in names)
    }
    return group[~group['medication_name'].isin(contained)]

# Apply the filter function to each subject_id group and stitch the results
# back together. The groups are iterated explicitly rather than passed through
# groupby(...).apply(...), whose handling of the grouping column is deprecated
# in recent pandas releases; this way every group keeps its subject_id column.
filtered_groups = [filter_medications(group) for _, group in data.groupby('subject_id')]
if filtered_groups:
    filtered_data = pd.concat(filtered_groups, ignore_index=True)
else:
    # No groups at all (empty input): keep the columns, with zero rows
    filtered_data = data.iloc[0:0].reset_index(drop=True)

# Save the filtered dataset to a new CSV file
output_file_path = 'medication_order_output_file.csv'
filtered_data.to_csv(output_file_path, index=False)

print(f"Filtered dataset saved to {output_file_path}")


Filtered dataset saved to Medications_on_discharge_filtered_subsets.xlsx
