In [3]:
# Step 1: run this cell first. It writes structured_discharge_dosage_data.xlsx, which the second cell below reads as its input.
import pandas as pd
import re

# Load the medication dataset
medication_data_path = 'Discharge_medications.csv'
medication_data = pd.read_csv(medication_data_path)


# Load the cleaned medications dataset
cleaned_medications_path = 'cleaned_medications.xlsx'
cleaned_medications = pd.read_excel(cleaned_medications_path, sheet_name='1313MOD')

# Blank spreadsheet cells are read as NaN (a float), and re.escape() raises
# TypeError on non-strings when the names are later combined into a regex,
# so missing values are dropped and the rest coerced to str. Whitespace-only
# names are skipped as well: they carry no medication name to search for.
cleaned_medications_list = [
    name
    for name in cleaned_medications['cleaned_medication_name'].dropna().astype(str)
    if name.strip()
]

# Function to parse and extract medication descriptions
def _whole_word(name):
    """Return a regex fragment matching ``name`` literally, but not inside a longer word."""
    # Lookarounds are used instead of \b because \b needs a word character on
    # one side of the boundary, so it can never anchor a name that starts or
    # ends with punctuation such as "(" or ")".
    return r"(?<!\w)" + re.escape(name) + r"(?!\w)"


def parse_medication_description(description, medication_name, known_medications=None):
    """Extract the part(s) of a free-text description that belong to one medication.

    Every occurrence of ``medication_name`` in ``description`` is captured
    together with the text that follows it, up to (but not including) the next
    occurrence of any *other* known medication name, or the end of the text.
    Matching is case-insensitive and spans line breaks.

    Parameters
    ----------
    description : str
        Free text that may mention several medications. Non-string values
        (for example the NaN pandas uses for an empty cell) yield ``""``.
    medication_name : str
        The medication whose segment should be extracted. Non-string or blank
        values yield ``""``.
    known_medications : iterable of str, optional
        Names that terminate a segment. Defaults to the module-level
        ``cleaned_medications_list``. Non-string and blank entries are ignored.

    Returns
    -------
    str
        The captured segments, each stripped, joined with a newline; ``""``
        when the medication is not mentioned.
    """
    # re.escape / re.findall raise TypeError on NaN and other non-strings.
    if not isinstance(description, str) or not isinstance(medication_name, str):
        return ""

    target = medication_name.strip()
    if not description or not target:
        return ""

    if known_medications is None:
        known_medications = cleaned_medications_list

    # Collect the names that end a segment. The comparison with the target is
    # case-insensitive because the regex itself runs with re.IGNORECASE;
    # otherwise "Aspirin" in the list would cut short a segment for "aspirin".
    target_key = target.casefold()
    seen_keys = {target_key}
    stop_alternatives = []
    for candidate in known_medications:
        if not isinstance(candidate, str):
            continue
        candidate = candidate.strip()
        key = candidate.casefold()
        if not candidate or key in seen_keys:
            continue
        seen_keys.add(key)
        stop_alternatives.append(_whole_word(candidate))

    # "$" is always a valid terminator. Keeping it inside the joined list means
    # the lookahead never contains an empty alternative, even when there are no
    # other medications (an empty alternative would end every segment at once).
    stop_alternatives.append("$")

    pattern = "(" + _whole_word(target) + ".*?)(?=" + "|".join(stop_alternatives) + ")"
    segments = re.findall(pattern, description, re.DOTALL | re.IGNORECASE)

    return "\n".join(segment.strip() for segment in segments)

# Apply the function to extract the medication descriptions.
# The two input columns are walked in parallel instead of using
# DataFrame.apply(axis=1): no Series has to be built for every row, and the
# result is always a plain list with one entry per row, so the assignment
# also works when the frame has no rows.
medication_data['extracted_discharge_description'] = [
    parse_medication_description(description, medication)
    for description, medication in zip(
        medication_data['discharge_medication_description'],
        medication_data['discharge_medications'],
    )
]

# Save the structured DataFrame to a new Excel workbook
output_path = 'structured_discharge_dosage_data.xlsx'
medication_data.to_excel(output_path, index=False)

print(f"Data has been parsed and saved successfully to {output_path}!")


Data has been parsed and saved successfully to structured_discharge_dosage_data2.xlsx!


In [1]:
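# Step 2: read the workbook written by the first cell and extract dose, route and frequency from each description.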
import pandas as pd
import re

# Load the additional keyword files
route_keywords_df = pd.read_excel('route.xlsx')
dose_keywords_df = pd.read_excel('orderable_drug_form.xlsx')


def _keyword_list(column):
    """Turn one spreadsheet column into a clean list of keyword strings.

    Missing cells are dropped; every remaining value is converted to ``str``
    (re.escape raises TypeError on numbers) and stripped; blank and duplicate
    entries are removed. The result is ordered longest first so that, once the
    keywords are joined into a regex alternation, a short keyword cannot win
    over a longer keyword that starts with it.
    """
    cleaned = {str(value).strip() for value in column.dropna()}
    cleaned.discard('')
    # Secondary alphabetical key keeps the order deterministic between runs.
    return sorted(cleaned, key=lambda keyword: (-len(keyword), keyword))


# Extract keywords from the files
route_keywords = _keyword_list(route_keywords_df['Route'])
dose_keywords = _keyword_list(dose_keywords_df['Drug Form'])

# Combine the keywords into the regex patterns (escaped so they match literally)
route_pattern_keywords = '|'.join(map(re.escape, route_keywords))
dose_pattern_keywords = '|'.join(map(re.escape, dose_keywords))

# Define the route pattern
# Built-in route names and abbreviations, already written as a regex alternation.
_builtin_route_alternatives = (
    r'PO|IV|IM|SC|subcutaneous|BUCC|SL|INH|ID|intranasal|intrathecal|rectal|PR|'
    r'topical|TOP|transdermal|oral|mouth|TD|ophthalmic|OPH|otic|OTIC|'
    r'vaginal|VAG|nasal|NAS|endotracheal|ET|intraosseous|IO|'
    r'intraarterial|IA|intraperitoneal|IP|intravesical|IVes|'
    r'intracardiac|IC|intracerebroventricular|ICV|epidural|EPI|'
    r'intracerebral|ICere|nebs|nebulizer'
)

# The built-in list and the keywords loaded from the spreadsheet are joined
# with an explicit '|'. Plain concatenation would glue "nebulizer" onto the
# first file keyword, so that neither of the two could match on its own.
# Empty parts are skipped so an empty keyword file cannot leave a dangling '|'.
# The lookarounds act like \b for ordinary words but, unlike \b, also work
# for file keywords that begin or end with punctuation.
route_pattern = (
    r'(?<!\w)(?:'
    + '|'.join(part for part in (_builtin_route_alternatives, route_pattern_keywords) if part)
    + r')(?!\w)'
)


# Define the dose pattern
# A dose is: a number, an optional second number in parentheses, then a unit.
# \d+ (instead of \d{1,3}) lets plain amounts of four or more digits such as
# "1000 mg" match; comma-grouped amounts such as "1,000" are still accepted.
_dose_number = r'\d+(?:,\d{3})*(?:\.\d+)?'

# Built-in units, one list entry per unit so that no separator can be
# forgotten between two neighbouring entries.
_builtin_dose_units = [
    'gm', 'g', 'mg', 'mcg', 'ml', 'units', 'IU', 'tablet', 'tablets', 'capsule', 'capsules',
    'tab', 'tabs', 'cap', 'caps', 'syringe', 'patch', 'patches', 'drop', 'drops',
    'spray', 'sprays', 'puff', 'puffs', '%', 'percent', 'pill', 'pills', 'unit',
    'dose', 'doses', 'pack', 'packs', 'vial', 'vials', 'suppository', 'suppositories',
    'inhalation', 'inhalations', 'ml/hr', 'mg/kg', 'ml/kg',
]

# Longest first: otherwise "mg" would match before "mg/kg" is ever tried
# (likewise "ml" before "ml/hr" and "ml/kg").
_dose_unit_alternatives = [
    re.escape(unit)
    for unit in sorted(set(_builtin_dose_units), key=lambda unit: (-len(unit), unit))
]
# File keywords are tried after the built-in units; they are skipped when the
# keyword file is empty so the alternation never ends in an empty alternative,
# which would accept a bare number as a dose.
if dose_pattern_keywords:
    _dose_unit_alternatives.append(dose_pattern_keywords)

# The unit stays in a capturing group (group 1), as before. The closing
# (?!\w) replaces a trailing \b, which cannot succeed after "%" when a space
# or the end of the text follows.
dose_pattern = (
    r'\b(?:' + _dose_number + r'\s*(?:\(' + _dose_number + r'\)\s*)?'
    + r'(' + '|'.join(_dose_unit_alternatives) + r'))(?!\w)'
)



# Read the workbook that holds one extracted description per medication row
descriptions_df = pd.read_excel(io='structured_discharge_dosage_data.xlsx')

# Define the frequency pattern (unchanged)
frequency_pattern = (
    r'\b(bid|tid|qid|qd|q4|HS|QHS|qod|prn|q[1-9]\d?h|q[1-9]\d?d|q[1-9]\d?w|every\s*\d+\s*(?:hr|hrs|hour|hours|day|days|week|weeks)|'
    r'once\s*a\s*day|twice\s*a\s*day|three\s*times\s*a\s*day|four\s*times\s*a\s*day|'
    r'at\s*bedtime|in\s*the\s*morning|in\s*the\s*evening|'
    r'before\s*meals|after\s*meals|with\s*meals|before\s*bed|on\s*an\s*empty\s*stomach|'
    r'as\s*needed|on\s*dialysis|with\s*dialysis|dialysis\s*days|non-dialysis\s*days|'
    r'every\s*\d+\s*hours|every\s*\d+\s*days|every\s*\d+\s*weeks|every\s*\d+\s*months|'
    r'morning|noon|afternoon|evening|night|bedtime|midnight|early\s*morning|'
    r'2\s*hours\s*before|2\s*hours\s*after|weekly|bi-weekly|monthly|daily|'
    r'twice\s*a\s*week|every\s*other\s*day|every\s*third\s*day|every\s*fourth\s*day|every\s*fifth\s*day|'
    r'every\s*weekend|every\s*weekday|every\s*morning|every\s*evening|'
    r'bid|b\.i\.d|tid|t\.i\.d|qid|q\.i\.d|qd|q\.d|q4|q\.4|qod|q\.o\.d|prn|p\.r\.n|'
    r'morning|noon|afternoon|evening|night|bedtime|midnight|early\s*morning|'
    r'dialysis|weekly|bi-weekly|monthly|daily|'
    r'morning|evening|afternoon|night|bedtime|midnight|'
    r'monday|tuesday|wednesday|thursday|friday|saturday|sunday|'
    r'once|twice|thrice|four|five|six|seven|eight|nine|ten)\b'
)

# Function to extract dose, route, and frequency
def extract_dose_route_frequency(description):
    """Find the first dose, route and frequency mention in ``description``.

    Each of the module-level patterns ``dose_pattern``, ``route_pattern`` and
    ``frequency_pattern`` is searched case-insensitively. The result is a
    3-tuple ``(dose, route, frequency)`` holding the matched text, or ``None``
    in the position of any pattern that did not match.
    """
    found = []
    for pattern in (dose_pattern, route_pattern, frequency_pattern):
        hit = re.search(pattern, description, re.IGNORECASE)
        found.append(hit.group(0) if hit else None)
    return tuple(found)

# Apply the function to each row.
# All (dose, route, frequency) tuples are collected first and turned into a
# frame in one step, which avoids constructing a pandas Series for every row
# and also works when there are no rows. str() is kept so that non-string
# cells are passed to the regex search as text, as before.
_extracted_details = pd.DataFrame(
    [
        extract_dose_route_frequency(str(text))
        for text in descriptions_df['extracted_discharge_description']
    ],
    columns=['dose', 'route', 'frequency'],
    index=descriptions_df.index,
)
for _column in _extracted_details.columns:
    descriptions_df[_column] = _extracted_details[_column]

# Save the modified dataframe to a new Excel file
descriptions_df.to_excel('dosage_dischargemedication_instructions_with_details.xlsx', index=False)



Modified descriptions with dose, route, and frequency have been saved to 'dosage_instructions_with_details.xlsx'.
