In [4]:
import pandas as pd
# num2words installation: `pip install num2words`
from num2words import num2words  # Ensure installation: `pip install num2words`
import re  # For handling numeric values in dosage

In [2]:

# Define file paths
files = {
    "prescriptions.xlsx": "prescriptions_df",
    "patientinfo_stephany.xlsx": "patients_df",
    "bodymassindex.xlsx": "bmi_df",
    "IMDfile.xlsx": "imd_df"
}

# Load the DataFrames dynamically
dfs = {file: pd.read_excel(file) for file in files.keys()}  

# Assign explicit names for clarity
prescriptions_df = dfs["prescriptions.xlsx"]  
patients_df = dfs["patientinfo_stephany.xlsx"] 
bmi_df = dfs["bodymassindex.xlsx"]
imd_df = dfs["IMDfile.xlsx"]


# Clean and standardise patID
for df in [prescriptions_df, patients_df, imd_df]:
    df["patID"] = df["patID"].astype(str).str.strip().str.upper()

# Already loaded: bmi_df
bmi_df["patID"] = bmi_df["patID"].str.strip().str.upper()


# Average BMI per patient
bmi_agg_df = bmi_df.groupby("patID", as_index=False).agg({"bmi": "mean"})

# Merge
merged_df = prescriptions_df.merge(patients_df, on="patID", how="left")
merged_df = merged_df.merge(bmi_agg_df, on="patID", how="left")
merged_df = merged_df.merge(imd_df, on="patID", how="left")


# Save result
merged_df.to_excel("merged_data.xlsx", index=False)
print("Merged data saved to 'merged_data.xlsx'")



Merged data saved to 'merged_data.xlsx'


In [6]:
missing_patients = set(patients_df["patID"]) - set(prescriptions_df["patID"])
print(f"Patients missing from merge: {len(missing_patients)}")


Patients missing from merge: 0


In [8]:
# Clean and standardise IDs
prescriptions_df["patID"] = prescriptions_df["patID"].str.strip().str.upper()
bmi_df["patID"] = bmi_df["patID"].str.strip().str.upper()

# Which IDs match?
matching_ids = set(prescriptions_df["patID"]) & set(bmi_df["patID"])
nonmatching_ids = set(prescriptions_df["patID"]) - set(bmi_df["patID"])

print(f"Matching patIDs: {len(matching_ids)}")
print(f"Prescriptions with no matching BMI: {len(nonmatching_ids)}")

# Optionally preview unmatched
print("Example patIDs not in BMI data:", list(nonmatching_ids)[:5])







Matching patIDs: 664
Prescriptions with no matching BMI: 336
Example patIDs not in BMI data: ['HF20240001129467', 'HF20240001129463', 'HF20240001027575', 'HF20240001141640', 'HF2024000796916']


In [10]:
# Load the data
file_path = "merged_data.xlsx"  
df = pd.read_excel(file_path)

# Ensure column names are correct
quantity_col = "quantity"  # Update as needed
duration_col = "duration"  # Update as needed
dosage_col = "dosage"  # Update as needed
dosage_desc_col = "dosage description"  # Column for standardized descriptions

# Mapping abbreviations to meaningful text
dosage_mapping = {
    "DLY": "Once daily",
    "OD": "Once daily",
    "BD": "Twice daily",
    "TDS": "Three times daily",
    "QDS": "Four times daily",
    "AM": "In the morning",
    "MANE": "In the morning",
    "OM": "One in the morning",
    "ON": "At night",
    "NOCTE": "At night",
    "PRN": "As needed",
}

# Function to convert numeric dosage to words
def dosage_to_text(quantity, duration):
    try:
        if pd.isna(quantity) or pd.isna(duration) or duration == 0:
            return None  # Return None for missing or zero duration values
        
        dosage = float(quantity) / float(duration)  # Convert to float
        rounded_dosage = round(dosage, 1)  # Round to nearest 0.5 increment
        
        if rounded_dosage.is_integer():
            return f"{num2words(int(rounded_dosage))} to be taken daily"
        elif rounded_dosage == 0.5:
            return "Half to be taken daily"
        elif rounded_dosage % 1 == 0.5:
            return f"{num2words(int(rounded_dosage))} and a half to be taken daily"
        else:
            return f"{rounded_dosage} tablet(s) to be taken daily"
    
    except ValueError:
        return None  # Keeps original values unchanged

# Function to standardize abbreviations and process numeric doses
def standardize_dosage(dosage):
    if pd.isna(dosage):
        return None

    dosage = str(dosage).strip().upper()  # Convert to uppercase for consistency

    # Check for numeric doses combined with a time indicator (e.g., "2 AM")
    match = re.match(r"(\d+)\s*(AM|PM|OM|ON|MANE|NOCTE)", dosage)
    if match:
        num_part = int(match.group(1))
        time_part = dosage_mapping.get(match.group(2), match.group(2).lower())
        return f"{num2words(num_part)} to be taken {time_part}"

    return dosage_mapping.get(dosage, dosage)  # Replace abbreviations

# Fill missing dosages and standardize all
df[dosage_desc_col] = df.apply(
    lambda row: dosage_to_text(row[quantity_col], row[duration_col]) 
    if pd.isna(row[dosage_col]) else standardize_dosage(row[dosage_col]), axis=1
)

# Save the modified data
cleaned_file_path = "updated_patient_data.xlsx"
df.to_excel(cleaned_file_path, index=False)

print(f"Modified data saved")


Modified data saved


In [12]:
# Load the dataset
file_path = "updated_patient_data.xlsx"
df = pd.read_excel(file_path)

# Define relevant columns
drugname_col = "drugname"
dosage_desc_col = "dosage description"
strength_col = "Strength (mg)"
min_daily_dose_col = "Min Daily Dose (mg)"
max_daily_dose_col = "Max Daily Dose (mg)"

# Function to extract numeric strength (mg) from drug name
def extract_strength(drugname):
    match = re.search(r"(\d+)\s*mg", str(drugname))  # Extracts number before 'mg'
    return int(match.group(1)) if match else None  # Returns strength in mg

# Optimized function to interpret dosage description
def extract_doses(description):
    if pd.isna(description):
        return (None, None)

    description = description.strip().upper()

    # Expanded fixed mappings
    fixed_mapping = {
        # Standard daily doses
        "ONE TO BE TAKEN DAILY": (1, 1),
        "TWO TO BE TAKEN DAILY": (2, 2),
        "THREE TO BE TAKEN DAILY": (3, 3),
        "FOUR TO BE TAKEN DAILY": (4, 4),
        "FIVE TO BE TAKEN DAILY": (5, 5),
        "SIX TO BE TAKEN DAILY": (6, 6),
        "SEVEN TO BE TAKEN DAILY": (7, 7),
        "EIGHT TO BE TAKEN DAILY": (8, 8),
        "NINE TO BE TAKEN DAILY": (9, 9),
        "TEN TO BE TAKEN DAILY": (10, 10),
        "SIXTEEN TO BE TAKEN DAILY": (16, 16),
        "TWENTY-EIGHT TO BE TAKEN DAILY": (28, 28),
        "FIFTY-SIX TO BE TAKEN DAILY": (56, 56),
        "ZERO TO BE TAKEN DAILY": (0, 0),
        "1 TABLET ONCE A DAY": (1, 1),

        # Variations of daily doses
        "ONE DAILY": (1, 1),
        "EVERY DAY": (1, 1),
        "ONE A DAY": (1, 1),
        "ONCE DAILY": (1, 1),
        "TAKE ONE DAILY": (1, 1),
        "ONE TABLET EVERY DAY": (1, 1),
        "ONE DAILY AT 8AM": (1, 1),
        "TAKE ONE EVERY DAY": (1, 1),
        "ONE ONCE DAILY": (1, 1),
        "ONE TO BE TAKEN ONCE DAILY": (1, 1),
        "TWO TABLETS DAILY": (2, 2),
        "2 TABLET ONCE A DAY": (2, 2),
        "TAKE ONE ONCE DAILY": (1, 1),
        "TWO DAILY": (2, 2),
        "1 A DAY": (1, 1),
        

        # Specific times of day
        "ONE TO BE TAKEN IN THE MORNING": (1, 1),
        "TAKE ONE EACH MORNING": (1, 1),
        "ONE DAILY IN THE MORNING": (1, 1),
        "TAKE TWO EACH MORNING": (2, 2),
        "TWO TO BE TAKEN EACH MORNING": (2, 2),
        "TWO TABLETS IN THE MORNING": (2, 2),
        "ONE TO BE TAKEN EACH DAY": (1, 1),
        "ONE TO BE TAKEN EACH MORNING": (1, 1),
        "ONE IN THE MORNING": (1, 1),
        "IN THE MORNING": (1, 1),
        "1 IN MORNING": (1, 1),
        "ONE IN A MORNING": (1, 1),
        "ONE MANE": (1, 1),  # Common abbreviation for "Morning"
        "TAKE ONE IN THE MORNING": (1, 1),
        "ONE TABLET IN THE MORNING": (1, 1),
        "TAKE ONE TABLET IN THE MORNING": (1, 1),
        "TAKE ONE IN THE MORNING AND ONE AT LUNCH": (2, 2),
        "ONE IN THE MORNING AND ONE AT MIDDAY": (2, 2),
        "TAKE ONE IN THE MORNING AND ONE AT 2 PM": (2, 2),
        "ONE AT 6 PM": (1, 1),
        "1 TABLET AM": (1, 1),
        "1 NOON": (1, 1),
        "1 LUNCH TIME": (1, 1),
        "1 LUNCHTIME": (1, 1),
        "ONE AT LUNCHTIME": (1, 1),
        "TWO TO BE TAKEN IN THE MORNING": (2, 2),
        "1 IN AFTERNOON": (1, 1),
        "TAKE TWO IN THE MORNING": (2, 2),
        "two to be taken In the morning": (2, 2),
        "TAKE ONE PM": (1, 1),
        "ONE EACH MORNING": (1, 1),
        "THREE TO BE TAKEN IN THE MORNING": (3,3),
        "2 EVERY MORNING": (2, 2),
        "1 EVERY MORNING": (1, 1),
        "ONE TABLET ONCE IN THE MORNING": (1, 1),
        "1 MORNING": (1, 1),
        "TAKE ONE TABLET EACH MORNING": (1, 1),
        "1 TABLET ONCE IN THE MORNING": (1, 1),
        "one to be taken One in the morning": (1, 1), #once in the morning  
        "two to be taken One in the morning": (2, 2), #once in the morning
        

        # Twice daily doses
        "ONE TO BE TAKEN TWICE A DAY": (2, 2),
        "TWO TO BE TAKEN TWICE A DAY": (2*2, 2*2),
        "THREE TO BE TAKEN TWICE DAILY": (3*2, 3*2),
        "USE TWICE DAILY": (2, 2),
        "TWICE DAILY": (2, 2),
        "TAKE TWO DAILY": (2, 2),
        "TAKE TWO ONCE DAILY": (2, 2),
        "ONE TWICE A DAY": (2, 2),
        "ONE TABLET TWICE A DAY": (2, 2),
        "TAKE TWO TABLETS ONCE DAILY": (2, 2),
        "TAKE TWO TABLETS IN THE MORNING": (2, 2),
        "2 TABLETS IN THE MORNING": (2, 2),
        "ONE IN THE MORNING AND ONE AT LUNCHTIME": (2, 2),
        "TAKE ONE TWICE DAILY": (2, 2),
        "TWICE A DAY": (2, 2),
        

        # Special scheduling
        "ONE TO BE TAKEN ON ALTERNATE DAYS": (0.5, 0.5),
        
        # Fractional and range doses
        "ONE OR TWO TO BE TAKEN DAILY": (1, 2),
        "HALF TO BE TAKEN DAILY": (0.5, 0.5),
        "ONE AND A HALF TO BE TAKEN DAILY": (1.5, 1.5),
        "TWO AND A HALF TO BE TAKEN DAILY": (2.5, 2.5),
        "HALF A TABLET EVERY DAY": (0.5, 0.5),
        "HALF A TABLET EVERY MORNING": (0.5, 0.5),
        "HALF TABLET ONCE DAILY": (0.5, 0.5),
    
        # Abbreviations
        "1 TAB": (1, 1),
        "1 OD": (1, 1),
        "1 BD": (2, 2),
        "ONE BD": (2, 2),
        "2 BD": (2, 2),
        "2 A DAY": (2, 2),
        "3 OD": (3, 3),
        "1 TDS PRN": (3, 3),
        "2 DIE": (2, 2),  # Likely abbreviation for BID (twice daily)
        "1-OD": (1, 1),
        "1-BD": (2, 2),
        "1 MONE": (1, 1), # Likely abbreviation for MANE (in the morning)
        "1M": (1, 1),

        # As directed cases (fallback to None)
        "USE AS DIRECTED": (None, None),
        "AS DIRECTED": (None, None),
        "DAILY AS NEEDED": (1, 1),  # Assuming standard daily dose unless PRN logic applies
        "AS DIRECTED BY HOSP": (None, None),
        "TAKE ONE AS DIRECTED": (None, None),
        "TAKE ONE EACH MORNING AS REQUIRED": (1, 1),
        "ONE TO BE TAKEN IN THE MORNING PRN": (1, 1),  # Assuming daily unless PRN logic applies
        "TAKE ONE EACH MORNING WHEN REQUIRED": (1, 1),
        "TAKE ONE AS DIRECTED": (1, 1),
        "1 TABLET AS DIRECTED": (1, 1),
    
        # Additional volume-based doses
        "5ML BD": (5*2, 5*2),  # Twice daily multiplier
        "10 MLS EVERY MORNING": (10, 10),
    }




    if description in fixed_mapping:
        return fixed_mapping[description]

    # Regex for structured phrases
    match = re.search(r"(\d+(\.\d+)?)\s?(TABLET\(S\)|TABLET|TAB|TABS)?\s?(TO BE TAKEN|TAKE)\s?(DAILY|TWICE DAILY|ONCE DAILY|IN THE MORNING|EACH MORNING)", description)
    if match:
        dose = float(match.group(1))
        frequency = match.group(6) if match.lastindex and match.lastindex >= 6 else None  # Ensure group exists
        multiplier = 1 if frequency and ("ONCE" in frequency or "DAILY" in frequency) else 2
        return (dose * multiplier, dose * multiplier)


    # Handle fractional doses (e.g., "ONE AND A HALF TO BE TAKEN DAILY")
    match = re.search(r"(\d+(\.\d+)?)(\s?AND\s?\d+/\d+)?\s?(TABLET\(S\)|TABLET|TAB|TABS)?\s?(DAILY|IN THE MORNING|EACH DAY|EVERY DAY)", description)
    if match:
        base_dose = float(match.group(1))
        fraction = match.group(3)
        if fraction:
            fraction_value = sum(map(float, fraction.replace("AND", "").split("/")))
            base_dose += fraction_value
        return (base_dose, base_dose)

    # Catch cases like "TAKE X IN THE MORNING"
    match = re.search(r"TAKE (\d+(\.\d+)?)", description)
    if match:
        dose = float(match.group(1))
        return (dose, dose)

    # Catch large doses ("TWENTY-EIGHT TO BE TAKEN DAILY")
    match = re.search(r"(\d+)\s?TO BE TAKEN DAILY", description)
    if match:
        dose = float(match.group(1))
        return (dose, dose)

    match = re.search(r"(\d+(\.\d+)?|\bHALF\b)\s?(TABLET\(S\)|TAB|TABS|ML)?\s?(DAILY|TWICE DAILY|IN THE MORNING|EACH MORNING|ONCE A DAY)", description)
    if match:
        dose = 0.5 if "HALF" in match.group(1) else float(match.group(1))
        return (dose, dose)

   
    return (None, None)

# Apply extraction functions
df[strength_col] = df[drugname_col].apply(extract_strength)
df[["Min Doses Per Day", "Max Doses Per Day"]] = df[dosage_desc_col].apply(extract_doses).apply(pd.Series)

# Calculate total daily dose range
df[min_daily_dose_col] = df[strength_col] * df["Min Doses Per Day"]
df[max_daily_dose_col] = df[strength_col] * df["Max Doses Per Day"]

print(df["sex_x"].value_counts(dropna=False))


# Save the updated dataset
df.to_excel("updated_patient_data.xlsx", index=False)


print(f"Updated data saved")


sex_x
Female    26817
Male      18739
Name: count, dtype: int64
Updated data saved


In [13]:
# Identify new unmatched descriptions for review
unmatched = df[df["Min Doses Per Day"].isna()][dosage_desc_col].dropna().unique()
print("Unmatched dosage descriptions:")
print(unmatched)

Unmatched dosage descriptions:
['USE AS DIRECTED' 'AS DIRECTED' 'one to be taken One in the morning'
 'two to be taken One in the morning' 'AS DIRECTED BY HOSP']
