In [1]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sentence_transformers import SentenceTransformer

### Filling packing_form column 

In [2]:
def determine_packaging(salt):
    """Determine packaging form based on dosage information in salts"""
    if pd.isna(salt):
        return np.nan
    
    patterns = [
        (r'\b(mg|g|mcg)\b(?!.*/|\d+ml)', ['STRIP', 'Strip | Tablet', 'Strip TABLET', 'Strip | Capsule']),
        (r'\b(mg|g|mcg)/tablet\b', ['Strip | Tablet', 'STRIP']),
        (r'\b(mg|g|mcg)/ml\b', ['BOTTLE', 'Bottle | Suspension', 'Bottle SYRUP', 'Bottle | Syrup']),
        (r'\b%\s*(w/v|w/w)?\b', ['TUBE', 'BOTTLE', 'Bottle | Lotion']),
        (r'\b(iu|units?)/ml?\b', ['VIAL', 'AMPOULE', 'Vial | Injection']),
        (r'\b(ml|litre)\b', ['BOTTLE', 'Bottle | Solution', 'Bottle | Liquid']),
        (r'\b(injection|inj)\b', ['AMPOULE', 'Vial | Injection', 'INJECTION']),
        (r'\b(cream|ointment)\b', ['TUBE', 'Tube | Cream']),
        (r'\b(drops|solution)\b', ['Bottle | Drops', 'Bottle | Solution']),
        (r'\b(gel|shampoo)\b', ['Tube | Gel', 'Bottle | Shampoo']),
        (r'\b(powder|sachet)\b', ['SACHET', 'Packet | Suspension']),
    ]
    

    for pattern, packaging_options in patterns:
        if re.search(pattern, str(salt).lower()):

            for option in packaging_options:
                if option in valid_packaging:
                    return option
                

    return np.random.choice(['STRIP', 'BOTTLE', 'TUBE'], p=[0.5, 0.3, 0.2])


valid_packaging = [
    'STRIP', 'Strip | Tablet', 'Bottle | Suspension', 'BOTTLE',
    'Bottle SYRUP', 'Bottle | Syrup', 'Strip TABLET', 'AMPOULE',
    'Bottle | Infusion', 'Bottle | Oral Drops', 'Bottle | Spray',
    'INJECTION', 'TUBE', 'Vial | Injection', 'VIAL', 'Tube | Cream',
    'Bottle | Nasal Spray', 'Bottle | Nasal Drops', 'Tube | Gel',
    'Box | Injection', 'Tube | Mouth Gel', 'Tube GEL',
    'Bottle | Eye Drops', 'Vial INJECTION', 'Strip',
    'Bottle | Oral Suspension', 'Strip | Capsule', 'Tube',
    'Ampoule | Injection', 'Tube | Ointment', 'Packet | Injection',
    'Bottle', 'Bottle | Drops', 'Bottle | Solution', 'DRY VIAL',
    'Bottle | Oral Solution', 'Bottle | Eye/Ear Drops',
    'Bottle SUSPENSION', 'SACHET', 'Packet | Suspension', 'DEVICE',
    'BOX', 'Pack | Soap', 'Bottle | Dusting Powder', 'DROPS',
    'Tube | Shampoo', 'Bottle | Shampoo', 'CONTAINER',
    'Bottle | Lotion', 'PACK', 'PACKET', 'Bottle | Liquid',
    'Bottle | Tablet', 'Vial | Infusion', 'Bottle | Injection'
]

def fill_packaging_forms(df):
    """Main function to fill missing packaging forms"""

    df['salts'] = df['salts'].str.lower().fillna('')
    

    packaging_dist = df['packaging_form'].value_counts(normalize=True)
    

    mask = df['packaging_form'].isna()
    df.loc[mask, 'packaging_form'] = df.loc[mask, 'salts'].apply(
        lambda x: determine_packaging(x) if pd.notna(x) else np.random.choice(
            packaging_dist.index.tolist(), 
            p=packaging_dist.values.tolist()
        )
    )
    
    return df

### standardizing salt column 

In [3]:
def clean_salts_step1(salt):
    """Initial cleaning of salt strings."""
    if pd.isna(salt):
        return ""
    return salt.lower().strip()

def clean_salts_step2(salt):
    """Remove special characters, fix spacing, and standardize separators."""
    salt = re.sub(r'\b(na|n/a)\b', '', salt, flags=re.IGNORECASE) 
    salt = re.sub(r'[^a-z0-9.%/()+-,]', ' ', salt) 
    salt = re.sub(r'\s+', ' ', salt).strip() 
    salt = salt.replace('-', ' ')  
    salt = salt.replace('+', ',') 
    salt = re.sub(r'(\d+)\s*/\s*(\d+)', r'\1/\2', salt)  
    return salt


SALT_NAME_MAPPING = {
    'benzhexol hydrochloride': 'trihexyphenidyl',
    'trihexyphenidyl': 'trihexyphenidyl',
    'chlorpheniramine': 'chlorphenamine',
    'chlorphenamine maleate': 'chlorphenamine',
    'mecobalamin': 'vitamin b12',
    'cyanocobalamin': 'vitamin b12',
    'methylcobalamin': 'vitamin b12',
    'soluble insulin': 'human insulin',
    'linseed oil': 'linseed',
    'lignocaine': 'lidocaine',
    'hyoscine butylbromide': 'scopolamine',
    'vitamin d3': 'cholecalciferol',
    'benzhexol': 'trihexyphenidyl',
    'drotaverine': 'drotavarine',
    'levocloperastine fendizoate': 'levocloperastine',
    'serrati': 'serratiopeptidase',
    'serrat': 'serratiopeptidase',
    'diclofenac na': 'diclofenac',
    'na': '',
    'n/a': '',
    'acetaminophen': 'paracetamol',
    'rabeprazol': 'rabeprazole',
    'thiocolchic': 'thiocolchicoside',
    'mefenam': 'mefenamic acid',
    'piroxic': 'piroxicam',
    'chlorzox': 'chlorzoxazone'
}


def convert_to_mg(value, unit):
    """Convert grams to milligrams."""
    if 'g' == unit.lower() and not unit.startswith('m'):
        try:
            value_float = float(value)
            value = str(int(value_float * 1000))
            unit = 'mg'
        except ValueError:
            pass
    return value, unit


def process_salt_component(component):
    """Process a single salt component to standardize name and dosage."""
    component = component.strip().lower()
    dosage = None
    original_name = component


    paren_match = re.search(r'\((.*?)\)', component)
    if paren_match:
        dosage_str = paren_match.group(1)
        component = component.split('(')[0].strip()

        dosage_match = re.match(r'(\d+\.?\d*)\s*([a-z/%/0-9]+)', dosage_str)
        if dosage_match:
            value = dosage_match.group(1)
            unit = dosage_match.group(2).replace(' ', '')

            if '.' in value:
                value = value.rstrip('0').rstrip('.')

            value, unit = convert_to_mg(value, unit)
            dosage = f"{value}{unit}"
    else:

        dosage_match = re.search(r'(\d+\.?\d*)\s*([a-z/%/0-9]+)\s*$', component)
        if dosage_match:
            value = dosage_match.group(1)
            unit = dosage_match.group(2).replace(' ', '')
            if '.' in value:
                value = value.rstrip('0').rstrip('.')

            value, unit = convert_to_mg(value, unit)
            dosage = f"{value}{unit}"
            component = component[:dosage_match.start()].strip()


    alternatives = [a.strip() for a in component.split('/')]
    primary_name = None


    for alt in alternatives:
        if alt in SALT_NAME_MAPPING:
            primary_name = SALT_NAME_MAPPING[alt]
            break


    if not primary_name:
        primary_name = alternatives[0]

        primary_name = SALT_NAME_MAPPING.get(primary_name, primary_name)

    if dosage:
        dosage = re.sub(r'%w/w?/?v?', '%', dosage)  # Standardize percentages
        dosage = re.sub(r'gms?', 'mg', dosage)  # Standardize grams to mg

    # Remove redundant strength info
    primary_name = re.sub(r'\s+(\d+\.?\d*)\s*[a-z/%]+$', '', primary_name).strip()

    # Combine name and dosage
    if dosage and dosage != '0':
        return f"{primary_name} {dosage}"
    return primary_name

# Function to process the entire salts string
def process_salts(salt_str):
    """Process the entire salts string by splitting and processing components."""
    if pd.isna(salt_str) or not str(salt_str).strip():
        return salt_str

    components = [c.strip() for c in str(salt_str).split(',')]
    processed = []

    for comp in components:
        processed_comp = process_salt_component(comp)
        if processed_comp:
            processed.append(processed_comp)

    return ', '.join(processed)



In [4]:
class FormulationClusterer:
    def __init__(self):
        # Use a biomedical sentence transformer model
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        
    def preprocess(self, texts):
        """Standardize formulation strings"""
        processed = []
        for text in texts:
            text = str(text).lower().replace('%', ' percent').replace('/', ' per ')
            text = re.sub(r'[^a-z0-9\s]', ' ', text)
            text = re.sub(r'\s+', ' ', text).strip()
            processed.append(text)
        return processed
    
    def get_embeddings(self, texts):
        """Convert text to embeddings"""
        return self.model.encode(texts, convert_to_tensor=True).cpu().numpy()
    
    def cluster_formulations(self, formulations, threshold=0.3):
        """Perform hierarchical clustering"""
        cleaned = self.preprocess(formulations)
        embeddings = self.get_embeddings(cleaned)
        
        dist_matrix = 1 - cosine_similarity(embeddings)
        
        linkage_matrix = linkage(dist_matrix, 'average')
        
        clusters = fcluster(linkage_matrix, threshold, criterion='distance')
        
        return clusters, linkage_matrix

    def print_clusters(self, formulations, clusters):
        """Display clustered formulations"""
        cluster_dict = defaultdict(list)
        for formulation, cluster_id in zip(formulations, clusters):
            cluster_dict[cluster_id].append(formulation)
            
        for cluster_id, members in cluster_dict.items():
            print(f"\nCluster {cluster_id}:")
            print("-" * 40)
            print("\n".join(members))



In [8]:
df = pd.read_csv('medlr_assignment_dataset.csv')

df['salts'] = df['salts'].apply(clean_salts_step1)
df['salts'] = df['salts'].apply(clean_salts_step2)
df['salts'] = df['salts'].str.lower().apply(process_salts)

df = fill_packaging_forms(df)

df["salts"] = df["salts"].fillna("Unknown")
df = df[df["salts"] != "Unknown"]

sample_salts = df["salts"].unique().tolist()

clusterer = FormulationClusterer()
clusters, linkage_matrix = clusterer.cluster_formulations(sample_salts)

# Assign clusters to DataFrame
cluster_mapping = {salt: cluster for salt, cluster in zip(sample_salts, clusters)}
df["cluster"] = df["salts"].map(cluster_mapping)


# Print clustered results
clusterer.print_clusters(sample_salts, clusters)


Cluster 100:
----------------------------------------
paracetamol 1000mg

Cluster 102:
----------------------------------------
paracetamol 120mg
paracetamol 150mg
paracetamol 125mg

Cluster 91:
----------------------------------------
paracetamol 120mg/5ml
paracetamol 250mg/5ml
paracetamol 125mg/5ml

Cluster 132:
----------------------------------------
paracetamol 5ml

Cluster 92:
----------------------------------------
paracetamol 156.25mg/5ml

Cluster 104:
----------------------------------------
paracetamol 156.25mg

Cluster 103:
----------------------------------------
paracetamol 250mg

Cluster 98:
----------------------------------------
paracetamol 500mg

Cluster 99:
----------------------------------------
paracetamol 650mg

Cluster 101:
----------------------------------------
paracetamol 100mg

Cluster 151:
----------------------------------------
phenylephrine 10mg, chlorphenamine 2mg, paracetamol 325mg

Cluster 148:
----------------------------------------
chlorphenamin

In [10]:
df.to_csv('medlr_assignment_dataset_processed.csv', index=False)