In [10]:
import os, re, gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf

# Configuration
DATA_PATH = "indian_pharmaceutical_products_clean.csv" 
RND = 42
np.random.seed(RND)
tf.random.set_seed(RND)

# ---------------------------------------------------------
# 1. LOAD & PREPROCESS (Standard)
# ---------------------------------------------------------
if not os.path.exists(DATA_PATH):
    print(f"Error: File {DATA_PATH} not found.")
else:
    print("Loading data...")
    df = pd.read_csv(DATA_PATH)
    
    # Cleaning
    def normalize_text(s):
        s = re.sub(r'[\(\)\[\]\{\},;:/\\\|"]', ' ', str(s).lower())
        return re.sub(r'\s+', ' ', s).strip()

    def parse_strength(s):
        m = re.search(r'(\d+(?:\.\d+)?)\s*(mcg|mg|g|µg|iu)?', str(s).lower())
        if not m: return 0.0
        v = float(m.group(1)); unit = (m.group(2) or '').replace('µg','mcg')
        return v/1000.0 if unit == 'mcg' else (v*1000.0 if unit == 'g' else v)

    def parse_pack(size, unit):
        try:
            if not pd.isna(size): return int(size)
        except: pass
        m = re.search(r'(\d+)', str(unit).lower())
        return int(m.group(1)) if m else 1

    df['brand_clean'] = df['brand_name'].apply(normalize_text)
    df['strength_mg'] = df['primary_strength'].apply(parse_strength)
    df['pack_num'] = df.apply(lambda r: parse_pack(r.get('pack_size', pd.NA), r.get('pack_unit','')), axis=1)
    df['composition_text'] = (df['primary_ingredient'].fillna('') + ' ' + df['active_ingredients'].fillna('')).apply(normalize_text)
    df['text_for_emb'] = (df['brand_clean'] + ' ' + df['composition_text'] + ' ' + df['dosage_form'].fillna('').astype(str).str.lower())

    df = df[df['price_inr'].notna()].reset_index(drop=True)
    df['price_tier'] = pd.qcut(df['price_inr'], q=3, labels=[0,1,2]).astype(int)
    
    # ---------------------------------------------------------
    # 2. FEATURE ENGINEERING
    # ---------------------------------------------------------
    MAX_VOCAB = 10000; SEQ_LEN = 30; EMBED_DIM = 64
    vectorizer = TextVectorization(max_tokens=MAX_VOCAB, output_sequence_length=SEQ_LEN)
    vectorizer.adapt(df['text_for_emb'].astype(str).values)
    
    inp = layers.Input(shape=(1,), dtype=tf.string)
    x = vectorizer(inp)
    x = layers.Embedding(input_dim=len(vectorizer.get_vocabulary()), output_dim=EMBED_DIM, mask_zero=True)(x)
    x = layers.GlobalAveragePooling1D()(x) 
    enc = models.Model(inp, x)
    emb_vectors = enc.predict(df['text_for_emb'].astype(str).values, batch_size=1024, verbose=0)
    
    df['embedding'] = list(emb_vectors)
    EMB_DIM = emb_vectors.shape[1]

    le_man = LabelEncoder()
    df['manufacturer'] = df['manufacturer'].fillna('unknown').astype(str)
    manu_counts = df['manufacturer'].value_counts()
    df['manu_group'] = df['manufacturer'].apply(lambda x: x if manu_counts[x] > 10 else 'other')
    df['manu_id'] = le_man.fit_transform(df['manu_group'])
    
    le_dos = LabelEncoder()
    df['dosage_id'] = le_dos.fit_transform(df['dosage_form'].fillna('unknown').astype(str))
    
    features_numeric = ['pack_num', 'strength_mg']
    num_scaler = StandardScaler()
    df_scaled = df.copy()
    df_scaled[features_numeric] = num_scaler.fit_transform(df_scaled[features_numeric])

    # ---------------------------------------------------------
    # 3. SPLIT & PREPARE
    # ---------------------------------------------------------
    train_df, temp_df = train_test_split(df_scaled, test_size=0.25, random_state=RND, stratify=df_scaled['price_tier'])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=RND, stratify=temp_df['price_tier'])

    def make_inputs(d):
        return {
            'emb_input': np.vstack(d['embedding'].values).astype('float32'),
            'manu_input': d['manu_id'].astype('int32').values,
            'dosage_input': d['dosage_id'].astype('int32').values,
            'num_input': d[features_numeric].astype('float32').values
        }

    X_train, X_val, X_test = make_inputs(train_df), make_inputs(val_df), make_inputs(test_df)

    y_train_list = [train_df['price_tier'].values, np.log1p(train_df['price_inr'].values)]
    y_val_list = [val_df['price_tier'].values, np.log1p(val_df['price_inr'].values)]
    y_test_list = [test_df['price_tier'].values, np.log1p(test_df['price_inr'].values)]

    # Weights
    classes = np.unique(y_train_list[0])
    cw = class_weight.compute_class_weight('balanced', classes=classes, y=y_train_list[0])
    class_weight_dict = dict(enumerate(cw))
    
    sw_tier = np.array([class_weight_dict[y] for y in y_train_list[0]])
    sw_price = np.ones(len(y_train_list[1]))
    sample_weight_list = [sw_tier, sw_price]

    # ---------------------------------------------------------
    # 4. ARCHITECTURE: "Classification First" (Restored Original Depth)
    # ---------------------------------------------------------
    MANU_VOCAB = df_scaled['manu_id'].nunique() + 1
    DOSAGE_VOCAB = df_scaled['dosage_id'].nunique() + 1

    # Inputs
    emb_in = layers.Input(shape=(EMB_DIM,), name='emb_input')
    manu_in = layers.Input(shape=(), dtype='int32', name='manu_input')
    dosage_in = layers.Input(shape=(), dtype='int32', name='dosage_input')
    num_in = layers.Input(shape=(len(features_numeric),), dtype='float32', name='num_input')

    # Embeddings
    manu_emb = layers.Flatten()(layers.Embedding(MANU_VOCAB, 32)(manu_in))
    dos_emb = layers.Flatten()(layers.Embedding(DOSAGE_VOCAB, 8)(dosage_in))

    # --- THE ORIGINAL DEEP SHARED STRUCTURE (This got you 94%) ---
    # We bring back the full depth to ensure classification features are learned deeply
    x = layers.Concatenate()([emb_in, manu_emb, dos_emb, num_in])
    
    x = layers.Dense(256, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    x = layers.Dense(128, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    x = layers.Dense(64, activation='relu')(x)
    shared_features = layers.Dropout(0.2)(x)

    # --- HEAD A: Tier (Direct connection) ---
    tier_out = layers.Dense(3, activation='softmax', name='tier_output')(shared_features)

    # --- HEAD B: Price (Side Branch) ---
    # Small branch off the side. 
    price_branch = layers.Dense(32, activation='relu')(shared_features)
    price_out = layers.Dense(1, activation='linear', name='price_output')(price_branch)

    model = models.Model(inputs=[emb_in, manu_in, dosage_in, num_in], outputs=[tier_out, price_out])

    # --- CRITICAL PART: LOSS WEIGHTING ---
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        loss={'tier_output': 'sparse_categorical_crossentropy', 'price_output': 'mse'},
        
        # STRATEGY: Focus 95% on Class, 5% on Price
        # This prevents price gradients from ruining classification
        loss_weights={'tier_output': 1.0, 'price_output': 0.05}, 
        
        metrics={'tier_output': 'accuracy', 'price_output': 'mae'}
    )

    # ---------------------------------------------------------
    # 5. TRAINING
    # ---------------------------------------------------------
    class Monitor(callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            if epoch % 5 == 0: gc.collect()
            
    es = callbacks.EarlyStopping(monitor='val_tier_output_accuracy', mode='max', patience=10, restore_best_weights=True)
    rlr = callbacks.ReduceLROnPlateau(monitor='val_tier_output_accuracy', mode='max', patience=4, factor=0.5, verbose=1)

    print("Starting training (Classification First)...")
    history = model.fit(
        x=X_train, y=y_train_list,
        sample_weight=sample_weight_list,
        validation_data=(X_val, y_val_list),
        epochs=50,
        batch_size=64, 
        callbacks=[es, rlr, Monitor()],
        verbose=1
    )

    # ---------------------------------------------------------
    # 6. RESULTS
    # ---------------------------------------------------------
    print("\n--- FINAL EVALUATION ---")
    preds = model.predict(X_test, batch_size=1024)
    
    # Tier
    acc = accuracy_score(y_test_list[0], np.argmax(preds[0], axis=1))
    print(f"Tier Accuracy: {acc:.4f}")
    print(classification_report(y_test_list[0], np.argmax(preds[0], axis=1)))
    
    # Price
    pred_price = np.expm1(preds[1].flatten())
    true_price = np.expm1(y_test_list[1])
    mae = mean_absolute_error(true_price, pred_price)
    print(f"Price MAE: ₹{mae:.2f}")
    
    model.save('final_multitask_model.h5')

Loading data...
Starting training (Classification First)...
Epoch 1/50
[1m2977/2977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 7ms/step - loss: 0.9473 - price_output_loss: 1.0564 - price_output_mae: 0.7236 - tier_output_accuracy: 0.5750 - tier_output_loss: 0.8945 - val_loss: 0.7566 - val_price_output_loss: 0.6348 - val_price_output_mae: 0.5402 - val_tier_output_accuracy: 0.6757 - val_tier_output_loss: 0.7255 - learning_rate: 0.0010
Epoch 2/50
[1m2977/2977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - loss: 0.7370 - price_output_loss: 0.5904 - price_output_mae: 0.5134 - tier_output_accuracy: 0.6909 - tier_output_loss: 0.7075 - val_loss: 0.6789 - val_price_output_loss: 0.5389 - val_price_output_mae: 0.4772 - val_tier_output_accuracy: 0.7169 - val_tier_output_loss: 0.6532 - learning_rate: 0.0010
Epoch 3/50
[1m2977/2977[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - loss: 0.6716 - price_output_loss: 0.5300 - price_output_mae: 0.4774 - t



Tier Accuracy: 0.7722
              precision    recall  f1-score   support

           0       0.79      0.80      0.80     10666
           1       0.69      0.70      0.69     10651
           2       0.84      0.82      0.83     10430

    accuracy                           0.77     31747
   macro avg       0.77      0.77      0.77     31747
weighted avg       0.77      0.77      0.77     31747

Price MAE: ₹178.44
