 # DL

In [1]:
# ==============================================================
# PROBLEM 1 — THERAPEUTIC CLASS PREDICTION
# Deep Learning Model: TextVectorization + Embedding + BiLSTM
# Dataset: /mnt/data/indian_pharmaceutical_products_clean.csv
# ==============================================================

import pandas as pd
import numpy as np
import re
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

# --------------------------------------------------------------
# 1. LOAD DATA
# --------------------------------------------------------------
DATA_PATH = "indian_pharmaceutical_products_clean.csv"
df = pd.read_csv(DATA_PATH)

print("Total rows:", len(df))

# Columns needed for problem 1
use_cols = ["brand_name", "active_ingredients", "primary_strength",
            "dosage_form", "therapeutic_class"]
df = df[use_cols].dropna().reset_index(drop=True)

print("After NA removal:", len(df))


# --------------------------------------------------------------
# 2. PREPROCESSING
# --------------------------------------------------------------
def safe_json_parse(x):
    """Safely parse JSON-like ingredient list."""
    try:
        return json.loads(str(x).replace("'", '"'))
    except:
        return x

def extract_ingredient_text(active_ing):
    parsed = safe_json_parse(active_ing)
    if isinstance(parsed, list):
        toks = []
        for item in parsed:
            if isinstance(item, dict):
                name = item.get("name", "")
                strength = item.get("strength", "")
                toks.append(f"{name} {strength}".strip())
        return " ".join(toks)
    return str(parsed)

def clean_text(t):
    t = str(t).lower()
    t = re.sub(r'[^a-z0-9\s/.-]', ' ', t)
    t = re.sub(r'\s+', ' ', t).strip()
    return t

df["brand_text"] = df["brand_name"].astype(str).apply(clean_text)
df["ing_text"] = df["active_ingredients"].apply(extract_ingredient_text).apply(clean_text)
df["strength_text"] = df["primary_strength"].astype(str).apply(clean_text)
df["dosage_text"] = df["dosage_form"].astype(str).apply(clean_text)

# Final text input for classification
df["text_input"] = (
    df["brand_text"] + " | " +
    df["ing_text"] + " | " +
    df["strength_text"] + " | " +
    df["dosage_text"]
)

print("Example text_input:")
print(df["text_input"].head())


# --------------------------------------------------------------
# 3. ENCODE TARGET LABELS
# --------------------------------------------------------------
le = LabelEncoder()
df["label"] = le.fit_transform(df["therapeutic_class"])
num_classes = len(le.classes_)

print("Number of classes:", num_classes)


# --------------------------------------------------------------
# 4. TRAIN / TEST SPLIT
# --------------------------------------------------------------
X = df["text_input"].astype(str).values
y = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42, stratify=y_train
)

print(len(X_train), len(X_val), len(X_test))


# --------------------------------------------------------------
# 5. TEXT VECTORIZATION
# --------------------------------------------------------------
MAX_TOKENS = 20000
MAX_LEN = 60

vectorizer = layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=MAX_LEN,
    standardize=None
)

vectorizer.adapt(X_train)


# --------------------------------------------------------------
# 6. BUILD BiLSTM CLASSIFIER
# --------------------------------------------------------------
EMBED_DIM = 128
LSTM_UNITS = 128

inp = layers.Input(shape=(1,), dtype=tf.string)

x = vectorizer(inp)
x = layers.Embedding(MAX_TOKENS, EMBED_DIM, mask_zero=True)(x)
x = layers.Bidirectional(layers.LSTM(LSTM_UNITS))(x)

x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.3)(x)
out = layers.Dense(num_classes, activation="softmax")(x)

model = models.Model(inputs=inp, outputs=out)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


# --------------------------------------------------------------
# 7. TRAIN MODEL
# --------------------------------------------------------------
es = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=128,
    callbacks=[es]
)


# --------------------------------------------------------------
# 8. EVALUATE
# --------------------------------------------------------------
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

print("\nCLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


# --------------------------------------------------------------
# 9. SAVE MODEL (optional)
# --------------------------------------------------------------
model.save("therapeutic_class_bilstm.keras")

print("DONE — Therapeutic Class Prediction Model Built Successfully.")


Total rows: 253973
After NA removal: 228775
Example text_input:
0    augmentin 625 duo tablet | amoxycillin 500mg c...
1    azithral 500 tablet | azithromycin 500mg | 500...
2    ascoril ls syrup | ambroxol 30mg/5ml levosalbu...
3    allegra 120mg tablet | fexofenadine 120mg | 12...
4    avil 25 tablet | pheniramine 25mg | 25mg | tablet
Name: text_input, dtype: object
Number of classes: 11
155567 27453 45755


Epoch 1/6
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 324ms/step - accuracy: 0.9732 - loss: 0.0924 - val_accuracy: 0.9997 - val_loss: 8.0634e-04
Epoch 2/6
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 266ms/step - accuracy: 0.9998 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 6.3299e-05
Epoch 3/6
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 325ms/step - accuracy: 0.9997 - loss: 0.0012 - val_accuracy: 0.9999 - val_loss: 8.6772e-04
Epoch 4/6
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 334ms/step - accuracy: 0.9999 - loss: 4.7051e-04 - val_accuracy: 1.0000 - val_loss: 2.1803e-06
Epoch 5/6
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m445s[0m 366ms/step - accuracy: 1.0000 - loss: 1.5669e-05 - val_accuracy: 0.9997 - val_loss: 0.0011
Epoch 6/6
[1m1216/1216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m350s[0m 288ms/step - accuracy: 0.9999 - loss: 2.9691e-04 - val_accura

Contd.

In [7]:
# ==============================================================
# PROBLEM 2 — PRICE–DISCONTINUATION PREDICTION
# Multi-input Deep Learning: Text + Numerical + Categorical
# ==============================================================
df = pd.read_csv(DATA_PATH)

print("Total rows:", len(df))

use_cols = [
    "brand_name", "active_ingredients", "primary_strength", "dosage_form",
    "manufacturer", "price_inr", "pack_size", "is_discontinued"
]

df = df[use_cols].dropna().reset_index(drop=True)
print("Rows after NA:", len(df))


# --------------------------------------------------------------
# 2. TEXT PREPROCESSING
# --------------------------------------------------------------
def safe_json_parse(x):
    try:
        return json.loads(str(x).replace("'", '"'))
    except:
        return x

def extract_ingredients(txt):
    parsed = safe_json_parse(txt)
    if isinstance(parsed, list):
        out = []
        for item in parsed:
            if isinstance(item, dict):
                out.append(item.get("name","") + " " + item.get("strength",""))
        return " ".join(out)
    return str(parsed)

def clean(t):
    t = str(t).lower()
    t = re.sub(r'[^a-z0-9\s/.-]', ' ', t)
    return re.sub(r'\s+',' ',t).strip()

df["text"] = (
    df["brand_name"].astype(str).apply(clean) + " | " +
    df["active_ingredients"].apply(extract_ingredients).apply(clean) + " | " +
    df["primary_strength"].astype(str).apply(clean) + " | " +
    df["dosage_form"].astype(str).apply(clean)
)

print(df["text"].head())


# --------------------------------------------------------------
# 3. ENCODE CATEGORICAL FEATURES
# --------------------------------------------------------------
man_le = LabelEncoder()
dos_le = LabelEncoder()

df["man_id"] = man_le.fit_transform(df["manufacturer"])
df["dos_id"] = dos_le.fit_transform(df["dosage_form"])

# --------------------------------------------------------------
# 4. TARGET VARIABLE
# --------------------------------------------------------------
df["target"] = df["is_discontinued"].astype(int)

# --------------------------------------------------------------
# 5. TRAIN / TEST SPLIT
# --------------------------------------------------------------
X_text = df["text"].values
X_man = df["man_id"].values
X_dos = df["dos_id"].values
X_price = df["price_inr"].values
X_pack = df["pack_size"].values
y = df["target"].values

X_train, X_test, man_train, man_test, dos_train, dos_test, price_train, price_test, pack_train, pack_test, y_train, y_test = train_test_split(
    X_text, X_man, X_dos, X_price, X_pack, y, 
    test_size=0.2, random_state=42, stratify=y
)

# --------------------------------------------------------------
# 6. VECTORIZER FOR TEXT
# --------------------------------------------------------------
MAX_TOKENS = 20000
MAX_LEN = 60

vectorizer = layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=MAX_LEN,
    standardize=None
)
vectorizer.adapt(X_train)


# --------------------------------------------------------------
# 7. BUILD MULTI-INPUT MODEL
# --------------------------------------------------------------

# ----- TEXT INPUT -----
text_in = layers.Input(shape=(1,), dtype=tf.string, name="text")
x = vectorizer(text_in)
x = layers.Embedding(MAX_TOKENS, 128)(x)
x = layers.Bidirectional(layers.LSTM(128))(x)

# ----- MANUFACTURER (embedding) -----
man_in = layers.Input(shape=(), dtype=tf.int32, name="man")
man_emb = layers.Embedding(len(man_le.classes_), 32)(man_in)
man_emb = layers.Flatten()(man_emb)

# ----- DOSAGE FORM (embedding) -----
dos_in = layers.Input(shape=(), dtype=tf.int32, name="dos")
dos_emb = layers.Embedding(len(dos_le.classes_), 8)(dos_in)
dos_emb = layers.Flatten()(dos_emb)

# ----- NUMERICAL FEATURES -----
price_in = layers.Input(shape=(1,), name="price")
pack_in = layers.Input(shape=(1,), name="pack")

num_concat = layers.Concatenate()([price_in, pack_in])
num_dense = layers.Dense(32, activation="relu")(num_concat)

# ----- MERGE ALL -----
merged = layers.Concatenate()([x, man_emb, dos_emb, num_dense])
merged = layers.Dense(128, activation="relu")(merged)
merged = layers.Dropout(0.3)(merged)

out = layers.Dense(1, activation="sigmoid")(merged)

model = models.Model(
    inputs=[text_in, man_in, dos_in, price_in, pack_in],
    outputs=out
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()


# --------------------------------------------------------------
# 8. TRAIN
# --------------------------------------------------------------
es = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

history = model.fit(
    {
        "text": X_train,
        "man": man_train,
        "dos": dos_train,
        "price": price_train,
        "pack": pack_train
    },
    y_train,
    validation_split=0.2,
    epochs=6,
    batch_size=128,
    callbacks=[es]
)


# --------------------------------------------------------------
# 9. EVALUATE
# --------------------------------------------------------------
y_pred = model.predict({
    "text": X_test,
    "man": man_test,
    "dos": dos_test,
    "price": price_test,
    "pack": pack_test
})

y_pred_labels = (y_pred > 0.5).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_labels))


# --------------------------------------------------------------
# 10. SAVE MODEL
# --------------------------------------------------------------
model.save("price_discontinuation_model.keras")

print("DONE — Price–Discontinuation Model Built Successfully.")


Total rows: 253973
Rows after NA: 208654
0    augmentin 625 duo tablet | amoxycillin 500mg c...
1    azithral 500 tablet | azithromycin 500mg | 500...
2    ascoril ls syrup | ambroxol 30mg/5ml levosalbu...
3    allegra 120mg tablet | fexofenadine 120mg | 12...
4    avil 25 tablet | pheniramine 25mg | 25mg | tablet
Name: text, dtype: object


Epoch 1/6
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m295s[0m 272ms/step - accuracy: 0.9639 - loss: 0.6712 - val_accuracy: 0.9715 - val_loss: 0.3471
Epoch 2/6
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 255ms/step - accuracy: 0.9692 - loss: 0.2331 - val_accuracy: 0.9719 - val_loss: 0.1746
Epoch 3/6
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 255ms/step - accuracy: 0.9719 - loss: 0.1346 - val_accuracy: 0.9733 - val_loss: 0.0838
Epoch 4/6
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 277ms/step - accuracy: 0.9736 - loss: 0.0720 - val_accuracy: 0.9729 - val_loss: 0.0873
Epoch 5/6
[1m1044/1044[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 288ms/step - accuracy: 0.9747 - loss: 0.0691 - val_accuracy: 0.9733 - val_loss: 0.0892
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 29ms/step
              precision    recall  f1-score   support

           0       0.97      1.