In [None]:
!pip install pandas numpy scikit-learn nltk
!pip install -q transformers sentence-transformers xgboost scipy



# Adverse-event detection: training the ensemble classifier

In [None]:
import ast
import io
import os

import joblib
import numpy as np
import pandas as pd

from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score

from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier




In [None]:
# Upload the labelled dataset through the Colab file picker. `files.upload()` returns a
# mapping of file name -> uploaded bytes.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; expected the labelled dialogues CSV.")

# Parse the bytes that were just uploaded instead of re-opening a hard-coded path.
# When a file with the same name already exists, Colab stores the new upload under a
# suffixed name (e.g. "labeled_dialogues_groq (1).csv"), so reading the fixed name
# would silently load the older copy.
uploaded_bytes = next(iter(uploaded.values()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

# The "labels" column holds Python list literals serialised as strings.
df["labels"] = df["labels"].apply(ast.literal_eval)

# Model input is the dialogue followed by the clinical note. Missing cells are replaced
# with an empty string first; otherwise `astype(str)` would inject the token "nan".
df["text"] = (
    df["dialogue"].fillna("").astype(str)
    + " "
    + df["clinical_note"].fillna("").astype(str)
)

# Binary target: 0 when the sentinel label "no_adverse_event" is present, else 1.
df["has_adverse_event"] = df["labels"].apply(
    lambda x: 0 if "no_adverse_event" in x else 1
)

# Multi-label target: the label list with the sentinel removed.
df["labels_clean"] = df["labels"].apply(
    lambda x: [l for l in x if l != "no_adverse_event"]
)

Saving labeled_dialogues_groq.csv to labeled_dialogues_groq (1).csv


In [None]:
# Model input: the combined dialogue + clinical-note text.
X = df["text"]

# Turn the cleaned label lists into a binary indicator matrix (one column per label);
# the column order is given by `mlb.classes_`.
mlb = MultiLabelBinarizer()
y_multi = mlb.fit_transform(df["labels_clean"])
label_classes = mlb.classes_

# One call splits the text, the binary target and the multi-label matrix together, so
# all three share the same row assignment. Stratification uses the binary flag only.
(
    X_train, X_test,
    y_bin_train, y_bin_test,
    y_mul_train, y_mul_test,
) = train_test_split(
    X, df["has_adverse_event"], y_multi,
    stratify=df["has_adverse_event"],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Word uni-/bi-gram TF-IDF features, capped at 10,000 terms, English stop words removed.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=10000,
)

# The vectoriser is fitted on the training split only; the test split is just transformed.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
# Sentence-embedding encoder; the weights are fetched from the Hugging Face Hub.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode the training split first, then the test split, as dense NumPy arrays.
X_train_bert, X_test_bert = (
    bert_model.encode(split.tolist(), convert_to_numpy=True)
    for split in (X_train, X_test)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Class frequencies of the binary target in the training split (keyed by label 0 / 1).
class_counts = y_bin_train.value_counts()

# Gate model: "any adverse event?" on TF-IDF features. `scale_pos_weight` is the
# negative-to-positive count ratio, which re-weights the positive class.
binary_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    scale_pos_weight=class_counts[0] / class_counts[1],
    eval_metric="logloss",
    random_state=42,
)

binary_model.fit(X_train_tfidf, y_bin_train)

In [None]:
# --- Ensemble member 1: one-vs-rest logistic regression on TF-IDF features ---
lr_model = OneVsRestClassifier(
    LogisticRegression(class_weight="balanced", max_iter=1000)
)
lr_model.fit(X_train_tfidf, y_mul_train)

# --- Ensemble member 2: one independent XGBoost classifier per label column ---
xgb_params = dict(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
)
# `fit` returns the fitted estimator, so each dict value is a trained model.
xgb_models = {
    label: XGBClassifier(**xgb_params).fit(X_train_tfidf, y_mul_train[:, col])
    for col, label in enumerate(label_classes)
}

# --- Ensemble member 3: one-vs-rest logistic regression on sentence embeddings ---
lr_bert = OneVsRestClassifier(
    LogisticRegression(class_weight="balanced", max_iter=1000)
)
lr_bert.fit(X_train_bert, y_mul_train)


In [None]:
def predict_categories_ensemble(text, weights=(0.2, 0.5, 0.3)):
    """Blend the three multi-label models into one score per label for a single text.

    Parameters
    ----------
    text : str
        Raw input text; it is vectorised with `tfidf` and embedded with `bert_model`.
    weights : sequence of three numbers
        Multipliers applied, in order, to the TF-IDF logistic regression, the
        per-label XGBoost models and the embedding logistic regression.

    Returns
    -------
    dict
        Maps each entry of `label_classes` to its weighted sum of model probabilities.
    """
    sparse_features = tfidf.transform([text])

    tfidf_lr_scores = lr_model.predict_proba(sparse_features)[0]
    # Each per-label XGBoost model is binary; column 1 is the positive-class probability.
    xgb_scores = np.array([
        xgb_models[name].predict_proba(sparse_features)[0][1]
        for name in label_classes
    ])

    embedding = bert_model.encode([text], convert_to_numpy=True)
    bert_lr_scores = lr_bert.predict_proba(embedding)[0]

    w_lr, w_xgb, w_bert = weights[0], weights[1], weights[2]
    blended = w_lr*tfidf_lr_scores + w_xgb*xgb_scores + w_bert*bert_lr_scores

    return {name: score for name, score in zip(label_classes, blended)}


In [None]:
def optimize_thresholds(y_true, y_prob, labels):
    """Choose, for every label, the decision threshold that maximises F1.

    Parameters
    ----------
    y_true : 2-D array
        Binary ground truth, indexed as ``[sample, label]``.
    y_prob : 2-D array
        Predicted scores with the same layout as `y_true`.
    labels : iterable
        Label names; the i-th name corresponds to column ``i``.

    Returns
    -------
    dict
        ``label -> threshold`` as plain Python floats. Candidates run from 0.10 to
        0.90 in steps of 0.01; ties keep the lowest candidate, and 0.5 is used when no
        candidate reaches an F1 above zero.
    """
    # `linspace` + rounding yields exact two-decimal candidates. A float-step `arange`
    # accumulates error (e.g. 0.30000000000000004), which then ends up in saved thresholds.
    candidates = np.round(np.linspace(0.10, 0.90, 81), 2)

    thresholds = {}
    for i, label in enumerate(labels):
        best_f1, best_t = 0, 0.5
        for t in candidates:
            preds = (y_prob[:, i] >= t).astype(int)
            # zero_division=0 keeps the score at 0 (as before) for thresholds that
            # predict no positives, without emitting a warning for each of them.
            f1 = f1_score(y_true[:, i], preds, zero_division=0)
            if f1 > best_f1:
                best_f1, best_t = f1, t
        # Store a built-in float so the mapping pickles/serialises without NumPy scalars.
        thresholds[label] = float(best_t)
    return thresholds


# Score every text once and lay the per-label probabilities out in `label_classes`
# column order. (Calling the ensemble inside the inner per-label loop would repeat the
# full TF-IDF + XGBoost + embedding inference once per label for the same text.)
#
# NOTE(review): despite the `y_val_probs` name, these scores come from the *test* split,
# and the thresholds tuned here are later evaluated on that same split, so the reported
# metrics are optimistic. A separate validation split would remove this leakage.
y_val_probs = np.array([
    [label_probs[l] for l in label_classes]
    for label_probs in map(predict_categories_ensemble, X_test)
])

optimal_thresholds = optimize_thresholds(
    y_mul_test, y_val_probs, label_classes
)

In [None]:
# Clinical risk tiers; labels in neither set are treated as low risk below.
HIGH_RISK = {"emergency", "allergic_reaction", "medication_error"}
MEDIUM_RISK = {"infection", "symptom_worsening", "side_effects"}

# Shift each tuned threshold according to its tier: high risk is lowered by 0.15
# (floor 0.25), medium risk by 0.10 (floor 0.35), everything else raised by 0.10
# (cap 0.65).
# NOTE: `optimal_thresholds` is updated in place, so re-running this cell without
# re-running the threshold optimisation applies the shift a second time.
for label in label_classes:
    tuned = optimal_thresholds[label]
    if label in HIGH_RISK:
        adjusted = max(0.25, tuned - 0.15)
    elif label in MEDIUM_RISK:
        adjusted = max(0.35, tuned - 0.10)
    else:
        adjusted = min(0.65, tuned + 0.10)
    optimal_thresholds[label] = adjusted


In [None]:
def apply_thresholds_with_fallback(probs, threshold_map):
    """Return the labels whose probability reaches their per-label threshold.

    Parameters
    ----------
    probs : dict
        ``label -> probability``.
    threshold_map : dict
        ``label -> threshold``; must contain every key of `probs`.

    Returns
    -------
    list
        Labels with ``probs[label] >= threshold_map[label]``, in the iteration order
        of `probs`. If none qualifies, the single highest-probability label is
        returned instead. An empty `probs` yields an empty list.

    Raises
    ------
    KeyError
        If a label in `probs` has no entry in `threshold_map`.
    """
    if not probs:
        # Nothing to choose from; `max()` on an empty mapping would raise ValueError.
        return []

    active = [label for label, p in probs.items() if p >= threshold_map[label]]
    if active:
        return active

    # Fallback: always report the most likely label rather than an empty prediction.
    return [max(probs, key=probs.get)]


In [None]:
def best_adverse_event_pipeline(text, binary_threshold=0.2):
    """Run the two-stage adverse-event prediction for a single text.

    Stage 1 scores the text with the binary gate model. If the adverse-event
    probability is below `binary_threshold`, the text is reported as
    ``"no_adverse_event"``. Otherwise stage 2 scores every category with the ensemble,
    applies the per-label thresholds and derives a risk level from the active labels.

    Parameters
    ----------
    text : str
        Raw input text.
    binary_threshold : float, default 0.2
        Gate probability below which no category prediction is attempted.

    Returns
    -------
    dict
        Always contains ``"predicted_categories"`` (list), ``"risk_level"``
        (``"HIGH"``, ``"MEDIUM"`` or ``"LOW"``) and ``"confidence"`` (float rounded to
        3 decimals). When the gate is passed it also contains
        ``"category_probabilities"`` (``label -> float``). All numbers are built-in
        floats, so the result can be serialised with `json`.
    """
    vec = tfidf.transform([text])
    # Convert the NumPy scalar (typically float32 from XGBoost) to a built-in float;
    # otherwise `round` returns a NumPy scalar that is neither cleanly rounded nor
    # JSON-serialisable.
    binary_prob = float(binary_model.predict_proba(vec)[0][1])

    if binary_prob < binary_threshold:
        return {
            "predicted_categories": ["no_adverse_event"],
            "risk_level": "LOW",
            # Confidence in the *negative* decision.
            "confidence": round(1 - binary_prob, 3)
        }

    probs = {
        label: float(p)
        for label, p in predict_categories_ensemble(text).items()
    }
    active_labels = apply_thresholds_with_fallback(probs, optimal_thresholds)

    # The most severe tier among the active labels determines the risk level.
    if any(l in HIGH_RISK for l in active_labels):
        risk = "HIGH"
    elif any(l in MEDIUM_RISK for l in active_labels):
        risk = "MEDIUM"
    else:
        risk = "LOW"

    return {
        "predicted_categories": active_labels,
        "risk_level": risk,
        # Highest ensemble score over all categories (not only the active ones).
        "confidence": round(max(probs.values()), 3),
        "category_probabilities": probs
    }


In [None]:
def evaluate_ensemble(X_test, y_test):
    """Print a per-label classification report for the full two-stage pipeline.

    Every text is passed through `best_adverse_event_pipeline`, so the evaluation
    uses exactly the gate and thresholding logic that serves predictions rather than
    a second copy that could drift from it.

    Parameters
    ----------
    X_test : iterable of str
        Texts to score.
    y_test : 2-D array
        Binary ground truth with one column per entry of `label_classes`.

    Returns
    -------
    None
        The report is printed.
    """
    predicted = [
        best_adverse_event_pipeline(text)["predicted_categories"]
        for text in X_test
    ]

    # Indicator row per sample. A gated-out sample carries only "no_adverse_event",
    # which is not one of `label_classes`, so its row is all zeros.
    preds = np.array([
        [1 if l in labels else 0 for l in label_classes]
        for labels in predicted
    ])

    print("ENSEMBLE MODEL RESULTS (FIXED)")
    print(classification_report(
        y_test,
        preds,
        target_names=label_classes,
        zero_division=0
    ))


# Report per-label metrics on the test split.
# NOTE(review): the decision thresholds were tuned on this same split (X_test / y_mul_test),
# so these scores are optimistic rather than a true held-out estimate.
evaluate_ensemble(X_test, y_mul_test)


ENSEMBLE MODEL RESULTS (FIXED)
                   precision    recall  f1-score   support

allergic_reaction       0.41      0.65      0.50        20
        emergency       0.31      0.53      0.39        19
        infection       0.49      0.68      0.57        38
 medication_error       0.29      0.71      0.41        28
   non_compliance       0.80      0.25      0.38        16
     side_effects       0.59      0.57      0.58        42
symptom_worsening       0.66      0.90      0.76        67

        micro avg       0.48      0.68      0.57       230
        macro avg       0.51      0.61      0.51       230
     weighted avg       0.53      0.68      0.57       230
      samples avg       0.19      0.21      0.19       230



In [None]:
# Directory that receives every trained artifact (models, vectoriser, thresholds, encoder).
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)


In [None]:
# Persist the trained classifiers, in this order.
# NOTE: "lr_clinical_bert.pkl" holds `lr_bert`, the logistic regression trained on the
# all-MiniLM-L6-v2 sentence embeddings; the file name is kept as-is for existing loaders.
for artifact, target_path in (
    (binary_model, f"{SAVE_DIR}/binary_model.pkl"),
    (lr_model, f"{SAVE_DIR}/lr_tfidf.pkl"),
    (lr_bert, f"{SAVE_DIR}/lr_clinical_bert.pkl"),
    (xgb_models, f"{SAVE_DIR}/xgb_models.pkl"),
):
    joblib.dump(artifact, target_path)


['saved_models/xgb_models.pkl']

In [None]:
# Persist the preprocessing objects and decision thresholds needed at inference time:
# the fitted TF-IDF vectoriser, the label binariser, the label order, and the
# risk-adjusted per-label thresholds.
for artifact, target_path in (
    (tfidf, f"{SAVE_DIR}/tfidf.pkl"),
    (mlb, f"{SAVE_DIR}/mlb.pkl"),
    (label_classes, f"{SAVE_DIR}/label_classes.pkl"),
    (optimal_thresholds, f"{SAVE_DIR}/optimal_thresholds.pkl"),
):
    joblib.dump(artifact, target_path)

['saved_models/optimal_thresholds.pkl']

In [None]:
# Write the SentenceTransformer encoder to its own sub-directory of SAVE_DIR, using the
# library's own `save` method rather than joblib.
bert_model.save(f"{SAVE_DIR}/bert_encoder")
