In [19]:
#load tsv data

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import os


drug_test_data = pd.read_csv("/content/drugLibTest_raw.tsv", sep='\t')
drug_train_data = pd.read_csv("/content/drugLibTrain_raw.tsv", sep='\t')

#combine train and test data
drug_data = pd.concat([drug_train_data, drug_test_data], ignore_index=True)

#display first few rows of the dataframe
print(drug_data.head())

   Unnamed: 0       urlDrugName  rating         effectiveness  \
0        2202         enalapril       4      Highly Effective   
1        3117  ortho-tri-cyclen       1      Highly Effective   
2        1146           ponstel      10      Highly Effective   
3        3947          prilosec       3  Marginally Effective   
4        1951            lyrica       2  Marginally Effective   

           sideEffects                               condition  \
0    Mild Side Effects  management of congestive heart failure   
1  Severe Side Effects                        birth prevention   
2      No Side Effects                        menstrual cramps   
3    Mild Side Effects                             acid reflux   
4  Severe Side Effects                            fibromyalgia   

                                      benefitsReview  \
0  slowed the progression of left ventricular dys...   
1  Although this type of birth control has more c...   
2  I was used to having cramps so badly that

In [20]:
#check for missing value counts
print(drug_data.isnull().sum())

Unnamed: 0            0
urlDrugName           0
rating                0
effectiveness         0
sideEffects           0
condition             1
benefitsReview       23
sideEffectsReview    98
commentsReview       13
dtype: int64


In [21]:
#drop rows with missing values
drug_data = drug_data.dropna()

In [22]:
#create csv file so I can view it clearly
output_path = '/content/cleaned_drug_data.csv'

# Save the DataFrame to a CSV file
drug_data.to_csv(output_path, index=False)

# Now read the CSV file
pd.read_csv(output_path)

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above
...,...,...,...,...,...,...,...,...,...
4008,690,accutane,7,Considerably Effective,Severe Side Effects,acne vulgaris,Detoxing effect by pushing out the system thro...,"Hairloss, extreme dry skin, itchiness, raises ...",Treatment period is 3 months/12 weeks. Dosage ...
4009,1071,proair-hfa,10,Highly Effective,No Side Effects,asthma,"The albuterol relieved the constriction, irrit...",I have experienced no side effects.,I use the albuterol as needed because of aller...
4010,681,accutane,8,Considerably Effective,Moderate Side Effects,serve acne,Serve Acne has turned to middle,"Painfull muscles, problems with seeing at night","This drug is highly teratogenic ,females must ..."
4011,2709,divigel,10,Highly Effective,No Side Effects,menopause,"My overall mood, sense of well being, energy l...",No side effects of any kind were noted or appa...,Divigel is a topically applied Bio-Identical H...


In [23]:
#combine comments and reviews of drug effective, side effects, and other comments into one column

#Drop rows with missing or unnamed drug names
drug_data = drug_data.dropna(subset=["urlDrugName", "rating"])
drug_data = drug_data[~drug_data["urlDrugName"].str.lower().str.contains("unnamed", na=False)]

#Combine three text fields into one review column
drug_data["text"] = (
    drug_data[["benefitsReview", "sideEffectsReview", "commentsReview"]]
    .fillna("")
    .agg(" ".join, axis=1)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

#Drop rows where the combined text is empty
drug_data = drug_data[drug_data["text"].str.len() > 10]

#Drop rows where the combined text is empty
drug_data = drug_data[drug_data["text"].str.len() > 10]

#Keep only what we need
drug_data = drug_data[["urlDrugName", "rating", "effectiveness", "sideEffects", "condition", "text"]]


In [24]:
#describe data
import pandas as pd
drug_data = pd.concat([pd.read_csv("drugLibTrain_raw.tsv", sep="\t"), pd.read_csv("drugLibTest_raw.tsv", sep="\t")])
drug_data["text"] = drug_data[["benefitsReview","sideEffectsReview","commentsReview"]].fillna("").agg(" ".join, axis=1).str.strip()
print(f"Final dataset size after cleaning: {df[df['text'].str.len()>10].dropna(subset=['urlDrugName','rating']).shape[0]:,} reviews")



Final dataset size after cleaning: 4,142 reviews


In [25]:
#map out sentiment based on rating

def map_sentiment(r):
    if r <= 3:
        return "negative"
    elif r <= 6:
        return "neutral"
    else:
        return "positive"

drug_data["sentiment"] = drug_data["rating"].apply(map_sentiment)

#view distribution of sentiment

sentiment_dist = drug_data["sentiment"].value_counts(normalize=True).round(3)
print(sentiment_dist)

sentiment
positive    0.676
negative    0.181
neutral     0.143
Name: proportion, dtype: float64


In [26]:
#split x and y into train and test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    drug_data["text"], drug_data["sentiment"],
    test_size=0.2, random_state=42, stratify=drug_data["sentiment"]
)


In [27]:
#Step 4 — Baselines
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

pipelines = {
    "LinearSVC": Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, sublinear_tf=True)),
        ("clf", LinearSVC())
    ]),
    "LogReg": Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, sublinear_tf=True)),
        ("clf", LogisticRegression(max_iter=1000, n_jobs=-1))
    ])
}

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    acc = accuracy_score(y_test, preds)
    f1m = f1_score(y_test, preds, average="macro")
    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.3f} | Macro-F1: {f1m:.3f}")
    print(classification_report(y_test, preds, digits=3))
    print(confusion_matrix(y_test, preds))


=== LinearSVC ===
Accuracy: 0.767 | Macro-F1: 0.558
              precision    recall  f1-score   support

    negative      0.777     0.533     0.632       150
     neutral      0.520     0.109     0.181       119
    positive      0.775     0.970     0.861       560

    accuracy                          0.767       829
   macro avg      0.690     0.537     0.558       829
weighted avg      0.738     0.767     0.722       829

[[ 80   6  64]
 [ 12  13  94]
 [ 11   6 543]]

=== LogReg ===
Accuracy: 0.727 | Macro-F1: 0.428
              precision    recall  f1-score   support

    negative      0.955     0.280     0.433       150
     neutral      1.000     0.008     0.017       119
    positive      0.714     1.000     0.833       560

    accuracy                          0.727       829
   macro avg      0.890     0.429     0.428       829
weighted avg      0.799     0.727     0.644       829

[[ 42   0 108]
 [  2   1 116]
 [  0   0 560]]


In [28]:
%pip install evaluate transformers



In [29]:
import torch

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
import evaluate, json

print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU only")

GPU: CPU only


In [30]:
print(drug_data.columns)


Index(['Unnamed: 0', 'urlDrugName', 'rating', 'effectiveness', 'sideEffects',
       'condition', 'benefitsReview', 'sideEffectsReview', 'commentsReview',
       'text', 'sentiment'],
      dtype='object')


In [31]:
TEXT_COL = "text"
LABEL_COL = "sentiment"

#make sure labels are exactly as expected
import re
import numpy as np
import pandas as pd
from datasets import Dataset

drug_data = pd.read_csv("/content/cleaned_drug_data.csv")

#chec TEXT_COL exists
if TEXT_COL not in drug_data.columns:
    preferred = ["benefitsReview", "sideEffectsReview", "commentsReview"]
    present = [c for c in preferred if c in drug_data.columns]
    if not present:
        pat = re.compile(r"(benefit|side\s*effect|comment|review|text)", re.IGNORECASE)
        present = [c for c in drug_data.columns if pat.search(str(c))]
    if not present:
        raise KeyError("No text/review columns found to build 'text'.")
    drug_data[TEXT_COL] = (
        drug_data[present].fillna("").astype(str).agg(" ".join, axis=1)
        .str.replace(r"\s+", " ", regex=True).str.strip()
    )

#Check LABEL_COL exists (derive from rating)
if LABEL_COL not in drug_data.columns:
    rating_col = next((c for c in ["rating","Rating","RATING"] if c in drug_data.columns), None)
    if rating_col is None:
        raise KeyError("No 'sentiment' column and no 'rating' column to derive it from.")
    def map_sent(r):
        try:
            r = float(r)
        except Exception:
            return np.nan
        if r <= 3: return "negative"
        if r <= 6: return "neutral"
        return "positive"
    drug_data[LABEL_COL] = drug_data[rating_col].apply(map_sent)

#Clean text/labels
drug_data = drug_data.dropna(subset=[TEXT_COL, LABEL_COL])
drug_data[TEXT_COL] = drug_data[TEXT_COL].astype(str).str.strip()
drug_data = drug_data[drug_data[TEXT_COL].str.len() > 10]

#Map labels -> ids
label_order = ["negative","neutral","positive"]
label2id = {l:i for i,l in enumerate(label_order)}
id2label = {i:l for l,i in label2id.items()}
drug_data["label"] = drug_data[LABEL_COL].map(label2id)

#Safety check
bad = drug_data[drug_data["label"].isna()][LABEL_COL].unique()
if len(bad):
    raise ValueError(f"Unexpected labels found: {bad}. Expected one of {label_order}.")

# Train/val split
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(
    drug_data[[TEXT_COL,"label"]],
    test_size=0.2,
    random_state=42,
    stratify=drug_data["label"]
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds   = Dataset.from_pandas(val_df.reset_index(drop=True))
len(train_ds), len(val_ds), drug_data[LABEL_COL].value_counts()


(3210,
 803,
 sentiment
 positive    2707
 negative     721
 neutral      585
 Name: count, dtype: int64)

In [32]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
MAX_LEN = 256

tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize(batch):
    #Include the labels in the tokenized output
    tokenized_inputs = tok(batch["text"], truncation=True, max_length=MAX_LEN)
    tokenized_inputs["labels"] = batch["label"] #Add the labels
    return tokenized_inputs

train_tok = train_ds.map(tokenize, batched=True)
val_tok   = val_ds.map(tokenize,   batched=True)

collator = DataCollatorWithPadding(tokenizer=tok)

num_labels = 3
id2label = {0:"negative", 1:"neutral", 2:"positive"}
label2id = {"negative":0, "neutral":1, "positive":2}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/3210 [00:00<?, ? examples/s]

Map:   0%|          | 0/803 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import evaluate
from transformers import TrainingArguments, Trainer

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

args = TrainingArguments(
    output_dir="bert-drug-sentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=16,   #lower to 8 if you hit OOM
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch", #Changed from evaluation_strategy
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics
)

train_result = trainer.train()
eval_result  = trainer.evaluate()
eval_result

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

pred = trainer.predict(val_tok)
y_pred = pred.predictions.argmax(axis=1)
y_true = np.array(val_df["label"])

print(classification_report(y_true, y_pred, target_names=["negative","neutral","positive"], digits=3))

cm = confusion_matrix(y_true, y_pred, labels=[0,1,2])
plt.figure()
plt.imshow(cm)
plt.title("Confusion Matrix - ClinicalBERT")
plt.xlabel("Predicted"); plt.ylabel("True")
plt.xticks([0,1,2], ["negative","neutral","positive"])
plt.yticks([0,1,2], ["negative","neutral","positive"])
for (i,j), z in np.ndenumerate(cm):
    plt.text(j, i, str(z), ha="center", va="center")
plt.tight_layout()
plt.show()


In [None]:
SAVE_DIR = "bert-drug-sentiment-best"
trainer.save_model(SAVE_DIR)
tok.save_pretrained(SAVE_DIR)

from transformers import TextClassificationPipeline
pipe = TextClassificationPipeline(model=trainer.model, tokenizer=tok, return_all_scores=False, truncation=True)

def predict_sentiment(texts):
    if isinstance(texts, str): texts = [texts]
    return [o["label"] for o in pipe(texts)]

#sanity check
predict_sentiment([
    "Worked great with no side effects.",
    "Did not help and made me nauseous."
])

In [None]:
#Final evaluation & comparison
#confusion matrices by percentages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, f1_score, recall_score
)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

#Helper functions

def to_names(int_labels):
    return [id2label[int(i)] for i in int_labels]

def to_ints(name_labels):
    return [label2id[str(n)] for n in name_labels]

def metrics_summary(y_true_int, y_pred_int, model_name):
    acc  = accuracy_score(y_true_int, y_pred_int)
    f1m  = f1_score(y_true_int, y_pred_int, average="macro")
    recm = recall_score(y_true_int, y_pred_int, average="macro")
    return {"model": model_name, "accuracy": acc, "macro_recall": recm, "macro_f1": f1m}

def confusion_percent(cm_counts):
    """Return row-normalized confusion matrix in percentages."""
    with np.errstate(invalid="ignore", divide="ignore"):
        row_sums = cm_counts.sum(axis=1, keepdims=True)
        return np.where(row_sums > 0, (cm_counts / row_sums) * 100.0, 0.0)

def print_tn_fp_fn_tp(y_true, y_pred, labels):
    """Print TN, FP, FN, TP per class (percentages)."""
    total = len(y_true)
    for i, lab in enumerate(labels):
        y_true_bin = (y_true == i).astype(int)
        y_pred_bin = (y_pred == i).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin).ravel()
        tn_p, fp_p, fn_p, tp_p = [x / total * 100 for x in [tn, fp, fn, tp]]
        print(f"\nClass '{label_order[i]}' ({i}):")
        print(f"  True Negatives:  {tn_p:6.2f}%")
        print(f"  False Positives: {fp_p:6.2f}%")
        print(f"  False Negatives: {fn_p:6.2f}%")
        print(f"  True Positives:  {tp_p:6.2f}%")


#1)Prepare ground truth

X_train = train_df[TEXT_COL].astype(str).tolist()
y_train_int = train_df["label"].astype(int).to_numpy()

X_val = val_df[TEXT_COL].astype(str).tolist()
y_val_int = val_df["label"].astype(int).to_numpy()
y_val_names = to_names(y_val_int)


#2)ClinicalBERT predictions

bert_logits = trainer.predict(val_tok).predictions
bert_pred_int = bert_logits.argmax(axis=1)
bert_metrics = metrics_summary(y_val_int, bert_pred_int, "ClinicalBERT")
cm_bert = confusion_matrix(y_val_int, bert_pred_int, labels=[0,1,2])

print("\n=== ClinicalBERT: Classification Report ===")
print(classification_report(y_val_int, bert_pred_int, target_names=label_order, digits=3))
print_tn_fp_fn_tp(y_val_int, bert_pred_int, labels=[0,1,2])


#3)Classical baselines

pipelines = {
    "TFIDF+LinearSVC": Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, sublinear_tf=True)),
        ("clf", LinearSVC())
    ]),
    "TFIDF+LogReg": Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, sublinear_tf=True)),
        ("clf", LogisticRegression(max_iter=1000, n_jobs=-1))
    ])
}

y_train_names = to_names(y_train_int)
y_val_names   = to_names(y_val_int)
classical_rows = []
cms = {"ClinicalBERT": cm_bert}

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train_names)
    preds_names = pipe.predict(X_val)
    preds_int   = np.array(to_ints(preds_names))
    cm = confusion_matrix(y_val_int, preds_int, labels=[0,1,2])
    cms[name] = cm

    print(f"\n=== {name}: Classification Report ===")
    print(classification_report(y_val_names, preds_names, target_names=label_order, digits=3))
    print_tn_fp_fn_tp(y_val_int, preds_int, labels=[0,1,2])

    row = metrics_summary(y_val_int, preds_int, name)
    classical_rows.append(row)


#4)Confusion Matrices Side by Side

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, cm) in zip(axes, cms.items()):
    cm_pct = confusion_percent(cm)
    im = ax.imshow(cm_pct, cmap="Blues")
    ax.set_title(f"{name}", fontsize=11)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_xticks(range(len(label_order)))
    ax.set_yticks(range(len(label_order)))
    ax.set_xticklabels(label_order)
    ax.set_yticklabels(label_order)
    for (i, j), val in np.ndenumerate(cm_pct):
        ax.text(j, i, f"{val:.1f}%", ha="center", va="center", fontsize=9)
fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, label="Row %")
fig.suptitle("Confusion Matrices (Percent) — ClinicalBERT vs Classical Models", fontsize=13)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


#5)Comparison Table
rows = [bert_metrics] + classical_rows
perf_df = pd.DataFrame(rows).sort_values("macro_f1", ascending=False).reset_index(drop=True)
display(perf_df.style.format({"accuracy": "{:.3f}", "macro_recall": "{:.3f}", "macro_f1": "{:.3f}"}))
perf_df.to_csv("model_performance_comparison.csv", index=False)
print("\nSaved: model_performance_comparison.csv")


#6)Interpretation Summary
best_row = perf_df.iloc[0]
print("\nInterpretation Notes:")
print(f"- Best macro-F1: {best_row['model']} "
      f"(macro-F1={best_row['macro_f1']:.3f}, "
      f"accuracy={best_row['accuracy']:.3f}, "
      f"macro-recall={best_row['macro_recall']:.3f}).")
print("- Confusion matrices above are row-normalized (% of true labels).")
print("- TN/FP/FN/TP values indicate per-class tradeoffs.")
print("- Neutral class usually has higher confusion due to mixed wording.")
print("- Class-weighted fine-tuning or collecting more neutral data can improve performance.")
