In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
df = pd.read_csv("random_augmented_balanced_dataset.csv")[['text', 'intent']].dropna()

# 80/10/10 split
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['intent'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['intent'], random_state=42)

# Label encoding
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['intent'])
y_val = label_encoder.transform(val_df['intent'])
y_test = label_encoder.transform(test_df['intent'])

# BoW Vectorizer
bow_vectorizer = CountVectorizer(max_features=6000, stop_words='english', binary=True)
X_bow_train = bow_vectorizer.fit_transform(train_df['text']).toarray()
X_bow_val = bow_vectorizer.transform(val_df['text']).toarray()
X_bow_test = bow_vectorizer.transform(test_df['text']).toarray()

# BioBERT setup
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name, trust_remote_code=True, use_safetensors=True).to(device)
bert_model.eval()

def get_cls_embeddings(texts, tokenizer, model, device, max_len=192):
    embeddings = []
    for text in tqdm(texts, desc="BERT Embeddings"):
        encoding = tokenizer(text, truncation=True, padding='max_length', max_length=max_len, return_tensors="pt")
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embedding = output.last_hidden_state[:, 0, :]
            embeddings.append(cls_embedding.cpu().numpy()[0])
    return np.array(embeddings)

# Get BioBERT embeddings
X_bert_train = get_cls_embeddings(train_df['text'].tolist(), tokenizer, bert_model, device)
X_bert_val = get_cls_embeddings(val_df['text'].tolist(), tokenizer, bert_model, device)
X_bert_test = get_cls_embeddings(test_df['text'].tolist(), tokenizer, bert_model, device)

# Concatenate BoW + BioBERT embeddings
X_train = np.concatenate([X_bert_train, X_bow_train], axis=1)
X_val = np.concatenate([X_bert_val, X_bow_val], axis=1)
X_test = np.concatenate([X_bert_test, X_bow_test], axis=1)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Define ML models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=300, max_depth=30, class_weight='balanced_subsample', n_jobs=-1),
    "SVM (Linear)": SVC(kernel='linear', C=1),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(max_depth=30),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200),
    "Naive Bayes": MultinomialNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "LightGBM": LGBMClassifier()
}

# Train and evaluate all models
for name, model in models.items():
    try:
        print(f"\n===== {name} =====")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"Test Accuracy: {acc:.4f}")
        print("Classification Report:")
        print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    except Exception as e:
        print(f"⚠️ Error with {name}: {e}")


  from .autonotebook import tqdm as notebook_tqdm
BERT Embeddings: 100%|██████████| 12059/12059 [01:08<00:00, 174.77it/s]
BERT Embeddings: 100%|██████████| 1507/1507 [00:08<00:00, 170.21it/s]
BERT Embeddings: 100%|██████████| 1508/1508 [00:08<00:00, 169.38it/s]



===== Logistic Regression =====
Test Accuracy: 0.6711
Classification Report:
                        precision    recall  f1-score   support

    applicable disease       0.83      0.80      0.81       171
              ask more       0.76      0.72      0.74       153
                 cause       0.59      0.55      0.57       136
            definition       0.80      0.79      0.79       158
disease manifestations       0.57      0.58      0.57       137
                method       0.71      0.69      0.70       156
              symptoms       0.57      0.58      0.58       131
             treatment       0.41      0.45      0.43       150
      treatment method       0.57      0.64      0.60       157
                 usage       0.91      0.84      0.88       159

              accuracy                           0.67      1508
             macro avg       0.67      0.66      0.67      1508
          weighted avg       0.68      0.67      0.67      1508


===== Random Forest ==

KeyboardInterrupt: 