In [None]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import torch
from transformers import AutoTokenizer, AutoModel

# 2. Load the dataset, encode the intents, and split it
df = pd.read_csv("random_augmented_balanced_dataset.csv")

# Learn the intent -> integer id mapping, then store the ids in a 'label' column.
label_encoder = LabelEncoder().fit(df['intent'])
df['label'] = label_encoder.transform(df['intent'])

# Stratified 80/10/10 split: hold out 20% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets.
# NOTE(review): the file name suggests the data was augmented before this
# split; if augmented variants of one utterance can land in different splits,
# the test score will be optimistic -- confirm against the dataset pipeline.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

# 3. BioBERT Embeddings (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "dmis-lab/biobert-base-cased-v1.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# trust_remote_code is intentionally NOT enabled: it lets the Hub repository
# execute arbitrary Python on this machine, and this checkpoint is served by
# the BERT implementation that ships with transformers, so it is not needed.
bert_model = AutoModel.from_pretrained(model_name, use_safetensors=True).to(device)
# The model is only used for feature extraction: eval() switches off dropout
# so repeated calls on the same text give the same embedding.
bert_model.eval()

def get_bert_embeddings(texts, max_len=192, batch_size=32):
    """Return the BioBERT [CLS] embedding of every text as a 2-D array.

    Texts are encoded in mini-batches with the module-level ``tokenizer`` and
    ``bert_model`` on ``device``; gradients are disabled.

    Args:
        texts: Iterable of strings to embed.
        max_len: Maximum number of tokens per text; longer inputs are truncated.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        ``numpy.ndarray`` of shape ``(len(texts), hidden_size)``. An empty
        input yields shape ``(0, hidden_size)`` so the result can still be
        concatenated column-wise with other feature matrices.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(texts)
    if not texts:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Pad only to the longest sequence in the batch: the attention
            # mask hides padding from the model, so padding every text out to
            # max_len just adds compute.
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_len,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = bert_model(**inputs)
            # Position 0 of the last hidden layer is the [CLS] token; the
            # explicit slice keeps the batch axis even for a single text.
            chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(chunks, axis=0)

# Embed the train, validation and test texts (in that order) with BioBERT.
X_train_bert, X_val_bert, X_test_bert = (
    get_bert_embeddings(split_frame['text'].tolist())
    for split_frame in (train_df, val_df, test_df)
)

# 4. Bag-of-Words (CPU)
# Binary presence/absence features over at most 3000 vocabulary terms. The
# vocabulary is learned from the training texts only and then reused as-is
# for the validation and test texts.
bow_vectorizer = CountVectorizer(max_features=3000, binary=True)
X_train_bow = bow_vectorizer.fit_transform(train_df['text']).toarray()
X_val_bow, X_test_bow = (
    bow_vectorizer.transform(split_frame['text']).toarray()
    for split_frame in (val_df, test_df)
)

# 5. Concatenate BERT + BoW
# Each row becomes [BERT embedding | BoW indicators], joined along the
# feature (column) axis.
X_train_combined, X_val_combined, X_test_combined = (
    np.concatenate(feature_pair, axis=1)
    for feature_pair in (
        (X_train_bert, X_train_bow),
        (X_val_bert, X_val_bow),
        (X_test_bert, X_test_bow),
    )
)

# Integer class ids for each split, in the same row order as the features.
y_train, y_val, y_test = (
    split_frame['label'].values for split_frame in (train_df, val_df, test_df)
)

# 6. Soft Voting Ensemble Classifier
# The random forest and the SVC probability calibration both draw random
# numbers; seed them (same seed as the data split) so the reported metrics
# can be reproduced from run to run.
clf1 = LogisticRegression(max_iter=1000)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf4 = SVC(probability=True, random_state=42)  # probability=True is needed for soft voting

# Soft voting averages the per-class probabilities of the three estimators.
ensemble = VotingClassifier(estimators=[
    ('lr', clf1),
    ('rf', clf2),
    ('svc', clf4)
], voting='soft')

ensemble.fit(X_train_combined, y_train)

# 7. Evaluate on the held-out test split
y_pred = ensemble.predict(X_test_combined)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
# Pass the full list of class ids explicitly: LabelEncoder numbers classes
# 0..n-1 in the order of classes_, so every name stays aligned with its id
# even if some class is absent from y_test / y_pred.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

Test Accuracy: 0.6876657824933687

Classification Report:

                        precision    recall  f1-score   support

    applicable disease       0.76      0.80      0.78       171
              ask more       0.72      0.76      0.74       153
                 cause       0.65      0.65      0.65       136
            definition       0.71      0.81      0.76       158
disease manifestations       0.61      0.56      0.59       137
                method       0.75      0.68      0.71       156
              symptoms       0.61      0.60      0.60       131
             treatment       0.56      0.45      0.50       150
      treatment method       0.62      0.66      0.64       157
                 usage       0.81      0.85      0.83       159

              accuracy                           0.69      1508
             macro avg       0.68      0.68      0.68      1508
          weighted avg       0.68      0.69      0.68      1508

