BIOBERT + SVM

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# 1️⃣ Load data
# Spreadsheet with the symptom / frequency / disorder columns used below.
file_path = '/content/drive/My Drive/textual_data/TEXTUAL_DATA/augmented_disorder_symptoms_1000.xlsx'
df = pd.read_excel(file_path)

# 2️⃣ Turn the disorder names into integer class ids (kept in a new column
# so the original names stay available through `le.classes_`).
le = LabelEncoder()
disorder_ids = le.fit_transform(df['disorder'])
df['disorder_encoded'] = disorder_ids

# 3️⃣ Load the BioBERT tokenizer and encoder from the same checkpoint.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)

# 4️⃣ Generate BioBERT embeddings using Mean Pooling + Add frequency
def get_embedding(text):
    """Return a mean-pooled BioBERT embedding for ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled), passed through the module-level ``model`` with gradient
    tracking disabled, and the last hidden states are averaged over the
    first remaining axis after the leading axis is squeezed.

    NOTE(review): the pooling assumes ``text`` is a single string, so the
    leading (batch) axis has size 1 and ``squeeze(0)`` removes it — confirm
    before calling with a list of texts.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        model_output = model(**encoded)
    hidden_states = model_output.last_hidden_state
    per_token = hidden_states.squeeze(0)
    pooled = torch.mean(per_token, dim=0)
    return pooled.numpy()

print("⚡ Generating BioBERT embeddings with frequency...")

# One feature vector per row: the symptom's embedding with the numeric
# frequency appended as a final extra feature (0.0 when it is missing).
feature_rows = []
for symptom_value, freq_value in zip(df['symptom'], df['frequency_num']):
    if pd.isna(freq_value):
        freq = 0.0
    else:
        freq = float(freq_value)
    symptom_vector = get_embedding(str(symptom_value))
    feature_rows.append(np.append(symptom_vector, freq))

embeddings = np.array(feature_rows)

# 5️⃣ Train-test split (stratified)
# Hold out 20% of the rows, keeping the class proportions in both parts.
targets = df['disorder_encoded']
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, targets, test_size=0.2, random_state=42, stratify=targets
)

# 6️⃣ Standardize the features
# The scaler is fitted on the training rows only and then applied to both
# parts, so no test-set statistics reach the model.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7️⃣ Train SVM Classifier
print("⚡ Training SVM Classifier...")
svm = SVC(kernel='rbf', C=10, gamma='scale', random_state=42)
svm = svm.fit(X_train_scaled, y_train)

# 8️⃣ Evaluate
# Predict on the held-out, already standardized test features.
y_pred = svm.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Final Accuracy (SVM): {accuracy:.4f}")

print("\n✅ Classification Report:")
# Pin `labels` to every encoded class so the report always has one row per
# entry in `target_names`; without it, a class that is absent from both
# y_test and y_pred makes classification_report raise a ValueError.
# Names are stringified because the report formats them as text, and
# zero_division=0 reports 0 for undefined per-class scores.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=[str(name) for name in le.classes_],
    zero_division=0,
))


BIOBERT + RF

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# 1️⃣ Load data
# Spreadsheet with the symptom and disorder columns used below.
file_path = '/content/drive/My Drive/textual_data/TEXTUAL_DATA/augmented_disorder_symptoms_1000.xlsx'
df = pd.read_excel(file_path)

# 2️⃣ Turn the disorder names into integer class ids (kept in a new column
# so the original names stay available through `le.classes_`).
le = LabelEncoder()
disorder_ids = le.fit_transform(df['disorder'])
df['disorder_encoded'] = disorder_ids

# 3️⃣ Load the BioBERT tokenizer and encoder from the same checkpoint.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)

# 4️⃣ Generate BioBERT embeddings
def get_embedding(text):
    """Return the first-token ([CLS]) BioBERT embedding of ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncation
    enabled) and passed through the module-level ``model`` with gradient
    tracking disabled. The hidden state at position 0 of the last layer is
    selected and every size-1 axis is squeezed away.

    NOTE(review): assumes ``text`` is a single string, so the result is a
    1-D vector — confirm before calling with a list of texts.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        model_output = model(**encoded)
    # Position 0 of the sequence axis holds the [CLS] token's hidden state.
    first_token_state = model_output.last_hidden_state[:, 0, :]
    return first_token_state.squeeze().numpy()

print("⚡ Generating BioBERT embeddings (this might take a few mins)...")
# Embed every symptom (coerced to str) one at a time, then stack the
# per-row vectors into a single array.
symptom_vectors = []
for symptom_value in df['symptom']:
    symptom_vectors.append(get_embedding(str(symptom_value)))
embeddings = np.array(symptom_vectors)

# 5️⃣ Train-test split
# Stratify on the encoded label, as the SVM pipeline in this file does, so
# class proportions are kept in both the training and the test part. An
# unstratified split can leave rare disorders out of the test set and makes
# the two models' scores harder to compare.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    df['disorder_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['disorder_encoded'],
)

# 6️⃣ Train Random Forest
# Tree ensembles do not depend on feature scale, so the raw embeddings are
# used without standardization.
print("⚡ Training Random Forest...")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 7️⃣ Evaluate
# Predict on the held-out test embeddings.
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Final Accuracy: {accuracy:.4f}")

print("\n✅ Classification Report:")
# Pin `labels` to every encoded class so the report always has one row per
# entry in `target_names`; without it, a class that is absent from both
# y_test and y_pred makes classification_report raise a ValueError.
# Names are stringified because the report formats them as text, and
# zero_division=0 reports 0 for undefined per-class scores.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=[str(name) for name in le.classes_],
    zero_division=0,
))
