BioGPT + XGBoost

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

# 1️⃣ Read the symptom spreadsheet into a DataFrame
file_path = '/content/drive/My Drive/textual_data/TEXTUAL_DATA/augmented_disorder_symptoms_1000.xlsx'
df = pd.read_excel(file_path)

# 2️⃣ Map every disorder name to an integer class id.
#    The fitted encoder is kept in `le` so ids can be mapped back to names
#    through `le.classes_` when reporting results.
le = LabelEncoder().fit(df['disorder'])
df['disorder_encoded'] = le.transform(df['disorder'])

# 3️⃣ Fetch the pretrained BioGPT tokenizer and model weights
biogpt_checkpoint = "microsoft/biogpt"
tokenizer = AutoTokenizer.from_pretrained(biogpt_checkpoint)
model = AutoModel.from_pretrained(biogpt_checkpoint)

# 4️⃣ Generate embeddings using mean pooling + add frequency
def get_embedding(text):
    """Return a mean-pooled BioGPT embedding for ``text``.

    The text is tokenized (with truncation) by the module-level
    ``tokenizer`` and run through the module-level ``model`` with gradient
    tracking disabled. The last hidden states are then averaged over the
    token axis.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array containing the mean of the last-layer token vectors.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state
    # Drop the leading batch axis, then average across the remaining first
    # axis (the tokens).
    pooled = last_hidden.squeeze(0).mean(dim=0)
    return pooled.numpy()

print("⚡ Generating BioGPT embeddings with frequency...")

# Embed every distinct symptom text only once: rows that repeat a text reuse
# the cached vector instead of re-running the BioGPT forward pass.
symptom_texts = [str(symptom) for symptom in df['symptom']]
embedding_cache = {}
for text in symptom_texts:
    if text not in embedding_cache:
        embedding_cache[text] = get_embedding(text)

# Missing frequencies are imputed with 0.0.
frequencies = [0.0 if pd.isna(freq) else float(freq) for freq in df['frequency_num']]

# One feature row per record: pooled embedding followed by the frequency.
embeddings = [
    np.append(embedding_cache[text], freq)
    for text, freq in zip(symptom_texts, frequencies)
]

X = np.array(embeddings)
y = df['disorder_encoded'].values

# 5️⃣ Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 6️⃣ Scale features (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7️⃣ Train XGBoost
# NOTE(review): `use_label_encoder` is reported as deprecated by recent
# XGBoost releases — confirm against the installed version before removing.
print("⚡ Training XGBoost...")
xgb = XGBClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
xgb.fit(X_train_scaled, y_train)

# 8️⃣ Evaluate
y_pred = xgb.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Final Accuracy (BioGPT + XGBoost): {accuracy:.4f}")

print("\n✅ Classification Report:")
# Pass the full label set explicitly: without `labels`, the report raises a
# ValueError when a class is absent from both y_test and y_pred, because the
# number of observed labels no longer matches `target_names`.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=[str(name) for name in le.classes_],
))


BioGPT + SVM

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# 1️⃣ Read the symptom spreadsheet into a DataFrame
file_path = '/content/drive/My Drive/textual_data/TEXTUAL_DATA/augmented_disorder_symptoms_1000.xlsx'
df = pd.read_excel(file_path)

# 2️⃣ Turn disorder names into integer class ids; the fitted encoder stays in
#    `le` so `le.classes_` can label the evaluation report.
le = LabelEncoder().fit(df['disorder'])
df['disorder_encoded'] = le.transform(df['disorder'])

# 3️⃣ Fetch the pretrained BioGPT tokenizer and model weights
biogpt_checkpoint = "microsoft/biogpt"
tokenizer = AutoTokenizer.from_pretrained(biogpt_checkpoint)
model = AutoModel.from_pretrained(biogpt_checkpoint)

# 4️⃣ Generate BioGPT embeddings (mean pooling) + Add frequency
def get_embedding(text):
    """Embed ``text`` with BioGPT using mean pooling.

    The module-level ``tokenizer`` encodes the text (truncating if needed)
    and the module-level ``model`` is evaluated without gradient tracking.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array of shape ``(hidden_dim,)``: the average of the
        last-layer token vectors.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state
    per_token = last_hidden.squeeze(0)      # (seq_len, hidden_dim)
    return per_token.mean(dim=0).numpy()    # (hidden_dim,)

print("⚡ Generating BioGPT embeddings with frequency...")

# Embed every distinct symptom text only once: rows that repeat a text reuse
# the cached vector instead of re-running the BioGPT forward pass.
symptom_texts = [str(symptom) for symptom in df['symptom']]
embedding_cache = {}
for text in symptom_texts:
    if text not in embedding_cache:
        embedding_cache[text] = get_embedding(text)

# Missing frequencies are imputed with 0.0.
frequencies = [0.0 if pd.isna(freq) else float(freq) for freq in df['frequency_num']]

# One feature row per record: pooled embedding followed by the frequency.
embeddings = [
    np.append(embedding_cache[text], freq)
    for text, freq in zip(symptom_texts, frequencies)
]

embeddings = np.array(embeddings)

# 5️⃣ Train-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    df['disorder_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['disorder_encoded']
)

# 6️⃣ Standardize features (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7️⃣ Train SVM
print("⚡ Training SVM (RBF kernel)...")
svm = SVC(kernel='rbf', C=10, gamma='scale', random_state=42)
svm.fit(X_train_scaled, y_train)

# 8️⃣ Evaluate
y_pred = svm.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Final Accuracy (BioGPT + SVM): {accuracy:.4f}")

print("\n✅ Classification Report:")
# Pass the full label set explicitly: without `labels`, the report raises a
# ValueError when a class is absent from both y_test and y_pred, because the
# number of observed labels no longer matches `target_names`.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=[str(name) for name in le.classes_],
))
