In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import re

from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the raw report table from the project's data directory.
# low_memory=False makes pandas parse the file in one pass instead of chunk-wise,
# which avoids mixed-dtype columns.
df = pd.read_csv(
    filepath_or_buffer='data/faers_adalimumab_2020-2024_ungrouped_cleaned_2.csv',
    low_memory=False,
)

In [None]:
# Work on an independent two-column frame: the term text (`pt`) and its label (`SOC`).
# NOTE(review): presumably MedDRA preferred term / System Organ Class -- confirm.
data = df[['pt', 'SOC']].copy()

# A missing `pt` cannot be tokenized later (the tokenizer only accepts strings),
# so drop those rows before keeping the first occurrence of every distinct term.
data = data.dropna(subset=['pt']).drop_duplicates(subset=['pt'])

# Split the data into training and test sets:
# labelled terms are used for training/validation, unlabelled terms are the
# ones whose SOC will be predicted.
data_bigtrain = data[data['SOC'].notna()].copy()
data_test = data[data['SOC'].isna()].copy()

# Cell output: (number of labelled terms, number of unlabelled terms).
len(data_bigtrain), len(data_test)

In [None]:
# Hold out 20% of the labelled terms for validation, keeping the SOC class
# proportions the same in both parts (fixed seed for reproducibility).
# NOTE(review): stratified splitting needs at least two rows per SOC class;
# scikit-learn raises ValueError otherwise.
# Each part is copied so that adding columns to it later operates on an
# independent frame and does not emit pandas' SettingWithCopyWarning.
train_df, val_df = (
    split.copy()
    for split in train_test_split(
        data_bigtrain,
        test_size=0.2,
        stratify=data_bigtrain['SOC'],
        random_state=42,
    )
)

In [None]:
# Tokenizer and encoder are loaded from the same pretrained BioBERT checkpoint.
biobert_checkpoint = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(biobert_checkpoint)
model = AutoModel.from_pretrained(biobert_checkpoint)

# Put the encoder in evaluation mode; being the last expression of the cell,
# the returned module is also shown as the cell output.
model.eval()



In [None]:
def get_embedding(text):
    """Encode one piece of text into a fixed-size vector.

    The text is tokenized (truncated to at most 128 tokens) and passed through
    the module-level ``model`` with gradient tracking disabled.

    Args:
        text: The text to embed; forwarded unchanged to the module-level
            ``tokenizer``.

    Returns:
        A NumPy array holding the final hidden state of the first token of the
        first (only) sequence in the batch -- the ``[CLS]`` position for
        BERT-style tokenizers.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    # Inference only: no autograd graph is needed, which also lets .numpy() succeed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    first_token_vector = hidden_states[0, 0]
    return first_token_vector.numpy()

# Register tqdm's pandas integration so that Series gain `.progress_apply`.
tqdm.pandas()

# Embed every term of each split, one row at a time, storing the resulting
# vector in a new `embedding` column of that frame.
for df_ in (train_df, val_df, data_test):
    df_['embedding'] = df_['pt'].progress_apply(get_embedding)

In [None]:
# Turn each split's column of per-row embedding vectors into one 2-D feature matrix.
X_train, X_val, X_test = (
    np.stack(frame['embedding'].values)
    for frame in (train_df, val_df, data_test)
)

# Map SOC labels to integer codes. The encoder is fitted on the training
# labels only and then reused, unchanged, for the validation labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['SOC'])
y_val = label_encoder.transform(val_df['SOC'])

# Class names in encoder order, i.e. class_names[i] is the label for code i.
class_names = list(label_encoder.classes_)

In [None]:
def _scaled(step_name, estimator):
    """Return a pipeline that standardizes the features, then applies ``estimator``."""
    return Pipeline([
        ("scaler", StandardScaler()),
        (step_name, estimator),
    ])


# RBF-kernel SVM with probability estimates enabled and balanced class weights.
svm = _scaled(
    "svm",
    SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, class_weight='balanced'),
)

# Random forest on the raw (unscaled) features.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    class_weight='balanced',
    random_state=42,
)

# Three-hidden-layer perceptron; early stopping holds out 10% of the data it is fitted on.
mlp = _scaled(
    "mlp",
    MLPClassifier(
        hidden_layer_sizes=(256, 128, 64),
        alpha=1e-4,
        learning_rate='adaptive',
        max_iter=500,
        early_stopping=True,
        validation_fraction=0.1,
        random_state=42,
    ),
)

# k-nearest-neighbours classifier with k = 27.
knn = _scaled("knn", KNeighborsClassifier(n_neighbors=27))

## Voting Ensemble

In [None]:
# Soft-voting ensemble: averages the base models' predicted class
# probabilities, with the random forest and MLP weighted twice as much as the
# SVM and k-NN. The base estimators are fitted in parallel on all cores.
voting_ensemble = VotingClassifier(
    estimators=[("svm", svm), ("rf", rf), ("mlp", mlp), ("knn", knn)],
    voting='soft',
    weights=[1, 2, 2, 1],
    n_jobs=-1
)

voting_ensemble.fit(X_train, y_train)
y_val_pred_voting = voting_ensemble.predict(X_val)
print("📊 Voting Ensemble Results:")
# Pass the full set of encoded labels explicitly: without `labels`,
# classification_report raises ValueError when a class is absent from both the
# validation labels and the predictions, because `target_names` would no
# longer match the inferred label set. zero_division=0 reports 0.0 for such
# classes without emitting warnings.
print(classification_report(
    y_val,
    y_val_pred_voting,
    labels=np.arange(len(class_names)),
    target_names=class_names,
    zero_division=0,
))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Stacking Ensemble

In [None]:
# Stacking ensemble: a class-balanced logistic regression is trained on the
# base models' outputs; passthrough=True additionally feeds it the original
# feature vectors. The base estimators are fitted in parallel on all cores.
# NOTE(review): the passthrough features reach the logistic regression
# unscaled -- consider whether the meta-learner needs its own scaler.
stacking_ensemble = StackingClassifier(
    estimators=[("svm", svm), ("rf", rf), ("mlp", mlp), ("knn", knn)],
    final_estimator=LogisticRegression(max_iter=500, class_weight='balanced'),
    passthrough=True,
    n_jobs=-1
)

stacking_ensemble.fit(X_train, y_train)
y_val_pred_stack = stacking_ensemble.predict(X_val)
print("📊 Stacking Ensemble Results:")
# Pass the full set of encoded labels explicitly: without `labels`,
# classification_report raises ValueError when a class is absent from both the
# validation labels and the predictions, because `target_names` would no
# longer match the inferred label set. zero_division=0 reports 0.0 for such
# classes without emitting warnings.
print(classification_report(
    y_val,
    y_val_pred_stack,
    labels=np.arange(len(class_names)),
    target_names=class_names,
    zero_division=0,
))

In [None]:
# Choose which fitted ensemble labels the unlabelled terms.
final_model = stacking_ensemble  # or voting_ensemble

# Predict integer class codes, then translate them back to the original SOC
# names and write them into the (previously empty) SOC column of data_test.
y_test_pred = final_model.predict(X_test)
predicted_soc = label_encoder.inverse_transform(y_test_pred)
data_test['SOC'] = predicted_soc