In [None]:
import pandas as pd
import numpy as np
import torch
import re
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from scipy.stats import mode

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cleaned report table from disk.
# low_memory=False asks pandas to parse the file in a single pass rather than
# in internal chunks, which avoids chunk-by-chunk dtype guessing.
df = pd.read_csv(
    'data/faers_adalimumab_2020-2024_ungrouped_cleaned_2.csv',
    low_memory=False,
)

In [None]:
# Reduce the table to one row per term ('pt') together with its 'SOC' label.
# A plain drop_duplicates keeps the *first* row of each term, so a term whose
# first occurrence has a missing SOC would lose a label that is present in a
# later row. Stable-sorting on "SOC is missing" moves labelled rows ahead of
# unlabelled ones while keeping the original relative order inside each group,
# so the row kept for every term is its first labelled occurrence, if any.
data = (
    df[['pt', 'SOC']]
    .sort_values('SOC', key=lambda soc: soc.isna(), kind='mergesort')
    .drop_duplicates(subset=['pt'])
)

# Terms with a known SOC are the labelled pool (later split into train and
# validation); terms without one are the set whose SOC has to be predicted.
data_bigtrain = data[data['SOC'].notna()].copy()
data_test = data[data['SOC'].isna()].copy()
data_test['missing_SOC'] = True  # marks rows whose SOC is absent in the source data

len(data_bigtrain), len(data_test)

(7880, 1083)

In [4]:
# Hold out 20% of the labelled terms for validation. The split is stratified on
# SOC so both parts keep the same class proportions, and seeded so it can be
# reproduced.
train_df, val_df = train_test_split(
    data_bigtrain,
    test_size=0.2,
    stratify=data_bigtrain['SOC'],
    random_state=42,
)

In [5]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
biobert_checkpoint = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(biobert_checkpoint)
model = AutoModel.from_pretrained(biobert_checkpoint)
# Switch to evaluation mode (the encoder is only used for inference here).
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [6]:
def get_embedding(text):
    """Embed a piece of text with the notebook-level `tokenizer` and `model`.

    The text is tokenized (truncated to at most 128 tokens), passed through
    the encoder without gradient tracking, and the hidden state of the first
    token of the first (only) sequence is returned.

    Parameters
    ----------
    text : str
        The text to embed.

    Returns
    -------
    numpy.ndarray
        1-D array holding the last-layer hidden state at position 0.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    # Keep inputs on the same device as the encoder so the function keeps
    # working if `model` is moved to an accelerator (a no-op on CPU).
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    # .cpu() is required before .numpy() when the tensor is not on the CPU.
    return outputs.last_hidden_state[0, 0].cpu().numpy()

# Register tqdm's `progress_apply` on pandas objects, then add an 'embedding'
# column (one vector per term) to each of the three frames.
tqdm.pandas()
for frame in (train_df, val_df, data_test):
    frame['embedding'] = frame['pt'].progress_apply(get_embedding)

100%|██████████| 6304/6304 [04:43<00:00, 22.22it/s]
100%|██████████| 1576/1576 [01:09<00:00, 22.55it/s]
100%|██████████| 1083/1083 [00:46<00:00, 23.08it/s]


In [None]:
# Feature matrices: stack the per-row embedding vectors into one 2-D array
# per split (one row per term).
X_train, X_val, X_test = (
    np.stack(frame['embedding'].values)
    for frame in (train_df, val_df, data_test)
)

# Integer targets: the encoder is fitted on the training labels only and then
# reused, unchanged, for the validation labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['SOC'])
y_val = label_encoder.transform(val_df['SOC'])

In [8]:
# Build the semi-supervised training set: labelled training rows followed by
# the unlabelled rows, whose target is the placeholder -1 (the value
# SelfTrainingClassifier treats as "unlabelled").
n_unlabelled = len(X_test)
X_all = np.vstack((X_train, X_test))
y_semi = np.concatenate((y_train, [-1] * n_unlabelled))

In [9]:
# Base estimators. The SVM, MLP and k-NN are wrapped in a pipeline with a
# StandardScaler so scaling is learned as part of each fit; the random forest
# is used on the raw features.

# RBF-kernel SVM. probability=True enables predict_proba, which scikit-learn
# calibrates with an internal, shuffled cross-validation; random_state pins
# that shuffling so repeated runs give the same probabilities.
svm = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(kernel='rbf', C=1.0, gamma='scale', probability=True,
                class_weight='balanced', random_state=42))
])

# Random forest with class re-weighting and a fixed seed.
rf = RandomForestClassifier(n_estimators=200, max_depth=20, class_weight='balanced', random_state=42)

# Three-hidden-layer MLP; early stopping holds out 10% of the data it is
# fitted on to decide when to stop.
mlp = Pipeline([
    ("scaler", StandardScaler()),
    ("mlp", MLPClassifier(hidden_layer_sizes=(256, 128, 64), alpha=1e-4,
                          learning_rate='adaptive', max_iter=500,
                          early_stopping=True, validation_fraction=0.1, random_state=42))
])

# k-nearest neighbours on standardized features.
knn = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=27))
])

In [None]:
# Wrap every base estimator in a self-training classifier and fit it on the
# labelled + unlabelled data. With criterion='k_best', each round adds the 500
# most confidently predicted unlabelled samples to the training set.
self_train_models = {}
for name, base_model in (("svm", svm), ("rf", rf), ("mlp", mlp), ("knn", knn)):
    print(f"Training self-training model: {name}")
    clf = SelfTrainingClassifier(base_model, criterion='k_best', k_best=500)
    self_train_models[name] = clf.fit(X_all, y_semi)


Training self-training model: mlp

Training self-training model: svm

Training self-training model: rf

Training self-training model: knn


In [None]:
# Ask every self-trained classifier for its label on each unlabelled row.
pseudo_labels = {
    name: fitted.predict(X_test)
    for name, fitted in self_train_models.items()
}

# One column per classifier, one row per unlabelled sample; the pseudo-label is
# the most frequent vote in each row. Ties are resolved by scipy's `mode`.
pseudo_label_array = np.column_stack([votes for votes in pseudo_labels.values()])
final_pseudo_labels = mode(pseudo_label_array, axis=1).mode.flatten()

In [None]:
# Final training set: the labelled training rows plus the previously
# unlabelled rows, the latter now carrying their majority-vote pseudo-labels.
X_combined = np.vstack((X_train, X_test))
y_combined = np.concatenate((y_train, final_pseudo_labels))

## Voting Ensemble

In [None]:
# Soft-voting ensemble over the four base estimators. VotingClassifier fits its
# own clones of the estimators it is given, so the unfitted definitions
# (`svm`, `rf`, `mlp`, `knn`) are passed directly. Reading them back through
# `SelfTrainingClassifier.base_estimator` relies on a parameter that
# scikit-learn deprecated in 1.6 (renamed to `estimator`) and no longer holds
# the estimator there.
voting_ensemble = VotingClassifier(
    estimators=[
        ("svm", svm),
        ("rf", rf),
        ("mlp", mlp),
        ("knn", knn)
    ],
    voting='soft',          # average predicted class probabilities
    weights=[2, 1, 2, 1],   # SVM and MLP count twice as much as RF and k-NN
    n_jobs=-1
)

# Train on the labelled rows plus the pseudo-labelled rows.
voting_ensemble.fit(X_combined, y_combined)

In [None]:
# Score the ensemble on the held-out validation split.
y_val_pred = voting_ensemble.predict(X_val)

# Report only the classes that actually occur in the true or predicted labels,
# and show them under their original SOC names instead of integer codes.
all_labels = np.unique(np.concatenate((y_val, y_val_pred)))
all_class_names = label_encoder.inverse_transform(all_labels)

print("Voting Ensemble Results:")
print(
    classification_report(
        y_val,
        y_val_pred,
        labels=all_labels,
        target_names=all_class_names,
    )
)

In [None]:
# Final predictions for the rows whose SOC was missing: predict the integer
# class, map it back to the SOC name, and store it in the 'SOC' column.
y_test_final = voting_ensemble.predict(X_test)
predicted_soc_names = label_encoder.inverse_transform(y_test_final)
data_test['SOC'] = predicted_soc_names