In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import ast

from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the cleaned FAERS export into `df`.
# low_memory=False has pandas read the file in one pass rather than in
# chunks, so each column's dtype is inferred from all of its values.
faers_csv_path = 'data/faers_adalimumab_2020-2024_ungrouped_cleaned_2.csv'
df = pd.read_csv(faers_csv_path, low_memory=False)

In [3]:
# Keep one row per distinct `pt` value together with its `SOC` label.
# NOTE(review): drop_duplicates keeps the first occurrence, so if the same
# `pt` ever appears both with and without an SOC, which row survives depends
# on row order in `df` - TODO confirm this cannot happen in the input file.
data = df.loc[:, ['pt', 'SOC']].drop_duplicates(subset=['pt'])

# Rows that already carry an SOC form the labelled pool (train + validation);
# rows with a missing SOC are the ones whose SOC has to be predicted.
has_soc = data['SOC'].notna()
data_bigtrain = data.loc[has_soc].copy()
data_test = data.loc[~has_soc].copy()

# Displayed as the cell result: (number labelled, number unlabelled).
(len(data_bigtrain), len(data_test))

(7880, 1083)

In [4]:
# Hold out 20% of the labelled rows for validation. Stratifying on SOC keeps
# the class proportions the same in both parts; the fixed seed makes the
# split repeatable.
train_df, val_df = train_test_split(
    data_bigtrain,
    test_size=0.2,
    random_state=42,
    stratify=data_bigtrain['SOC'],
)

In [5]:
# Tokenizer and encoder are loaded from the same checkpoint so their
# vocabularies match.
biobert_checkpoint = 'biobert-finetuned'
tokenizer = AutoTokenizer.from_pretrained(biobert_checkpoint)
model = AutoModel.from_pretrained(biobert_checkpoint)

# Inference mode (dropout disabled). eval() returns the module itself, so as
# the last expression of the cell it also displays the architecture.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [6]:
def get_embedding(text):
    """Encode one string and return a single vector as a NumPy array.

    The text is tokenized (truncated to at most 128 tokens) with the
    module-level ``tokenizer`` and passed through the module-level ``model``
    with gradient tracking disabled.

    Args:
        text: The string to embed.

    Returns:
        numpy.ndarray: ``last_hidden_state[0, 0]`` - the final-layer hidden
        state at the first token position of the (only) sequence in the
        batch. For BERT-style tokenizers that position is the ``[CLS]`` token.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Index 0 selects the single sequence, the second 0 its first token.
    first_token_vector = hidden_states[0, 0]
    return first_token_vector.numpy()

# Register tqdm's `progress_apply` on pandas objects, then attach an
# 'embedding' column (one vector per `pt` string) to each of the three frames.
tqdm.pandas()
for frame in (train_df, val_df, data_test):
    frame['embedding'] = frame['pt'].progress_apply(get_embedding)

100%|██████████| 6304/6304 [03:49<00:00, 27.43it/s]
100%|██████████| 1576/1576 [00:57<00:00, 27.57it/s]
100%|██████████| 1083/1083 [00:39<00:00, 27.19it/s]


In [7]:
# Fit the label encoder on the training SOCs only, then reuse the same
# mapping for the validation labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['SOC'])
y_val = label_encoder.transform(val_df['SOC'])
# Class names in encoder order, used later to label the report rows.
class_names = list(label_encoder.classes_)

# Turn each frame's column of per-row vectors into one 2-D feature matrix
# (one row per sample).
X_train, X_val, X_test = (
    np.stack(frame['embedding'].to_numpy())
    for frame in (train_df, val_df, data_test)
)

In [None]:
# Tuned hyper-parameters saved by an earlier search: one row per model, with
# the parameter dict stored as its Python-literal string.
best_params_df = pd.read_csv('data/ML_BioBERT_finetuned_best_model_parameters.csv')

# Map model name -> parameter dict.
best_params = {}
for _, row in best_params_df.iterrows():
    model_name, raw_params = row['Model'], row['Best Parameters']
    # literal_eval parses the stored string back into a dict without
    # executing arbitrary code.
    params = ast.literal_eval(raw_params)
    best_params[model_name] = params
    # Rebound on every iteration, so after the loop this holds only the last
    # row's accuracy.
    best_accuracy = row['Best Accuracy']

# Define the base models with their tuned hyper-parameters.
# A fixed seed is passed to every estimator that accepts one so results are
# repeatable. SVC needs probability=True because the soft-voting ensemble
# calls predict_proba on each member.
lr = LogisticRegression(**best_params['LR'], random_state=42)
svm = SVC(**best_params['SVM'], probability=True, random_state=42)
# KNeighborsClassifier is deterministic and has no `random_state` parameter;
# passing one raises TypeError, so it is built from the tuned parameters only.
knn = KNeighborsClassifier(**best_params['KNN'])
rf = RandomForestClassifier(**best_params['RF'], random_state=42)

## Voting Ensemble

In [None]:
# Soft voting: the ensemble averages the members' predicted class
# probabilities (equal weights here) and picks the most probable class.
ensemble_members = [('lr', lr), ('svm', svm), ('knn', knn), ('rf', rf)]
voting_ensemble = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    weights=[1] * len(ensemble_members),
    n_jobs=-1,  # fit the members in parallel on all available cores
).fit(X_train, y_train)

# Evaluate on the held-out validation split.
y_val_pred_voting = voting_ensemble.predict(X_val)
voting_accuracy = accuracy_score(y_val, y_val_pred_voting)
voting_report = classification_report(y_val, y_val_pred_voting, target_names=class_names)
print(
    'Voting Ensemble Results:',
    voting_report,
    f'Voting Ensemble Accuracy: {voting_accuracy:.4f}',
    sep='\n',
)

Voting Ensemble Results:
                                                                     precision    recall  f1-score   support

                               Blood and lymphatic system disorders       0.75      0.67      0.71         9
                                                  Cardiac disorders       0.90      0.97      0.93        36
                         Congenital, familial and genetic disorders       0.83      0.71      0.77        14
                                        Ear and labyrinth disorders       1.00      0.89      0.94         9
                                                Endocrine disorders       1.00      0.80      0.89        10
                                                      Eye disorders       0.90      0.95      0.92        57
                                         Gastrointestinal disorders       0.96      0.95      0.95       113
               General disorders and administration site conditions       0.91      0.84      0.88    

In [21]:
# Use the fitted voting ensemble as the final model and label the rows that
# had no SOC.
final_model = voting_ensemble
y_test_pred = final_model.predict(X_test)
# Map the integer class ids back to the original SOC strings.
predicted_soc = label_encoder.inverse_transform(y_test_pred)
data_test['SOC_ensemble_biobert'] = predicted_soc

In [None]:
# Write the predictions to CSV: one row per term with its predicted SOC,
# without the DataFrame index.
data_test.to_csv(
    'data/4.4 BioBERT_finetuned_Ensemble_prediction.csv',
    columns=['pt', 'SOC_ensemble_biobert'],
    index=False,
)