In [1]:
import pandas as pd
import torch
import numpy as np
import re

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
df = pd.read_csv('data/faers_adalimumab_2020-2024_ungrouped_cleaned_2.csv', low_memory=False)

In [3]:
data = df[['pt', 'SOC']].copy()
data = data.drop_duplicates(subset=['pt', 'SOC'])

In [4]:
def clean_text(text):
    text = str(text).lower()                    # Lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)     # Remove punctuation/special chars
    text = re.sub(r'\s+', ' ', text).strip()    # Normalize whitespace
    return text

data['pt_cleaned'] = data['pt'].apply(clean_text)

In [5]:
# Split into bigtrain (with labels) and test (missing labels)
data_bigtrain = data[data['SOC'].notna()].copy()
data_test = data[data['SOC'].isna()].copy()
data_test['missing_SOC'] = True

In [6]:
len(data_bigtrain), len(data_test)

(8153, 1132)

In [7]:
# Split the data into training and test sets stratified by pt
train_df, val_df = train_test_split(data_bigtrain, test_size=0.2, stratify=data_bigtrain['SOC'], random_state=42)

In [8]:
# Load BioBERT
tokenizer = AutoTokenizer.from_pretrained('biobert-finetuned')
model = AutoModel.from_pretrained('biobert-finetuned')
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [9]:
# Helper to get embedding
def get_embedding(text):
    cleaned_text = clean_text(text)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[0, 0].numpy()

In [10]:
# Generate embeddings
tqdm.pandas(desc='Embedding train')
train_df['embedding'] = train_df['pt'].progress_apply(get_embedding)

tqdm.pandas(desc='Embedding val')
val_df['embedding'] = val_df['pt'].progress_apply(get_embedding)

tqdm.pandas(desc='Embedding test')
data_test['embedding'] = data_test['pt'].progress_apply(get_embedding)

Embedding train: 100%|██████████| 6522/6522 [04:21<00:00, 24.95it/s]
Embedding val: 100%|██████████| 1631/1631 [01:05<00:00, 25.01it/s]
Embedding test: 100%|██████████| 1132/1132 [00:43<00:00, 25.94it/s]


In [11]:
le = LabelEncoder()

# Train k-NN on train set
X_train = np.stack(train_df['embedding'].values)
y_train = le.fit_transform(train_df['SOC'].values)

X_val = np.stack(val_df['embedding'].values)
y_val = le.transform(val_df['SOC'].values)

### Logistic Regression

In [13]:
lr_params = {'C': [0.01, 0.1, 1, 10]}

lr = LogisticRegression(random_state=42, max_iter=1000)
lr_cv = GridSearchCV(lr, lr_params, cv=5, verbose=1, n_jobs=-1)
lr_cv.fit(X_train, y_train)

lr_val_pred = lr_cv.predict(X_val)
lr_val_accuracy = accuracy_score(y_val, lr_val_pred)

print('Best parameters for Logistic Regression:', lr_cv.best_params_)
print('Best score for Logistic Regression:', lr_val_accuracy)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters for Logistic Regression: {'C': 0.1}
Best score for Logistic Regression: 0.9301042305334151


### SVM

In [14]:
svm_params = {'C': [0.01, 0.1, 1, 10],
             'kernel': ['linear', 'rbf', 'poly']}

svm = SVC(probability=True, random_state=42)
svm_cv = GridSearchCV(svm, svm_params, cv=5, verbose=1, n_jobs=-1)
svm_cv.fit(X_train, y_train)

svm_val_pred = svm_cv.predict(X_val)
svm_val_accuracy = accuracy_score(y_val, svm_val_pred)

print('Best parameters for SVM:', svm_cv.best_params_)
print('Best score for SVM:', svm_val_accuracy)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for SVM: {'C': 10, 'kernel': 'rbf'}
Best score for SVM: 0.9294911097486205


### KNN

In [15]:
knn_params = {'n_neighbors': [5, 10, 27],
              'weights': ['uniform', 'distance', 'kernel']}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, knn_params, cv=5, verbose=1, n_jobs=-1)
knn_cv.fit(X_train, y_train)

knn_val_pred = knn_cv.predict(X_val)
knn_val_accuracy = accuracy_score(y_val, knn_val_pred)

print('Best parameters for KNN:', knn_cv.best_params_)
print('Best score for KNN:', knn_val_accuracy)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters for KNN: {'n_neighbors': 5, 'weights': 'distance'}
Best score for KNN: 0.9288779889638259


### Random Forest

In [16]:
rf_params = {'n_estimators': [100, 200, 500],
             'max_depth': [None, 10, 20],
             'min_samples_leaf': [1, 2, 4]}

rf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(rf, rf_params, cv=5, verbose=1, n_jobs=-1)
rf_cv.fit(X_train, y_train)

rf_val_pred = rf_cv.predict(X_val)
rf_val_accuracy = accuracy_score(y_val, rf_val_pred)

print('Best parameters for Random Forest:', rf_cv.best_params_)
print('Best score for Random Forest:', rf_val_accuracy)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 500}
Best score for Random Forest: 0.9276517473942366


In [17]:
# Create a dictionary of best parameters
best_params = {
    'Model': ['LR', 'SVM', 'KNN', 'RF'],
    'Best Parameters': [
        lr_cv.best_params_,
        svm_cv.best_params_,
        knn_cv.best_params_,
        rf_cv.best_params_
    ],
    'Best Accuracy': [
        lr_val_accuracy,
        svm_val_accuracy,
        knn_val_accuracy,
        rf_val_accuracy
    ]
}

# Turn into a DataFrame
best_params_df = pd.DataFrame(best_params)

# Save best parameters
best_params_df.to_csv('data/ML_BioBERT_finetuned_best_model_parameters.csv', index=False)

In [18]:
# Embedding for test set
test_embeddings = np.stack(data_test['embedding'].values)

# Logistic Regression prediction
lr_test_pred = lr_cv.predict(test_embeddings)
lr_test_pred_labels = le.inverse_transform(lr_test_pred)

# SVM prediction
svm_test_pred = svm_cv.predict(test_embeddings)
svm_test_pred_labels = le.inverse_transform(svm_test_pred)

# KNN prediction
knn_test_pred = knn_cv.predict(test_embeddings)
knn_test_pred_labels = le.inverse_transform(knn_test_pred)

# Random Forest prediction
rf_test_pred = rf_cv.predict(test_embeddings)
rf_test_pred_labels = le.inverse_transform(rf_test_pred)

# Create a DataFrame with predictions for each model
test_predictions = pd.DataFrame({
    'pt': data_test['pt'],
    'SOC': data_test['SOC'],
    'SOC_lr_bb': lr_test_pred_labels,
    'SOC_svm_bb': svm_test_pred_labels,
    'SOC_knn_bb': knn_test_pred_labels,
    'SOC_rf_bb': rf_test_pred_labels
})

In [19]:
# Save predictions to CSV
test_predictions.to_csv('data/4.3 BioBERT_finetuned_ML_predictions.csv', index=False)