In [1]:
import pandas as pd
import torch
import numpy as np
import re

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load and prepare data
df = pd.read_csv('data/faers_adalimumab_2020-2024_ungrouped_cleaned_2.csv', low_memory=False)

In [3]:
data = df[['pt', 'SOC']].copy()
data = data.drop_duplicates(subset=['pt', 'SOC'])

In [4]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove punctuation/special chars
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

data['pt_cleaned'] = data['pt'].apply(clean_text)

In [5]:
# Split into bigtrain (with labels) and test (missing labels)
data_bigtrain = data[data['SOC'].notna()].copy()
data_test = data[data['SOC'].isna()].copy()
data_test['missing_SOC'] = True

In [6]:
len(data_bigtrain), len(data_test)

(7880, 1083)

In [7]:
# Train/Validation split stratified by pt
train_df, val_df = train_test_split(data_bigtrain, test_size=0.2, stratify=data_bigtrain['SOC'], random_state=42)

In [8]:
# Load BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [9]:
# Helper to get embedding
def get_embedding(text):
    cleaned_text = clean_text(text)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[0, 0].numpy()

In [11]:
# Generate embeddings
tqdm.pandas(desc="Embedding train")
train_df['embedding'] = train_df['pt'].progress_apply(get_embedding)

tqdm.pandas(desc="Embedding val")
val_df['embedding'] = val_df['pt'].progress_apply(get_embedding)

tqdm.pandas(desc="Embedding test")
data_test['embedding'] = data_test['pt'].progress_apply(get_embedding)

Embedding train: 100%|██████████| 6304/6304 [04:26<00:00, 23.64it/s]
Embedding val: 100%|██████████| 1576/1576 [01:05<00:00, 23.93it/s]
Embedding test: 100%|██████████| 1083/1083 [00:43<00:00, 24.80it/s]


In [12]:
# Save embeddings
train_df.to_pickle('train_embeddings.pkl')
val_df.to_pickle('val_embeddings.pkl')
data_test.to_pickle('test_embeddings.pkl')

In [10]:
# Load embeddings
train_df = pd.read_pickle('train_embeddings.pkl')
val_df = pd.read_pickle('val_embeddings.pkl')

In [11]:
le = LabelEncoder()

# Train k-NN on train set
X_train = np.stack(train_df['embedding'].values)
# y_train = train_df['SOC'].values
y_train = le.fit_transform(train_df['SOC'].values)

X_val = np.stack(val_df['embedding'].values)
# y_val = val_df['SOC'].values
y_val = le.transform(val_df['SOC'].values)

### Logistic Regression

In [12]:
lr_params = [
    {'C': [0.01, 0.1, 1, 10], 'penalty': ['l2', 'none'], 'solver': ['lbfgs', 'sag']},
    {'C': [0.01, 0.1, 1, 10], 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]}
]

lr = LogisticRegression(random_state=42, max_iter=1000)
lr_cv = GridSearchCV(lr, lr_params, cv=5, verbose=1, n_jobs=-1)
# lr_cv = GridSearchCV(lr, lr_params, cv=5, verbose=1)
lr_cv.fit(X_train, y_train)

lr_val_pred = lr_cv.predict(X_val)
lr_val_accuracy = accuracy_score(y_val, lr_val_pred)

print('Best parameters for Logistic Regression:', lr_cv.best_params_)
print('Best score for Logistic Regression:', lr_val_accuracy)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nguye\anaconda3\envs\NLP\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nguye\anaconda3\envs\NLP\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\nguye\anaconda3\envs\NLP\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\nguye\anaconda3\envs\NLP\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter_co

Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score for Logistic Regression: 0.8343908629441624


### SVM

In [13]:
svm_params = {'C': [0.01, 0.1, 1, 10],
             'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 
             'gamma': ['scale', 'auto']}

svm = SVC(probability=True, random_state=42)
# svm_cv = GridSearchCV(svm, svm_params, cv=5, verbose=1, n_jobs=-1)
svm_cv = GridSearchCV(svm, svm_params, cv=5, verbose=1)
svm_cv.fit(X_train, y_train)

svm_val_pred = svm_cv.predict(X_val)
svm_val_accuracy = accuracy_score(y_val, svm_val_pred)

print('Best parameters for SVM:', svm_cv.best_params_)
print('Best score for SVM:', svm_val_accuracy)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
Best score for SVM: 0.8369289340101523


### KNN

In [13]:
knn_params = {'n_neighbors': [5, 10, 27],
              'weights': ['uniform', 'distance', 'kernel'], 
              'metric': ['euclidean', 'manhattan']}

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, knn_params, cv=5, verbose=2, n_jobs=-1)
# knn_cv = GridSearchCV(knn, knn_params, cv=5, verbose=1)
knn_cv.fit(X_train, y_train)

knn_val_pred = knn_cv.predict(X_val)
knn_val_accuracy = accuracy_score(y_val, knn_val_pred)

print('Best parameters for KNN:', knn_cv.best_params_)
print('Best score for KNN:', knn_val_accuracy)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
21 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nguye\anaconda3\envs\NLP\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nguye\anaconda3\envs\NLP\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\nguye\anaconda3\envs\NLP\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\nguye\anaconda3\envs\NLP\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter_co

Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Best score for KNN: 0.7709390862944162


### Random Forest

In [14]:
rf_params = {'n_estimators': [100, 200, 500],
             'max_depth': [None, 10, 20, 30],
             'min_samples_leaf': [1, 2, 4], 
             'min_samples_split': [2, 5, 10]}

rf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(rf, rf_params, cv=5, verbose=2, n_jobs=-1)
# rf_cv = GridSearchCV(rf, rf_params, cv=5, verbose=1)
rf_cv.fit(X_train, y_train)

rf_val_pred = rf_cv.predict(X_val)
rf_val_accuracy = accuracy_score(y_val, rf_val_pred)

print('Best parameters for Random Forest:', rf_cv.best_params_)
print('Best score for Random Forest:', rf_val_accuracy)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for Random Forest: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Best score for Random Forest: 0.7576142131979695


### XGBoost

In [12]:
xgb_params = {'n_estimators': [100, 200, 500],
              'max_depth': [3, 5, 7], 
              'learning_rate': [0.01, 0.1, 0.2]}

xgb = XGBClassifier(random_state=42)
# xgb_cv = GridSearchCV(xgb, xgb_params, cv=5, verbose=1, n_jobs=-1)
xgb_cv = GridSearchCV(xgb, xgb_params, cv=5, verbose=1)
xgb_cv.fit(X_train, y_train)

xgb_val_pred = xgb_cv.predict(X_val)
xgb_val_accuracy = accuracy_score(y_val, xgb_val_pred)

print('Best parameters for XGBoost:', xgb_cv.best_params_)
print('Best score for XGBoost:', xgb_val_accuracy)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Best score for XGBoost: 0.7887055837563451
