While developing the code/debugging, connect to **CPU**, so that you don't use up all of your GPU resources. After making sure that the code is running, you can connect to GPU and run your full experiments there. The GPU is only used for the embeddings extraction (to speed up the code), but you can also not use it if you don't want to.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import torch

2024-10-30 05:22:58.064346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730262178.076214  954268 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730262178.079887  954268 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-30 05:22:58.091858: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
# First, load the provided files to Colab in the Files section (left toolbar)
train_set = pd.read_csv("data/train_set.csv")
test_set = pd.read_csv("data/test_set.csv")

In [4]:
X_train_tabular = train_set.drop(columns=["target", "text"])
X_test_tabular = test_set.drop(columns=["target", "text"])

y_train = train_set["target"]
y_test = test_set["target"]

In [12]:
### Step 1: XGBoost on Tabular Data Only ###

# XGBoost on tabular data
xgb_tabular = xgb.XGBClassifier(objective='multi:softmax', num_class=4, eta=0.1, max_depth = 5, n_estimators=100)

# Fit the model
# # TODO
# xgb_tabular.fit(X_train_tabular, y_train)
gridsearchxgb = GridSearchCV(xgb_tabular, {'max_depth': [3, 5, 7], 'n_estimators': [50, 100, 200]}, verbose=1)
gridsearchxgb.fit(X_train_tabular, y_train)
print(gridsearchxgb.best_params_)
xgb_tabular = gridsearchxgb.best_estimator_
xgb_tabular.fit(X_train_tabular, y_train)

# Evaluate on test set
# TODO
y_pred = xgb_tabular.predict(X_test_tabular)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Fitting 5 folds for each of 9 candidates, totalling 45 fits
{'max_depth': 3, 'n_estimators': 100}
Accuracy: 0.562


In [6]:
### Step 2: BERT Embeddings for Text Data ###

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
# Function to extract BERT embeddings
def get_bert_embeddings(texts, tokenizer, bert_model):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Move inputs to GPU
    inputs = {key: val.to(device) for key, val in inputs.items()}

    bert_model.eval()
    outputs = bert_model(inputs['input_ids'])
    # Use the mean of the last hidden state as the embeddings
    return np.mean(outputs.last_hidden_state.detach().cpu().numpy(), axis=1)

In [8]:
def create_embeddings(df):
    """
    Parameters::
        df: DataFrame with a column named "text"

    Returns::
        emb_df: DataFrame with 768 columns; each row contains the embeddings for the text in the corresponding row of df.
    """
    embeddings = []

    # Loop through the rows of the dataframe. Pass the text through the bert model and get embeddings using the get_bert_embeddings function
    for i in tqdm(range(0, df.shape[0])):
        text = df.iloc[i]["text"]
        full_embedding = get_bert_embeddings(texts =  text,tokenizer= tokenizer, bert_model=bert_model) # TODO
        embeddings.append(full_embedding.flatten())
    emb_df =  pd.DataFrame(np.array(embeddings), columns=[f"emb_{i}" for i in range(768)])

    emb_df = emb_df.set_index(df.index)

    return emb_df

In [13]:
# Apply BERT embeddings on train and test text data
X_train_text_bert = create_embeddings(train_set)
X_test_text_bert = create_embeddings(test_set)

### Step 3: XGBoost on Notes Data Only ###

# XGBoost on text data
xgb_text = xgb.XGBClassifier(objective='multi:softmax', num_class=4, eta=0.1, max_depth = 5, n_estimators=100)

# Fit the model
gridsearchxgb = GridSearchCV(xgb_text, {'max_depth': [3, 5, 7], 'n_estimators': [50, 100, 200]}, verbose=1)
gridsearchxgb.fit(X_train_text_bert, y_train)
print(gridsearchxgb.best_params_)
xgb_text = gridsearchxgb.best_estimator_
xgb_text.fit(X_train_text_bert, y_train)

# Evaluate on test set
y_pred_bert = xgb_text.predict(X_test_text_bert)

accuracy_bert = accuracy_score(y_test, y_pred_bert)

print(f"Accuracy with BERT embeddings: {accuracy_bert}")

100%|██████████| 5000/5000 [00:39<00:00, 127.62it/s]
100%|██████████| 1000/1000 [00:07<00:00, 128.24it/s]


Fitting 5 folds for each of 9 candidates, totalling 45 fits
{'max_depth': 3, 'n_estimators': 50}
Accuracy with BERT embeddings: 0.561


In [14]:
### Step 4: Combined Tabular and BERT Embeddings ###

# Combine tabular features and BERT embeddings
X_train_combined = pd.concat([X_train_tabular, X_train_text_bert], axis=1)
X_test_combined = pd.concat([X_test_tabular, X_test_text_bert], axis=1)

# XGBoost on combined data
xgb_combined = xgb.XGBClassifier(objective='multi:softmax', num_class=4, eta=0.1, max_depth = 5, n_estimators=100)

# Fit the model
gridsearchxgb = GridSearchCV(xgb_combined, {'max_depth': [3, 5, 7], 'n_estimators': [50, 100, 200]}, verbose=1)
gridsearchxgb.fit(X_train_combined, y_train)
print(gridsearchxgb.best_params_)
xgb_combined = gridsearchxgb.best_estimator_
xgb_combined.fit(X_train_combined, y_train)

# Evaluate on test set
accuracy_combined = accuracy_score(y_test, xgb_combined.predict(X_test_combined))

print(f"Accuracy with combined data: {accuracy_combined}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits
{'max_depth': 3, 'n_estimators': 50}
Accuracy with combined data: 0.589


By performing a grid search, the performance of the xgboost model improves a bit compared to single modality models. That means that the two different modalities capture different information of about the data, that will be used by xgboost to have better predictions.