## Objective
This folder is dedicated to extracting BioBERT embeddings from the text columns and running PCA on those embeddings to reduce their dimensionality.

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
from tqdm import tqdm



In [None]:
# Load the imputed training table; its text column feeds the embedding step below.
train = pd.read_csv(filepath_or_buffer='imputed_train_r2_l1_text_corrected.csv')


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [None]:
# Load the tokenizer and the encoder weights from the same BioBERT checkpoint,
# then place the encoder on the device selected above.
biobert_tokenizer = BertTokenizer.from_pretrained(
    'dmis-lab/biobert-base-cased-v1.1'
)
biobert_model = BertModel.from_pretrained(
    'dmis-lab/biobert-base-cased-v1.1'
)
biobert_model = biobert_model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
def extract_biobert_embeddings(texts, biobert_tokenizer, biobert_model, batch_size=8, device='cuda'):
    """
    Extract one mean-pooled BioBERT embedding per input text.

    Each batch is tokenized (truncated to 512 tokens and padded to the longest
    sequence in the batch), run through the model without gradient tracking,
    and the last hidden state is averaged over the real tokens only. Padding
    positions are excluded through the attention mask, so the embedding of a
    text does not depend on which other texts happen to share its batch.

    Args:
        texts (list): List of input texts.
        biobert_tokenizer: BioBERT tokenizer; its output must contain an
            ``attention_mask`` entry.
        biobert_model: BioBERT model. It is moved to ``device`` and switched
            to eval mode in place.
        batch_size (int): Number of texts to process per batch (must be >= 1).
        device (str or torch.device): Device to use ('cuda' or 'cpu'). If CUDA
            is requested but not available, the function warns and falls back
            to the CPU instead of crashing.

    Returns:
        torch.Tensor: CPU tensor with one row per input text and one column
        per hidden dimension. For empty ``texts`` the result has zero rows and
        ``biobert_model.config.hidden_size`` columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import warnings  # local import: not part of the notebook's import cell

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Resolve the device once; the default 'cuda' must not break CPU-only runtimes.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        warnings.warn("CUDA was requested but is not available; falling back to CPU.")
        device = torch.device('cpu')

    biobert_model = biobert_model.to(device)  # Move the model to the resolved device
    biobert_model.eval()  # Make sure dropout is disabled while extracting features

    texts = list(texts)
    if not texts:
        # torch.cat() rejects an empty list, so return an explicit empty result.
        return torch.empty((0, biobert_model.config.hidden_size))

    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting BioBERT embeddings"):
        # Get the current batch
        batch_texts = texts[i:i + batch_size]

        # Tokenize the batch of texts
        encodings = biobert_tokenizer(
            batch_texts,
            truncation=True,
            padding=True,
            return_tensors='pt',
            max_length=512
        )
        encodings = {key: val.to(device) for key, val in encodings.items()}  # Move inputs to the device

        # Extract embeddings using the BioBERT model
        with torch.no_grad():
            outputs = biobert_model(**encodings)

        # Mean over real tokens only: zero out padded positions and divide by
        # the number of unmasked tokens (clamped to avoid division by zero).
        hidden = outputs.last_hidden_state
        mask = encodings['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = (hidden * mask).sum(dim=1) / token_counts
        all_embeddings.append(embeddings.cpu())  # Move embeddings back to CPU

    return torch.cat(all_embeddings, dim=0)  # Concatenate all embeddings

# Run on the device chosen earlier in the notebook rather than relying on the
# function's 'cuda' default, so this cell also works on CPU-only runtimes.
train_embeddings_biobert = extract_biobert_embeddings(
    train['text_data'].tolist(),
    biobert_tokenizer,
    biobert_model,
    device=device,
)



Extracting BioBERT embeddings: 100%|██████████| 342/342 [00:19<00:00, 17.88it/s]


In [None]:
# Wrap the embedding tensor in a DataFrame: one row per text, one column per embedding dimension.
train_embeddings_df = pd.DataFrame(data=train_embeddings_biobert.numpy())
# Test-set counterpart (currently disabled):
# test_embeddings_df = pd.DataFrame(test_embeddings_biobert.numpy())

In [None]:
# Preview the raw embedding table (pandas truncates the display of large frames).
train_embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.030295,-0.185693,-0.041920,-0.028763,0.181692,0.062961,0.167553,0.056802,0.057293,0.064217,...,0.065555,-0.001474,-0.004726,-0.112198,0.220956,-0.205587,-0.115361,0.109241,0.183394,0.169293
1,0.029237,-0.205409,-0.034579,-0.046131,0.177742,0.083327,0.128229,0.062446,0.082390,0.058097,...,0.054916,0.005898,-0.013933,-0.092124,0.219298,-0.199484,-0.145725,0.101959,0.170385,0.161591
2,0.012688,-0.233941,-0.041713,0.009620,0.083135,0.033530,0.118157,-0.003653,0.031487,0.107339,...,0.042694,0.076358,-0.061401,-0.134511,0.209216,-0.220635,-0.146594,0.066166,0.142969,0.111446
3,0.043633,-0.227122,-0.027773,-0.018722,0.108884,0.054514,0.085587,0.021788,0.076531,0.080204,...,0.086524,0.021252,-0.071504,-0.086410,0.219678,-0.245016,-0.150729,0.070722,0.146022,0.122570
4,0.024080,-0.183231,-0.045142,0.008250,0.148142,0.084370,0.109926,0.050109,0.130002,0.071873,...,0.056932,-0.013197,-0.041921,-0.097623,0.187800,-0.193726,-0.125712,0.062733,0.179500,0.133031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,0.008264,-0.166497,-0.029789,-0.011737,0.157884,0.070857,0.191504,0.014247,0.069745,0.068532,...,0.109878,-0.004359,-0.037844,-0.083892,0.184512,-0.205859,-0.116673,0.103203,0.138947,0.132339
2732,0.030093,-0.197604,-0.083435,-0.004301,0.144002,0.095654,0.113957,0.037428,0.073651,0.068482,...,0.049582,-0.014608,-0.055376,-0.106100,0.195165,-0.207551,-0.146356,0.101821,0.199863,0.113923
2733,-0.003842,-0.150364,-0.035759,0.021719,0.132384,0.076102,0.164434,0.011079,0.125622,0.081141,...,0.100697,0.015148,-0.035433,-0.064983,0.172464,-0.198002,-0.085270,0.075628,0.138459,0.128087
2734,0.029237,-0.205409,-0.034579,-0.046131,0.177742,0.083327,0.128229,0.062446,0.082390,0.058097,...,0.054916,0.005898,-0.013933,-0.092124,0.219298,-0.199484,-0.145725,0.101959,0.170385,0.161591


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
def run_pca_on_embeddings(embeddings, variance_threshold=0.9, return_models=False):
    """
    Standardize embeddings and reduce them with PCA.

    The embeddings are z-scored with a freshly fitted ``StandardScaler`` and
    then projected with a freshly fitted ``PCA``. The total explained variance
    of the retained components is printed.

    Args:
        embeddings: 2-D array-like or DataFrame with one row per sample.
        variance_threshold: Passed to ``PCA(n_components=...)``. A float in
            (0, 1) keeps as many components as needed to explain that
            fraction of the variance.
        return_models (bool): When True, also return the fitted scaler and PCA
            so that other data (e.g. a test set) can be projected into the
            same space with ``pca.transform(scaler.transform(X))`` instead of
            fitting a second, incompatible decomposition.

    Returns:
        pandas.DataFrame: The PCA-transformed embeddings, with integer column
        labels. If ``embeddings`` is a DataFrame its index is preserved so the
        result aligns row-by-row in a later column-wise concat.
        When ``return_models`` is True, a tuple ``(pca_df, scaler, pca)``.
    """
    # Standardize the embeddings (important for PCA)
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    # Perform PCA and retain the specified amount of variance
    pca = PCA(n_components=variance_threshold)
    pca_embeddings = pca.fit_transform(scaled_embeddings)

    # Report the total variance explained by the retained components
    explained_variance = pca.explained_variance_ratio_.sum()
    print(f"Explained Variance: {explained_variance * 100:.2f}%")

    # Keep the caller's row labels; plain arrays get the default RangeIndex.
    row_index = embeddings.index if isinstance(embeddings, pd.DataFrame) else None
    pca_df = pd.DataFrame(pca_embeddings, index=row_index)

    if return_models:
        return pca_df, scaler, pca
    return pca_df


# Reduce the training embeddings with the default variance threshold (0.9).
train_pca_embeddings = run_pca_on_embeddings(embeddings=train_embeddings_df)
# NOTE(review): enabling the line below would fit a separate scaler and PCA on the
# test set, so its components would not be comparable with the training ones.
# test_pca_embeddings = run_pca_on_embeddings(test_embeddings_df)

Explained Variance: 91.32%


In [None]:
# Name the components BERT1..BERTk from the actual column count. PCA chooses k from the
# variance threshold, so a fixed list of 10 names fails whenever k differs from 10.
train_pca_embeddings.columns = [
    f'BERT{k}' for k in range(1, train_pca_embeddings.shape[1] + 1)
]

In [None]:
# NOTE(review): `final_df` is only created by the pd.concat cell further down, so on a
# fresh kernel (Restart & Run All) this cell raises NameError. Run it after that cell.
final_df.columns

Index(['id', 'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
       'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
       'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
       'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
       'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
       'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
       'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST'

In [None]:
# Append the PCA components as extra columns next to the original training
# features; rows are matched on the index.
final_df = pd.concat(objs=[train, train_pca_embeddings], axis='columns')

In [None]:
# Persist the combined table without writing the row index as a column.
final_df.to_csv(
    path_or_buf='imputed_train_r2_l1_no_cat_emb_biobert_corrected.csv',
    index=False,
)