In [1]:
# Install necessary packages
!pip install accelerate -U > /dev/null 2>&1
!pip install transformers datasets > /dev/null 2>&1
!pip install torch > /dev/null 2>&1

In [2]:
# Import required libraries
import pandas as pd
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [3]:
# Make sure the folders for the combined text files and the model output exist
for _required_dir in ('content/combined_text', 'content/model_output'):
    os.makedirs(_required_dir, exist_ok=True)

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load and preprocess the SDoH dataset
def preprocess_and_split_sdoh_dataset(csv_file_path, train_output_file_path, test_output_file_path,
                                      train_csv_output_path=None, test_csv_output_path=None):
    """Split the SDoH CSV 80/20 and write each split as a prompt text file and a CSV.

    Args:
        csv_file_path: CSV file to read. It must contain the columns
            'text', 'label' and 'adverse'.
        train_output_file_path: Destination of the training prompt text file.
        test_output_file_path: Destination of the test prompt text file.
        train_csv_output_path: Destination of the training split CSV. When None,
            'train_data.csv' in the directory of ``train_output_file_path`` is used.
        test_csv_output_path: Destination of the test split CSV. When None,
            'test_data.csv' in the directory of ``test_output_file_path`` is used.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    # Local import so the function does not depend on imports made in another cell.
    import os

    required_columns = ('text', 'label', 'adverse')

    df = pd.read_csv(csv_file_path)
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        # Fail before any output file is created instead of halfway through writing.
        raise KeyError(f"Missing required column(s) in {csv_file_path}: {missing_columns}")

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Resolve the CSV destinations from the arguments rather than from notebook
    # globals, so the function works regardless of cell execution order.
    if train_csv_output_path is None:
        train_csv_output_path = os.path.join(os.path.dirname(train_output_file_path), 'train_data.csv')
    if test_csv_output_path is None:
        test_csv_output_path = os.path.join(os.path.dirname(test_output_file_path), 'test_data.csv')

    def write_data(split_df, output_path):
        # Explicit UTF-8 so the file can be read back with encoding='utf-8'
        # independent of the platform's default encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            for _, row in split_df.iterrows():
                line = f"Get the SDOH_LABEL and ADVERSE_LABEL from the following clinical note: {row['text']} \n[SDOH_LABEL]: {row['label']} [ADVERSE_LABEL]: {row['adverse']}\n"
                f.write(line)

    write_data(train_df, train_output_file_path)
    write_data(test_df, test_output_file_path)

    # Save the training and testing datasets as CSV files
    train_df.to_csv(train_csv_output_path, index=False)
    test_df.to_csv(test_csv_output_path, index=False)

# Source CSV that is read and split into train/test sets
csv_file_path = 'Iteration__1.csv'
# Prompt-formatted text files, one per split
train_output_path = 'content/combined_text/train.txt'
test_output_path = 'content/combined_text/test.txt'
# CSV copies of the two splits
train_csv_output_path = 'content/combined_text/train_data.csv'
test_csv_output_path = 'content/combined_text/test_data.csv'
# Run the split and write all output files
preprocess_and_split_sdoh_dataset(csv_file_path, train_output_path, test_output_path)


In [5]:
from torch.utils.data import Dataset
import torch
from transformers import GPT2Tokenizer

class CustomTextDataset(Dataset):
    """Dataset in which every line of a text file is one tokenised sample.

    Each line of ``file_path`` is encoded with ``tokenizer.encode`` (special
    tokens enabled, truncated to at most ``block_size`` tokens) when the
    dataset is constructed. Items are returned as ``torch.long`` tensors.
    """

    def __init__(self, file_path, tokenizer, block_size=128):
        self.tokenizer = tokenizer

        def _encode(text_line):
            # Same encoding settings for every line of the file.
            return tokenizer.encode(
                text_line,
                add_special_tokens=True,
                truncation=True,
                max_length=block_size,
            )

        with open(file_path, 'r', encoding='utf-8') as handle:
            self.samples = [_encode(text_line) for text_line in handle]

    def __len__(self):
        """Number of encoded lines."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the token ids of sample ``idx`` as a 1-D long tensor."""
        token_ids = self.samples[idx]
        return torch.tensor(token_ids, dtype=torch.long)


In [6]:

from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments


In [7]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over ``file_path`` with the given tokenizer and block size."""
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

In [8]:
# Builds the language-modeling data collator for the given tokenizer
def load_data_collator(tokenizer, mlm = False):
    """Return a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    ``mlm`` is forwarded unchanged and switches masked language modeling
    on or off in the collator.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [9]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size,
          num_train_epochs, save_steps, save_total_limit=3, block_size=128, logging_steps=100):
    """Fine-tune a pretrained GPT-2 model on a plain-text file and save it to ``output_dir``.

    Args:
        train_file_path: Text file used to build the training ``TextDataset``.
        model_name: Name or path given to ``from_pretrained`` for tokenizer and model.
        output_dir: Directory that receives the tokenizer, checkpoints and final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
        save_total_limit: Forwarded to ``TrainingArguments`` (default 3).
        block_size: Block size of the ``TextDataset`` (default 128).
        logging_steps: Forwarded to ``TrainingArguments`` (default 100).
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # The training file is tokenised once, by the dataset that is actually
    # handed to the Trainer.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file_path,
        block_size=block_size)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    # Resize before saving: the tokenizer gained a '[PAD]' token, so the
    # embedding matrix must cover it for the saved model/tokenizer pair to match.
    model.resize_token_embeddings(len(tokenizer))

    # Initial snapshot of tokenizer and (resized) base model in output_dir.
    tokenizer.save_pretrained(output_dir)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
        logging_steps=logging_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    # Write the fine-tuned weights to output_dir.
    trainer.save_model()


# Define training parameters and initiate training
train_file_path = 'content/combined_text/train.txt'
model_name = 'gpt2-large'
output_dir = 'content/model_output/sdoh_extracotor_gpt'
overwrite_output_dir = True
per_device_train_batch_size = 16  # based on GPU/CPU memory
num_train_epochs = 100  
save_steps = 50
# NOTE(review): this value is not passed to train() in the call below
save_total_limit=3
# Initiate Training
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
100,0.5863
200,0.1473
300,0.0621
400,0.0314
500,0.0247
600,0.023
700,0.0196
800,0.0185
900,0.0175
1000,0.0164


In [10]:
# Recursively archive the model output directory into model_output.zip
!zip -r model_output.zip content/model_output

  adding: content/model_output/ (stored 0%)
  adding: content/model_output/sdoh_extracotor_gpt/ (stored 0%)
  adding: content/model_output/sdoh_extracotor_gpt/vocab.json (deflated 68%)
  adding: content/model_output/sdoh_extracotor_gpt/config.json (deflated 51%)
  adding: content/model_output/sdoh_extracotor_gpt/added_tokens.json (stored 0%)
  adding: content/model_output/sdoh_extracotor_gpt/runs/ (stored 0%)
  adding: content/model_output/sdoh_extracotor_gpt/runs/Mar31_23-41-37_8c36b875607d/ (stored 0%)
  adding: content/model_output/sdoh_extracotor_gpt/runs/Mar31_23-41-37_8c36b875607d/events.out.tfevents.1711928498.8c36b875607d.2159.0 (deflated 62%)
  adding: content/model_output/sdoh_extracotor_gpt/training_args.bin (deflated 51%)
  adding: content/model_output/sdoh_extracotor_gpt/generation_config.json (deflated 24%)
  adding: content/model_output/sdoh_extracotor_gpt/checkpoint-2100/ (stored 0%)
  adding: content/model_output/sdoh_extracotor_gpt/checkpoint-2100/optimizer.pt (deflat

In [11]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [12]:
!mkdir -p '/drive/My Drive/NLP'

!cp /content/model_output.zip '/content/drive/My Drive/NLP/'