In [1]:
# Environment setup cell: load the CUDA/cuDNN modules, install the Python
# dependencies, and print basic host information.
# NOTE(review): each `!` line presumably runs in its own subshell, so these
# `module load` calls may not affect later lines or the kernel process --
# confirm on this cluster.
!module load cuda/11.6.2
!module load cudnn/8.6.0.163-cuda11
# Quiet install of the notebook's dependencies. No versions are pinned here,
# so the resolved versions can differ between runs.
!pip install -q --use-feature=2020-resolver pynvml zstandard datasets psutil transformers torch bitsandbytes accelerate loralib peft
# Print the hostname and the CPU "Model name" reported by lscpu.
!echo "Hostname: $(hostname)"
!echo "Processor: $(lscpu | grep 'Model name' | awk -F ':' '{print $2}' | xargs)"
# Prints the 4th whitespace-separated field of the `Mem:` row of `free -h`
# (presumably the "free" column rather than the total -- TODO confirm).
!echo "RAM: $(free -h | grep 'Mem:' | awk '{print $4}')"

You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m
Hostname: gr057.hpc.nyu.edu
Processor: Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
RAM: 187Gi


In [1]:
# Print the GPU name(s) as reported by `nvidia-smi --query-gpu=name`.
!echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader)"
# Print fields 9-11 of every `nvidia-smi` line containing "MiB"; in the output
# captured below this renders as "<used>MiB/<total>MiB".
!echo "GPU Memory: $(nvidia-smi | grep MiB |  awk '{print $9 $10 $11}')"

GPU: Quadro RTX 8000
GPU Memory: 0MiB/46080MiB


In [2]:
# Change net ID here to use your scratch folder
# ENV selects the sub-folder of the scratch area used for ROOT_PATH.
ENV = "dev"
NET_ID = "vgn2004"
# DATA_PATH is used below for datasets, tokenizer cache and tokenized splits.
DATA_PATH =  f"/scratch/{NET_ID}/fine_tuning" 
# ROOT_PATH is used below for the Trainer's output (models) and logging dirs.
ROOT_PATH = f"/scratch/{NET_ID}/fine_tuning/{ENV}"

# Ensure that packages can be found
# The per-user site-packages directory is put first on sys.path; this must run
# before the third-party imports further down in this cell.
import sys
sys.path.insert(0, f"/home/{NET_ID}/.local/lib/python3.8/site-packages")


# Global configurations
# Single dictionary of tunables read by the functions in this cell.
# Keys marked "not read" are not referenced by any code visible in this cell.
config = {
    # Dataset location: "<DATASET_URL>/<DATASET_NAME>.jsonl.zst"
    "DATASET_URL": "https://the-eye.eu/public/AI/pile_v2/data",
    "DATASET_NAME": "NIH_ExPORTER_awarded_grant_text",
    # Worker count for load_dataset (num_proc) and the DataLoaders (num_workers)
    "NUM_WORKERS": 8,
    # Fraction of the tokenized rows that goes to the training split
    "DATASET_SPLIT_RATIO": 0.9,
    # Tokenizer padding mode and max_length (also the generation max_length)
    "PADDING_STRATEGY": "max_length",
    "MAX_TOKENS": 128,
    "MODEL_NAME": "facebook/opt-350m",
    # Suffix used in the on-disk name of the tokenized dataset
    "TOKENIZED_NAME": "opt_350",
    "BATCH_SIZE": 16,
    "NUM_EPOCHS": 30,
    "LEARNING_RATE": 5e-4,
    "MIN_LEARNING_RATE": 5e-5,  # not read
    "EPSILON": 1e-8,  # not read
    "BETAS": (0.9,0.95),  # not read
    "GRADIENT_CLIP": 1.0,  # not read
    "WEIGHT_DECAY": 0.01,  # not read; TrainingArguments below hard-codes 0.01
    "DECAY_STYLE": "cosine", # not used currently
    "WARMUP_RATIO": 0.003,  # not read; TrainingArguments below hard-codes 0.05
    # Step intervals passed to TrainingArguments as logging_steps, save_steps
    # and eval_steps respectively
    "SAMPLING_INTERVAL": 20,
    "CHECKPOINTING_INTERVAL": 100,
    "VALIDATION_INTERVAL": 500,
    "GRADIENT_ACCUMULATION_STEPS": 4,

}

# Low Rank Adapters config
# Passed to get_peft_model() in the __main__ block to wrap the base model.
from peft import LoraConfig, PeftConfig, get_peft_model 
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    # Only modules with these names receive adapters.
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Weight Quantization config
# Passed as quantization_config= when the base model is loaded in
# create_or_load_model(); requests 8-bit loading and lists "lm_head" under
# llm_int8_skip_modules.
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_skip_modules=["lm_head"],
    llm_int8_threshold=3.0
)

# Ensure that GPU can be found
# Restrict the process to GPU index 0 and silence transformers advisory warnings.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Setup logging
# `logger` is the root logger (getLogger() with no name), so the
# `logging.info(...)` and `logger.info(...)` calls used interchangeably below
# all go through the same logger and share this format.
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s')

# Packages for profiling
import random
import psutil
import time
from tqdm import tqdm
import tqdm.notebook as tq
from pynvml import *

# Packages for data loading
from datasets import load_dataset, load_from_disk, DatasetDict, Dataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Core packages
import torch
# Enable cuDNN benchmark mode and release any cached CUDA memory.
torch.backends.cudnn.benchmark = True
torch.cuda.empty_cache()
# Turn off the memory-efficient and math scaled-dot-product-attention backends;
# the three log lines below report which backends remain enabled (the captured
# output shows only flash attention left on).
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(False)
logging.info(f"Is Flash Attention Enabled: {torch.backends.cuda.flash_sdp_enabled()}")
logging.info(f"Is Mem Efficient SDP Enabled: {torch.backends.cuda.mem_efficient_sdp_enabled()}")
logging.info(f"Is Math SDP Enabled: {torch.backends.cuda.math_sdp_enabled()}")



# Get GPU Utilization
def print_gpu_utilization(interval=10, stop_event=None):
    """Periodically log the memory in use on GPU 0 until asked to stop.

    Intended to run in a background thread. NVML is initialised once and the
    device handle is looked up once, instead of on every loop iteration, and
    NVML is shut down again when the loop ends.

    Args:
        interval: Seconds to wait between two log lines (default 10).
        stop_event: Optional ``threading.Event``-like object. When given, the
            loop exits soon after the event is set. When omitted, the loop
            runs until the process (or daemon thread) ends.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(0)
        while stop_event is None or not stop_event.is_set():
            info = nvmlDeviceGetMemoryInfo(handle)
            logger.info(f"GPU memory occupied: {info.used//1024**2} MB.")
            if stop_event is None:
                time.sleep(interval)
            else:
                # wait() returns early when the event is set, so a stop
                # request does not have to wait out a full interval.
                stop_event.wait(interval)
    finally:
        # Balance the nvmlInit() above even if a query raises.
        nvmlShutdown()
    

# Returns RAM usage in MB
def get_ram_usage():
    """Return the resident set size of the current process in MB."""
    bytes_per_mb = 1024 * 1024
    current_process = psutil.Process()
    resident_bytes = current_process.memory_info().rss
    return resident_bytes / bytes_per_mb

# Returns number of trainable parameters and percentage
def print_trainable_parameters(model):
    """Log the trainable and total parameter counts of ``model``.

    Counts come from ``model.named_parameters()``; a parameter is trainable
    when its ``requires_grad`` flag is set. Counts are logged in millions
    together with the trainable percentage.
    """
    # One pass over the parameters: (element count, is trainable) per tensor.
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    logger.info(
        f"Parameters: Trainable- {trainable_params/1e6:.2f}M|| All- {all_param/1e6:.2f}M || Trainable%- {100 * trainable_params / all_param}"
    )

#Takes a batch of inputs and runs the tokenizer on them
def tokenize_function(examples, tokenizer):
    """Tokenize the ``"text"`` column of a batch.

    Padding strategy and maximum length come from the global ``config``;
    truncation is always on and the attention mask is always returned.
    """
    texts = examples["text"]
    tokenizer_options = {
        "padding": config["PADDING_STRATEGY"],
        "truncation": True,
        "max_length": config["MAX_TOKENS"],
        "return_attention_mask": True,
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenizes dataset and creates train and validation split
def preprocess_data(dataset, tokenizer):
    """Tokenize ``dataset`` and split it into train and validation sets.

    Results are cached under ``DATA_PATH/datasets``:

    * If both split directories already exist they are loaded and returned.
    * Otherwise the tokenized dataset is loaded from its cache directory, or
      produced with ``dataset.map`` and saved there if loading fails.
    * The tokenized rows are then split in order: the first
      ``DATASET_SPLIT_RATIO`` fraction becomes the training set and the rest
      the validation set. Both splits are saved to disk.

    Args:
        dataset: Raw dataset with ``"text"`` and ``"meta"`` columns (both are
            removed during tokenization).
        tokenizer: Tokenizer forwarded to ``tokenize_function``.

    Returns:
        ``(train_dataset, valid_dataset)``.
    """
    tokenized_dataset_path = f"{DATA_PATH}/datasets/tokenized_{config['DATASET_NAME']}_{config['TOKENIZED_NAME']}"
    train_dataset_path = f"{tokenized_dataset_path}_train"
    valid_dataset_path = f"{tokenized_dataset_path}_valid"

    # Fast path: both splits were produced by an earlier run.
    if os.path.exists(train_dataset_path) and os.path.exists(valid_dataset_path):
        logger.info(f"Loading dataset from disk...")
        start_time = time.time()
        train_dataset = load_from_disk(train_dataset_path)
        valid_dataset = load_from_disk(valid_dataset_path)
        elapsed_time = time.time() - start_time
        logger.info(f"Time taken to load dataset from : {elapsed_time:.2f} seconds")
        return train_dataset, valid_dataset

    logger.info(f"Tokenizing the dataset...")
    start_time = time.time()
    try:
        tokenized_dataset = load_from_disk(tokenized_dataset_path)
    except Exception as e:
        # Best effort: any failure to read the cache falls back to tokenizing.
        logger.warning(
            f"Could not load tokenized dataset from {tokenized_dataset_path} ({e}); tokenizing from scratch."
        )
        tokenized_dataset = dataset.map(
            tokenize_function,
            fn_kwargs={'tokenizer': tokenizer},
            batched=True,
            # Use the configured worker count, as the other loaders in this file do.
            num_proc=config["NUM_WORKERS"],
            remove_columns=["text", "meta"],
        )
        tokenized_dataset.save_to_disk(tokenized_dataset_path)

    elapsed_time = time.time() - start_time
    logger.info(f"Time taken to tokenize the dataset: {elapsed_time:.2f} seconds")

    logger.info(f"Splitting the dataset...")
    start_time = time.time()

    # The split directories were checked at the top of the function, so at this
    # point the splits always have to be (re)built.
    total_rows = len(tokenized_dataset)
    train_size = int(config["DATASET_SPLIT_RATIO"] * total_rows)
    # select() picks rows by index without first turning the whole dataset into
    # Python lists, which slicing followed by Dataset.from_dict would do.
    train_dataset = tokenized_dataset.select(range(train_size))
    valid_dataset = tokenized_dataset.select(range(train_size, total_rows))
    train_dataset.save_to_disk(train_dataset_path)
    valid_dataset.save_to_disk(valid_dataset_path)

    elapsed_time = time.time() - start_time
    logger.info(f"Time taken to split the datasets (or load pre-split datasets): {elapsed_time:.2f} seconds")

    return train_dataset, valid_dataset

# Creates data loaders
def create_dataloaders(train_dataset, valid_dataset, data_collator):
    """Build the training and validation ``DataLoader`` objects.

    The training loader draws samples through a ``RandomSampler`` and the
    validation loader through a ``SequentialSampler``. Batch size and worker
    count come from the global ``config``; both loaders use ``data_collator``
    and pinned memory.

    Returns:
        ``(train_dataloader, valid_dataloader)``.
    """
    logger.info(f"Creating data loaders...")
    start_time = time.time()

    # Settings common to both loaders.
    shared_options = {
        "batch_size": config["BATCH_SIZE"],
        "num_workers": config["NUM_WORKERS"],
        "collate_fn": data_collator,
        "pin_memory": True,
    }
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, **shared_options)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, **shared_options)

    elapsed_time = time.time() - start_time
    logging.info(f"Time taken to create data loaders: {elapsed_time:.2f} seconds")
    return train_dataloader, valid_dataloader

# Fetches tokenizer relevant to the model
def create_or_load_tokenizer(checkpointed_path=None):
    """Return a tokenizer, either restored from a checkpoint or freshly set up.

    With a truthy ``checkpointed_path`` the tokenizer is loaded from that path
    and returned unchanged. Otherwise the tokenizer for
    ``config["MODEL_NAME"]`` is loaded (cached under ``DATA_PATH/datasets``),
    its pad token is set to its EOS token, and padding is set to the left.
    """
    if not checkpointed_path:
        fresh_tokenizer = AutoTokenizer.from_pretrained(
            config["MODEL_NAME"],
            cache_dir=f"{DATA_PATH}/datasets",
        )
        fresh_tokenizer.pad_token = fresh_tokenizer.eos_token
        fresh_tokenizer.padding_side = 'left'
        return fresh_tokenizer

    return AutoTokenizer.from_pretrained(checkpointed_path)

# Data preparation
def run_data_pipeline(tokenizer, load_from_file=False):
    """Load the dataset, tokenize and split it, and build the data loaders.

    Args:
        tokenizer: Tokenizer used for tokenization and by the data collator.
        load_from_file: When True, read the ``.jsonl.zst`` file from
            ``DATA_PATH/datasets`` directly. When False, try the remote URL
            first and fall back to the local file if that fails.

    Returns:
        dict with the keys ``TRAIN_DATASET``, ``VALIDATION_DATASET``,
        ``TRAIN_DATALOADER``, ``VALIDATION_DATALOADER``, ``TOKENIZER`` and
        ``DATA_COLLATOR``.
    """
    # Measure how much RAM is being used before anything runs
    ram_usage = get_ram_usage()
    logging.info(f"Baseline: RAM used: {ram_usage:.2f} MB")

    # Load data, either from url or from datasets folder
    data_file_url = f"{config['DATASET_URL']}/{config['DATASET_NAME']}.jsonl.zst"
    local_data_file = f"{DATA_PATH}/datasets/{config['DATASET_NAME']}.jsonl.zst"

    def _load_json_split(data_files):
        # Both sources are loaded with identical settings; only the file differs.
        return load_dataset("json",
                            data_files=data_files,
                            num_proc=config["NUM_WORKERS"],
                            split="train",
                            cache_dir=f"{DATA_PATH}/datasets")

    if load_from_file:
        # Explicit local load: no exception is raised just to reach the
        # fallback, so no spurious empty ERROR line is logged.
        dataset = _load_json_split(local_data_file)
    else:
        try:
            dataset = _load_json_split(data_file_url)
        except Exception as e:
            logging.error(f"Could not load dataset from {data_file_url}: {e}. Falling back to {local_data_file}")
            dataset = _load_json_split(local_data_file)

    # Measurements relevant to the dataset
    ram_usage = get_ram_usage()
    logging.info(f"RAM used: {ram_usage:.2f} MB")
    logging.info(f"Dataset sample: {dataset[10]}")
    # NOTE(review): dataset_size is presumably optional metadata and may be
    # None; the size line is skipped in that case instead of raising.
    size_bytes = dataset.dataset_size
    if size_bytes is not None:
        size_gb = size_bytes / (1024 ** 3)
        logging.info(f"Dataset size (cache file) : {size_gb:.2f} GB")

    # Tokenize + split the dataset with the tokenizer supplied by the caller
    train_dataset, valid_dataset = preprocess_data(dataset, tokenizer)

    # Create a data collator and use it to make data loaders
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    train_dataloader, valid_dataloader = create_dataloaders(train_dataset, valid_dataset, data_collator)

    return {
        "TRAIN_DATASET": train_dataset,
        "VALIDATION_DATASET": valid_dataset,
        "TRAIN_DATALOADER": train_dataloader,
        "VALIDATION_DATALOADER": valid_dataloader,
        "TOKENIZER": tokenizer,
        "DATA_COLLATOR": data_collator
    }


# Create model
def create_or_load_model(checkpointed_path=None, frozen=True, cast_layer_norm_to_fp32=True, cast_output_to_fp32=True):
    """Load the causal language model and prepare it for training.

    Args:
        checkpointed_path: If truthy, load the model from this path and move
            it to the selected device. Otherwise load ``config["MODEL_NAME"]``
            with the module-level ``quantization_config`` and
            ``device_map='auto'``.
        frozen: Fresh load only. Set ``requires_grad = False`` on every
            parameter.
        cast_layer_norm_to_fp32: Fresh load only. Cast every 1-D parameter to
            float32.
        cast_output_to_fp32: Wrap ``model.lm_head`` so its output is cast to
            float32.

    Returns:
        ``(model, device)``, where ``device`` is CUDA when available, else CPU.

    Note:
        ``frozen`` and ``cast_layer_norm_to_fp32`` have no effect when a
        checkpoint path is given.
    """
    class CastOutputToFloat(torch.nn.Sequential):
        """Sequential wrapper that casts the wrapped module's output to float32."""
        def forward(self, x): return super().forward(x).to(torch.float32)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if checkpointed_path:
        model = AutoModelForCausalLM.from_pretrained(checkpointed_path)
        model.to(device)
    else:
        configuration = AutoConfig.from_pretrained(config["MODEL_NAME"])
        # 8-bit loading is requested through quantization_config alone
        # (it sets load_in_8bit=True). Passing the load_in_8bit kwarg as well
        # is redundant and is rejected by newer transformers releases.
        model = AutoModelForCausalLM.from_pretrained(
            config["MODEL_NAME"],
            config=configuration,
            device_map='auto',
            quantization_config=quantization_config,
        )

        if frozen:
            for param in model.parameters():
                param.requires_grad = False

        if cast_layer_norm_to_fp32:
            for param in model.parameters():
                # Selects by shape only: every 1-D parameter is cast.
                if param.ndim == 1:
                    param.data = param.data.to(torch.float32)

    #Enable gradient checkpointing
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    if cast_output_to_fp32:
        model.lm_head = CastOutputToFloat(model.lm_head)

    # Log details
    logger.info(f"Model: {config['MODEL_NAME']}")
    print_trainable_parameters(model)
    logger.info(f"Memory Memory Footprint: {model.get_memory_footprint() / 1e6:,} MB")
    logger.info(f"Model is on device: {model.device}")

    # The KV cache is switched off for training; inference() turns it on temporarily.
    model.config.use_cache = False
    return model, device

# Use the model to generate text
def generate(model, inputs, tokenizer=None, max_length=None):
    """Sample one sequence from ``model`` for the given tokenized ``inputs``.

    Args:
        model: Model exposing ``generate``.
        inputs: Mapping of tokenized inputs, unpacked into ``model.generate``.
        tokenizer: Tokenizer supplying the BOS and EOS ids (EOS is also used
            as the pad id). When omitted, a module-level ``tokenizer`` is used
            for compatibility with existing ``generate(model, inputs)`` calls.
        max_length: Maximum total length. Defaults to ``config["MAX_TOKENS"]``.

    Returns:
        The output of ``model.generate``.

    Raises:
        ValueError: If no tokenizer is passed and none exists at module level.
    """
    if tokenizer is None:
        # Older callers relied on a global `tokenizer` created by the
        # __main__ block; keep that working but do not require it.
        tokenizer = globals().get("tokenizer")
    if tokenizer is None:
        raise ValueError("generate() needs a tokenizer: pass tokenizer=... or define a module-level `tokenizer`.")
    if max_length is None:
        max_length = config["MAX_TOKENS"]

    output_sequence = model.generate(
        **inputs,
        bos_token_id=tokenizer.bos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        max_length=max_length,
        top_p=0.95,
        num_return_sequences=1
    )
    return output_sequence

def inference(model, tokenizer, device, max_tokens=128):
    """Generate a text sample starting from the prompt ``<eos>This``.

    The model is switched to eval mode with caching enabled for generation.
    It is put back into train mode with caching disabled afterwards, even if
    generation or decoding raises.

    Args:
        model: Model to sample from.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        device: Device the encoded prompt is moved to.
        max_tokens: Maximum total sequence length passed to ``generate``.

    Returns:
        The decoded text with special tokens skipped, newlines removed and
        tabs replaced by spaces.
    """
    # Put the model in eval mode and enable caching
    model.config.use_cache = True
    model.eval()

    try:
        inputs = tokenizer(tokenizer.eos_token+"This", return_tensors="pt").to(device)
        # Generate a sequence of text tokens
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                # Pass the tokenizer and length explicitly so this function
                # depends on neither a global tokenizer nor a global length.
                output_sequence = generate(model, inputs, tokenizer=tokenizer, max_length=max_tokens)

        # Decode the tokens to text
        generated_text = tokenizer.decode(output_sequence[0],
                                          clean_up_tokenization_spaces=True,
                                          skip_special_tokens=True).replace('\n', '').replace('\t', ' ')
    finally:
        # Put the model back into train mode and disable caching
        model.train()
        model.config.use_cache = False

    return generated_text

    
# Custom training function
def custom_training_function(trainer, model, inputs):
    """Run one training forward pass and return its loss.

    Puts ``model`` in train mode, moves every value of ``inputs`` to
    ``trainer.args.device``, calls the model with the moved values as keyword
    arguments, and returns the ``loss`` attribute of the result.
    """
    model.train()
    batch_on_device = {}
    for name, tensor in inputs.items():
        batch_on_device[name] = tensor.to(trainer.args.device)
    return model(**batch_on_device).loss

# Custom evaluation function
def custom_evaluation_function(trainer, model, inputs):
    """Run one evaluation forward pass and return its loss.

    Puts ``model`` in eval mode, moves every value of ``inputs`` to
    ``trainer.args.device``, and calls the model under ``torch.no_grad()``.
    Returns the ``loss`` attribute of the result.
    """
    model.eval()
    batch_on_device = {}
    for name, tensor in inputs.items():
        batch_on_device[name] = tensor.to(trainer.args.device)
    with torch.no_grad():
        eval_loss = model(**batch_on_device).loss
    return eval_loss

if __name__=="__main__":
    # Driver: start GPU monitoring, prepare data and model, wrap the model with
    # LoRA adapters and fine-tune it with the Hugging Face Trainer.

    # start GPU utilization monitoring
    # The monitor loops forever, so it runs as a daemon thread and cannot keep
    # the process alive. Re-running this cell reuses a monitor that is already
    # running instead of starting a second one, which would double every
    # "GPU memory occupied" log line.
    import threading
    gpu_monitor_thread = next(
        (t for t in threading.enumerate() if t.name == "gpu-monitor" and t.is_alive()),
        None,
    )
    if gpu_monitor_thread is None:
        gpu_monitor_thread = threading.Thread(
            target=print_gpu_utilization,
            name="gpu-monitor",
            daemon=True,
        )
        gpu_monitor_thread.start()

    checkpointed_path = None
    tokenizer = create_or_load_tokenizer(checkpointed_path=checkpointed_path)
    data_dict = run_data_pipeline(tokenizer, load_from_file=True)

    model, device = create_or_load_model(checkpointed_path=checkpointed_path,
                                         frozen=True,
                                         cast_layer_norm_to_fp32=True,
                                         cast_output_to_fp32=True)
    # Wrap the frozen base model with trainable LoRA adapters.
    model = get_peft_model(model, lora_config)
    logger.info(f"Peft Model: {config['MODEL_NAME']}")
    print_trainable_parameters(model)
    logger.info(f"Memory Memory Footprint: {model.get_memory_footprint() / 1e6:,} MB")
    logger.info(f"Model is on device: {model.device}")
    logger.info(f"Iterations: {len(data_dict['TRAIN_DATALOADER'])*config['NUM_EPOCHS']}")

    training_args = TrainingArguments(
        output_dir=f"{ROOT_PATH}/models",
        overwrite_output_dir=True,
        num_train_epochs=config["NUM_EPOCHS"],
        per_device_train_batch_size=config["BATCH_SIZE"],
        per_device_eval_batch_size=config["BATCH_SIZE"],
        gradient_checkpointing=True,
        gradient_accumulation_steps=config["GRADIENT_ACCUMULATION_STEPS"],
        learning_rate=config["LEARNING_RATE"],
        # Read from config instead of repeating the literal (same value, 0.01).
        weight_decay=config["WEIGHT_DECAY"],
        fp16=True,
        # NOTE(review): config["WARMUP_RATIO"] is 0.003 but 0.05 is used here;
        # kept as-is to preserve the training schedule -- confirm which is intended.
        warmup_ratio=0.05,
        log_level="info",
        logging_dir=f"{ROOT_PATH}/logs",
        logging_steps=config["SAMPLING_INTERVAL"],
        report_to="none",
        disable_tqdm=False,
        # Read from config instead of repeating the literal (same value, 8).
        dataloader_num_workers=config["NUM_WORKERS"],
        evaluation_strategy="steps",
        save_steps=config["CHECKPOINTING_INTERVAL"],
        eval_steps=config["VALIDATION_INTERVAL"],
        prediction_loss_only=True,
        dataloader_drop_last=True
    )

#     training_args = training_args.set_optimizer(name="adafactor", lr=config["LEARNING_RATE"], scale_parameter=False, relative_step=False, warmup_init=False)

    # Create the Trainer object
    # The Trainer builds its own data loaders, so it is given the datasets
    # directly (the same objects the pipeline's loaders wrap).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=data_dict["TRAIN_DATASET"],
        eval_dataset=data_dict["VALIDATION_DATASET"],
        data_collator=data_dict["DATA_COLLATOR"],
        compute_metrics=None
    )

    #Train the model using the Trainer object
    trainer.train()


2023-05-01 06:31:28,128 - INFO - <module>:90 - Is Flash Attention Enabled: True
2023-05-01 06:31:28,128 - INFO - <module>:91 - Is Mem Efficient SDP Enabled: False
2023-05-01 06:31:28,129 - INFO - <module>:92 - Is Math SDP Enabled: False
2023-05-01 06:31:28,131 - INFO - print_gpu_utilization:102 - GPU memory occupied: 4314 MB.
loading configuration file config.json from cache at /scratch/vgn2004/fine_tuning/datasets/models--facebook--opt-350m/snapshots/cb32f77e905cccbca1d970436fb0f5e6b58ee3c5/config.json
Model config OPTConfig {
  "_name_or_path": "facebook/opt-350m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "do_layer_norm_before": false,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 4096,
  "hidden_size": 1024,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "max_positi

loading weights file pytorch_model.bin from cache at /home/vgn2004/.cache/huggingface/hub/models--facebook--opt-350m/snapshots/cb32f77e905cccbca1d970436fb0f5e6b58ee3c5/pytorch_model.bin
Instantiating OPTForCausalLM model under default dtype torch.float16.
Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": 2,
  "pad_token_id": 1,
  "transformers_version": "4.28.1"
}

Detected 8-bit loading: activating 8-bit loading for this model
All model checkpoint weights were used when initializing OPTForCausalLM.

All the weights of OPTForCausalLM were initialized from the model checkpoint at facebook/opt-350m.
If your task is similar to the task the model of the checkpoint was trained on, you can already use OPTForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /home/vgn2004/.cache/huggingface/hub/models--facebook--opt-350m/snapshots/cb32f77e905cccbca1d970436fb0f5e6b58ee3c5/gener

Step,Training Loss,Validation Loss


2023-05-01 06:31:37,984 - INFO - print_gpu_utilization:102 - GPU memory occupied: 5196 MB.
2023-05-01 06:31:38,142 - INFO - print_gpu_utilization:102 - GPU memory occupied: 5196 MB.
2023-05-01 06:31:47,995 - INFO - print_gpu_utilization:102 - GPU memory occupied: 5198 MB.
2023-05-01 06:31:48,152 - INFO - print_gpu_utilization:102 - GPU memory occupied: 5198 MB.
2023-05-01 06:31:58,007 - INFO - print_gpu_utilization:102 - GPU memory occupied: 5198 MB.
2023-05-01 06:31:58,179 - INFO - print_gpu_utilization:102 - GPU memory occupied: 5198 MB.
2023-05-01 06:32:08,018 - INFO - print_gpu_utilization:102 - GPU memory occupied: 5198 MB.
2023-05-01 06:32:08,191 - INFO - print_gpu_utilization:102 - GPU memory occupied: 5198 MB.


KeyboardInterrupt: 

In [None]:
# total number of training iterations
# lr_decay_iters = max_iters # should be ~= max_iters per Chinchilla
# grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
# warmup_iters = 0.003*max_iters # how many steps to warm up for



#         # Create the TrainingArguments object
#         training_args = TrainingArguments(
#             output_dir=f"{ROOT_PATH}/models",
#             overwrite_output_dir=True,
#             num_train_epochs=config["NUM_EPOCHS"],
#             per_device_train_batch_size=config["BATCH_SIZE"],
#             per_device_eval_batch_size=config["BATCH_SIZE"],
#             gradient_checkpointing=True,
#             auto_find_batch_size=True,
#             gradient_accumulation_steps=config["GRADIENT_ACCUMULATION_STEPS"],
#             learning_rate=config["LEARNING_RATE"],
#             weight_decay=0.01,
#             fp16=True,
#             fp16_full_eval=True,
#             warmup_ratio=0.05,
#             log_level="info",
#             logging_dir=f"{ROOT_PATH}/logs",
#             logging_steps=config["SAMPLING_INTERVAL"],
#             report_to="none",
#             disable_tqdm=False,
#             dataloader_num_workers=8,
#             evaluation_strategy="steps",
#             save_steps=config["CHECKPOINTING_INTERVAL"],
#             eval_steps=config["VALIDATION_INTERVAL"],
#             prediction_loss_only=True,
#             dataloader_drop_last=True,
#         )
#         training_args = training_args.set_optimizer(name="adafactor", lr=config["LEARNING_RATE"], scale_parameter=False, relative_step=False, warmup_init=False)

#         # Create the Trainer object
#         trainer = Trainer(
#             model=model,
#             args=training_args,
#             train_dataset=data_dict["TRAIN_DATALOADER"].dataset,
#             eval_dataset=data_dict["VALIDATION_DATALOADER"].dataset,
#             data_collator=data_dict["DATA_COLLATOR"],
#             compute_loss=custom_training_function,
#             compute_metrics=None,
#         )

        # Train the model using the Trainer object
#         trainer.train()
#     elif config["PEFT"]:
#         model, device = create_or_load_model(checkpointed_path=checkpointed_path, cast_layer_norm_to_fp32=True)
        
#         # Model compilation
#         # model = torch.compile(model)
        
#         generated_text = inference(model, tokenizer, device)
#         logging.info(f"Initial Text:\n{generated_text}")

#         train(model, device, data_dict)