In [1]:
# !pip install datasets==1.18.3 python-dotenv==0.19.2
# !pip install tokenizers==0.19
# !pip install accelerate

In [2]:
!nvidia-smi

Sat Nov  9 20:09:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  | 00000000:81:00.0 Off |                    0 |
| N/A   37C    P0              45W / 300W |      4MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# !kill -9 65039

In [1]:
import os
os.environ['HF_HOME'] = '/scratch/gilbreth/dparveez/'
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
import gc

gc.collect()

0

In [3]:
!export HF_HOME=/scratch/gilbreth/dparveez/
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [4]:
BASE_PATH = '/scratch/gilbreth/dparveez/'

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from util import nethook
from util.generate import generate_interactive, generate_fast

from experiments.py.demo import demo_model_editing, stop_execution

In [6]:
MODEL_NAME = "gpt2-xl"

In [10]:
model, tok = (
    AutoModelForCausalLM.from_pretrained(MODEL_NAME, low_cpu_mem_usage=False, cache_dir=BASE_PATH).to(
        "cuda"
    ),
    AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=BASE_PATH),
)
tok.pad_token = tok.eos_token
model.config

GPT2Config {
  "_name_or_path": "gpt2-xl",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [11]:
with open('pfw.txt', 'r') as f:
    data = f.read()

In [12]:
# Adjust chunk_size based on model's max input size
# Adjust stride based on desired overlap
def chunkify(text, chunk_size=1024, stride=256):
    chunks = []
    for i in range(0, len(text), stride):
        chunk = text[i:i + chunk_size]
        if len(chunk) == chunk_size:
            chunks.append(chunk)
    return chunks

text_chunks = chunkify(data)

In [13]:
encoded_chunks = tok(text_chunks, truncation=True, padding=True, return_tensors="pt")

In [14]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = TextDataset(encoded_chunks)

In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=os.path.join(BASE_PATH, "results"),
    num_train_epochs=75,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=750,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir=os.path.join(BASE_PATH, "logs"),
    logging_steps=50,
    save_steps=500,                   # Save checkpoint every 500 steps
    save_total_limit=1,
    gradient_accumulation_steps=8,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
torch.cuda.empty_cache()

In [None]:
trainer.train()

Step,Training Loss
50,4.8829


In [None]:
trainer.save_model(os.path.join(BASE_PATH, "fine_tuned_model_2/"))

In [None]:
tok.save_pretrained(os.path.join(BASE_PATH, "fine_tuned_model_2/"))