# Install Necessary packages


In [1]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.2-py3-none-any.whl.metadata (9.8 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp39-cp39-win_amd64.whl.metadata (10 kB)
Downloading wandb-0.16.2-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   -- ------------------------------------- 0.1/2.2 MB 2.2 MB/s eta 0:00:01
   ------ --------------------------------- 0.4/2.2 MB 3.9 MB/s eta 0:00:01
   ------------ --------------------------- 0.7/2.2 MB 4.7 MB/s eta 0:00:01
   ---------------- ----------------------- 0.9/2.2 MB 4.8 MB/s eta 0:00:01
   --------------------- ------------------ 1.2/2.2 MB 4.9 MB/s eta 0:00:01
   -------------------------- ------------- 1.4/2.2 MB 5.0 MB/s eta 0:00:01
   ------------------------------- -------- 1.7/2.2 MB 5.1 MB/s eta 0:00:01
   ------------------------------------ --- 2.0

# Importing Libraries

In [12]:
import wandb

In [13]:
wandb.login()

True

In [14]:
import torch
import gc
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

In [15]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [16]:
gc.collect()
torch.cuda.empty_cache()

In [17]:
model_name = "gpt2"
dataset_name = "ArtifactAI/arxiv-math-instruct-50k"

In [18]:
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [19]:
dataset = load_dataset(dataset_name, split='train')
dataset.to_pandas().head()

Unnamed: 0,question,answer
0,What structure is classified as a definite lie...,A definite Lie algebra is a Lie algebra equipp...
1,What type of laser is used to study infrared?,The type of laser used to study infrared is a ...
2,What type of detector detects photon arrival?,A photodetector or a photon detector is a type...
3,Can a qw be made shapeless?,"No, a qw cannot be made shapeless. ""qw"" is a c..."
4,Which of the following is the only finite widt...,The only finite width of quark gluon plasma ba...


In [20]:
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 50488
})

In [21]:
q = [len(i) for i in dataset['question']]
a = [len(i) for i in dataset['answer']]
max(q), max(a)

(229, 3816)

In [22]:
def prepare_datasets(example):
    example['prompt'] = f"""<|system|>\nYou are a intelligent chatbot and expertise in Mathematics.</s>\n<|user|>\n{example['question']}.\n<|assistant|>\n{example['answer']}"""
    return example

def tokenize_datasets(dataset):
    tokenized_dataset = dataset.map(
      lambda example: tokenizer(
          example['prompt'],
          truncation=True,
          max_length=512,
          ),
      batched=True,
      remove_columns=['prompt'])
    return tokenized_dataset
        

In [23]:
dataset = dataset.map(
    prepare_datasets, remove_columns=['question', 'answer']
)
dataset = dataset.shuffle(42).select(range(50488)).train_test_split(test_size=0.1, seed=42)

In [24]:
train_dataset = dataset['train']
test_dataset = dataset['test']


In [25]:
train_dataset

Dataset({
    features: ['prompt'],
    num_rows: 45439
})

In [26]:
test_dataset

Dataset({
    features: ['prompt'],
    num_rows: 5049
})

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [28]:
train_dataset = tokenize_datasets(train_dataset)
test_dataset = tokenize_datasets(test_dataset)

In [29]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 45439
})

In [30]:
train_dataset['input_ids'][0]

[27,
 91,
 10057,
 91,
 29,
 198,
 1639,
 389,
 257,
 12661,
 8537,
 13645,
 290,
 13572,
 287,
 39448,
 25970,
 82,
 29,
 198,
 27,
 91,
 7220,
 91,
 29,
 198,
 2061,
 318,
 262,
 6376,
 44728,
 329,
 850,
 695,
 10257,
 291,
 12879,
 319,
 257,
 48048,
 30,
 13,
 198,
 27,
 91,
 562,
 10167,
 91,
 29,
 198,
 464,
 6376,
 44728,
 329,
 850,
 695,
 10257,
 291,
 12879,
 319,
 257,
 48048,
 318,
 257,
 18069,
 1255,
 326,
 3607,
 257,
 10451,
 329,
 262,
 6376,
 286,
 257,
 1728,
 1398,
 286,
 22577,
 12879,
 319,
 257,
 48048,
 13,
 22426,
 11,
 262,
 44728,
 8991,
 284,
 850,
 695,
 10257,
 291,
 12879,
 11,
 543,
 389,
 257,
 1398,
 286,
 22577,
 12879,
 326,
 389,
 19898,
 1022,
 48804,
 291,
 290,
 5328,
 78,
 695,
 10257,
 291,
 12879,
 13,
 220,
 198,
 198,
 464,
 6376,
 286,
 257,
 850,
 695,
 10257,
 291,
 10088,
 460,
 307,
 1807,
 286,
 355,
 257,
 3953,
 286,
 262,
 1271,
 286,
 9493,
 11458,
 4795,
 8136,
 284,
 663,
 3917,
 22577,
 16022,
 13,
 383,
 6376,
 44728,
 18436,


In [31]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [32]:
batch_size = 4
training_args = TrainingArguments(
    output_dir="./models/mathgpt2",
    gradient_accumulation_steps=batch_size,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    save_strategy="no",
    save_total_limit=2,
    save_safetensors=False,
    fp16=True,
    learning_rate=2e-05,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="wandb"
)

training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

In [34]:
wandb.init(
    # set the wandb project where this run will be logged
    project="math-gpt2",
)

In [35]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,2.1935
1000,1.8649
1500,1.8092
2000,1.7716
2500,1.7483
3000,1.7179
3500,1.6869
4000,1.6736
4500,1.6704
5000,1.6613


TrainOutput(global_step=8520, training_loss=1.7190809706567038, metrics={'train_runtime': 17122.4383, 'train_samples_per_second': 7.961, 'train_steps_per_second': 0.498, 'total_flos': 1.7147324302848e+16, 'train_loss': 1.7190809706567038, 'epoch': 3.0})

In [36]:
MODEL_PATH = "Sharathhebbar24/math_gpt2"
HF_TOKEN = ""

In [37]:
tokenizer.push_to_hub(
    MODEL_PATH,
    token=HF_TOKEN
)



CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/math_gpt2/commit/fb96cd111a8321f4cdcda743edfc42e6496f53bf', commit_message='Upload tokenizer', commit_description='', oid='fb96cd111a8321f4cdcda743edfc42e6496f53bf', pr_url=None, pr_revision=None, pr_num=None)

In [38]:
model.push_to_hub(
    MODEL_PATH,
    token=HF_TOKEN
)



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/math_gpt2/commit/ba1a3b58d88ba2ff13efc6639859e81a6c2114c9', commit_message='Upload model', commit_description='', oid='ba1a3b58d88ba2ff13efc6639859e81a6c2114c9', pr_url=None, pr_revision=None, pr_num=None)