In [1]:
# Authenticate with Weights & Biases; the cell displays the value returned by login().
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msharathhebbar[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Open a W&B run; this notebook's run is logged under the project named here.
wandb.init(project="math_gpt2_sft")

In [None]:

!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl

In [3]:
import torch
import gc
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from trl import SFTTrainer

from datasets import load_dataset



In [4]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [5]:
# Hub id of the checkpoint that is loaded and fine-tuned in this notebook.
model_name = "Sharathhebbar24/math_gpt2"
# Hub id of the training dataset (loaded below with split='train').
dataset_name = "meta-math/MetaMathQA"


In [6]:
# Load the pretrained causal-LM checkpoint, then place it on the selected device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [7]:
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [8]:
# Load the training split of the dataset.
dataset = load_dataset(dataset_name, split='train')
# Preview the first five rows. Only those rows are converted to pandas, instead of
# materialising the whole dataset as a DataFrame just to call .head() on it.
dataset.select(range(min(5, len(dataset)))).to_pandas()

Unnamed: 0,response,type,original_question,query
0,"The distance between two points $(x_1,y_1)$ an...",MATH_AnsAug,Gracie and Joe are choosing numbers on the com...,Gracie and Joe are choosing numbers on the com...
1,"Each player requires a $25 jersey, a $15.20 pa...",GSM_Rephrased,The treasurer of a football team must buy equi...,What is the total cost of purchasing equipment...
2,"To solve this problem, we need to determine th...",GSM_SV,Diego baked 12 cakes for his sister's birthday...,Diego baked 12 cakes for his sister's birthday...
3,$10101_3 = 1 \cdot 3^4 + 0 \cdot 3^3 + 1 \cdot...,MATH_AnsAug,Convert $10101_3$ to a base 10 integer.,Convert $10101_3$ to a base 10 integer.
4,"We know that every 30 minutes, a machine produ...",GSM_FOBAR,"Sue works in a factory and every 30 minutes, a...","Sue works in a factory and every 30 minutes, a..."


In [9]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['response', 'type', 'original_question', 'query'],
    num_rows: 395000
})

In [10]:
def prepare_datasets(example):
    """Attach a chat-style training prompt to a single dataset record.

    Args:
        example: mapping with at least the keys 'query' and 'response'.

    Returns:
        The same `example` object, mutated in place with a new 'prompt' key that
        holds the system / user / assistant template filled with the query and
        the response.

    NOTE(review): the template is kept exactly as it was written, so every line
    after the first starts with four spaces, a '.' is appended after the query,
    and '</s>' is emitted as literal text. Any change here must be mirrored in
    the prompt used at inference time.
    """
    question = example['query']
    answer = example['response']
    example['prompt'] = (
        "<|system|>\n"
        "    You are a intelligent chatbot and expertise in Mathematics.</s>\n"
        "    <|user|>\n"
        f"    {question}.\n"
        "    <|assistant|>\n"
        f"    {answer}"
    )
    return example

def tokenize_datasets(dataset):
    """Tokenize the 'prompt' column of `dataset` and drop the raw text column.

    Relies on the notebook-level `tokenizer`, which is looked up when the mapping
    actually runs. Prompts are truncated to at most 512 tokens.

    Args:
        dataset: object exposing a `.map(...)` method and a 'prompt' column.

    Returns:
        Whatever `dataset.map` returns: the tokenizer outputs without 'prompt'.
    """
    def _encode(batch):
        # Batched call: batch['prompt'] holds the prompts of one batch.
        return tokenizer(batch['prompt'], truncation=True, max_length=512)

    return dataset.map(_encode, batched=True, remove_columns=['prompt'])
        

In [11]:
# Build the 'prompt' column and drop the raw columns it was derived from.
prompt_dataset = dataset.map(
    prepare_datasets, remove_columns=['query', 'response', "original_question", "type"]
)
# Shuffle with a fixed seed, keep every row, and hold out 10% for evaluation.
# The row count comes from the dataset itself rather than a hard-coded 395000,
# so the cell keeps working if the upstream dataset grows or shrinks. Lower the
# range to train on a subset.
# NOTE: `dataset` is rebound here from the raw Dataset to the train/test split,
# so this cell cannot be re-run without first re-running the loading cell.
dataset = (
    prompt_dataset
    .shuffle(seed=42)
    .select(range(len(prompt_dataset)))
    .train_test_split(test_size=0.1, seed=42)
)

In [12]:
# Unpack the two halves produced by train_test_split.
train_dataset, test_dataset = dataset['train'], dataset['test']


In [13]:
# Load the tokenizer published with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer defines no pad token of its own -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

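Build the data collator for causal language modeling (`mlm=False`). Note: this notebook fine-tunes all of the model's weights; no QLoRA quantization or adapter configuration is set up anywhere in it.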

In [14]:
# Collator for causal language modeling (mlm=False selects the non-masked objective).
# NOTE(review): this object is only displayed here; the SFTTrainer call later in the
# notebook is not given a data_collator argument, so this instance is never used.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='Sharathhebbar24/math_gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [15]:
# Per-device batch size for both training and evaluation.
batch_size = 2
# Total number of optimizer steps; overrides num_train_epochs.
max_steps = 100
# Number of micro-batches accumulated per optimizer step. Kept as its own knob
# (it previously reused `batch_size`) so the two can be tuned independently.
gradient_accumulation_steps = 2

training_arguments = TrainingArguments(
    output_dir="./models/mathgpt2sft/",
    gradient_accumulation_steps=gradient_accumulation_steps,
    # NOTE(review): newer transformers releases rename this to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    log_level="debug",
    # No checkpoints are written during training, so save_total_limit has no
    # effect while this stays "no".
    save_strategy="no",
    save_total_limit=2,
    save_safetensors=False,
    # Mixed precision needs a GPU. The notebook falls back to CPU when CUDA is
    # missing, so only request fp16 when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    learning_rate=2e-5,
    # Each evaluation runs over the entire eval set passed to the trainer.
    eval_steps=50,
    max_steps=max_steps,
    warmup_steps=30,
    lr_scheduler_type="cosine",
)

Start training:

In [16]:
# Supervised fine-tuning on the raw 'prompt' text column; sequences are capped at
# 512 tokens via max_seq_length.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    max_seq_length=512,
)

# Run the training loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Map:   0%|          | 0/39500 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 4
***** Running training *****
  Num examples = 355,500
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Training with DataParallel so batch size has been adjusted to: 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 100
  Number of trainable parameters = 124,439,808
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,1.9567,1.55452
100,1.5767,1.457796


***** Running Evaluation *****
  Num examples = 39500
  Batch size = 4
***** Running Evaluation *****
  Num examples = 39500
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=1.7666657257080078, metrics={'train_runtime': 2866.1971, 'train_samples_per_second': 0.279, 'train_steps_per_second': 0.035, 'total_flos': 151453435392000.0, 'train_loss': 1.7666657257080078, 'epoch': 0.0})

In [None]:
# new_model = "Sharathhebbar24/math_gpt2_sft"
# model.push_to_hub(new_model, use_auth_token=True, use_temp_dir=False)
# tokenizer.push_to_hub(new_model, use_auth_token=True, use_temp_dir=False)

In [17]:
import os

# Hub repository that receives the fine-tuned tokenizer and model.
MODEL_PATH = "Sharathhebbar24/math_gpt2_sft"

# Never store an access token in the notebook: read it from the environment.
# When HF_TOKEN is unset or empty this is None, and push_to_hub then relies on
# the credentials cached by `huggingface-cli login`.
# SECURITY: the token that used to be hard-coded in this cell has been exposed
# wherever this notebook was shared or committed and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

In [18]:
# Upload the tokenizer files to the Hub repository named by MODEL_PATH.
tokenizer.push_to_hub(MODEL_PATH, token=HF_TOKEN)

tokenizer config file saved in /tmp/tmp8zaj2cii/tokenizer_config.json
Special tokens file saved in /tmp/tmp8zaj2cii/special_tokens_map.json
Uploading the following files to Sharathhebbar24/math_gpt2_sft: merges.txt,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json


CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/math_gpt2_sft/commit/a3f85025df3dc52882fe247e041cacd5110aac25', commit_message='Upload tokenizer', commit_description='', oid='a3f85025df3dc52882fe247e041cacd5110aac25', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
# Upload the fine-tuned model weights and configuration to the same Hub repository.
model.push_to_hub(MODEL_PATH, token=HF_TOKEN)

Configuration saved in /tmp/tmp286ldlnt/config.json
Configuration saved in /tmp/tmp286ldlnt/generation_config.json
Model weights saved in /tmp/tmp286ldlnt/pytorch_model.bin
Uploading the following files to Sharathhebbar24/math_gpt2_sft: generation_config.json,config.json,model.safetensors


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/math_gpt2_sft/commit/6ce82684e3332dcbc73a00be4382a31986aa8442', commit_message='Upload model', commit_description='', oid='6ce82684e3332dcbc73a00be4382a31986aa8442', pr_url=None, pr_revision=None, pr_num=None)