In [None]:
!pip install transformers[torch] datasets

Collecting transformers[torch]
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from tr



*  Dataset
*  Tokenizer
*  Model

In [None]:
# Fetch the Stanford Alpaca repository; its alpaca_data.json file is loaded as the training data below.
!git clone https://github.com/tatsu-lab/stanford_alpaca

Cloning into 'stanford_alpaca'...
remote: Enumerating objects: 129, done.[K
remote: Total 129 (delta 0), reused 0 (delta 0), pack-reused 129[K
Receiving objects: 100% (129/129), 9.15 MiB | 8.55 MiB/s, done.
Resolving deltas: 100% (62/62), done.


In [None]:
# List the cloned repository to confirm that alpaca_data.json is present.
!ls stanford_alpaca/

alpaca_data.json	 LICENSE	   train.py
assets			 model_card.md	   utils.py
configs			 prompt.txt	   WEIGHT_DIFF_LICENSE
DATA_LICENSE		 README.md	   weight_diff.py
datasheet.md		 requirements.txt
generate_instruction.py  seed_tasks.jsonl


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from rich import print

In [None]:
# Checkpoint to fine-tune; uncomment the alternative below to switch base models.
model_name = "distilgpt2"
#model_name = "gpt2"

# Load the tokenizer and reuse its end-of-sequence token as the padding token
# (presumably because this checkpoint's tokenizer ships without one).
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained causal language model that will be fine-tuned.
model = AutoModelForCausalLM.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

# The instruction data is a single JSON file inside the cloned repository.
alpaca_data_path = "stanford_alpaca/alpaca_data.json"
alpaca_dataset = load_dataset("json", data_files=alpaca_data_path)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Shuffle with a fixed seed so the example order is the same on every run.
shuffled_alpaca_dataset = alpaca_dataset.shuffle(seed=42)

In [None]:
# Training prompt for records that carry an extra "input" field.
# Every line after the first starts with a single space; that is part of the template text.
prompt_template_with_input = (
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n "
    "### Instruction:\n "
    "{instruction}\n "
    "### Input:\n "
    "{input}\n "
    "### Response:\n "
    "{response}"
)

# Training prompt for records whose "input" field is empty.
prompt_template_without_input = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n "
    "### Instruction:\n "
    "{instruction}\n "
    "### Response:\n "
    "{response}"
)

def add_content(example):
  """Render one record as a full training prompt terminated by the end-of-sequence token.

  Args:
    example: Mapping with "instruction", "input" and "output" entries.

  Returns:
    A dict with a single "prompt" key. The template with an input section is
    used when example["input"] is truthy, otherwise the shorter template.
  """
  fields = {"instruction": example["instruction"], "response": example["output"]}
  if example["input"]:
    template = prompt_template_with_input
    fields["input"] = example["input"]
  else:
    template = prompt_template_without_input
  return {"prompt": template.format(**fields) + tokenizer.eos_token}

In [None]:
# Add a "prompt" column holding the fully formatted training text for every record.
alpaca_train_prompt = shuffled_alpaca_dataset.map(add_content)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
# Inspect one formatted prompt to check the template rendering.
print(alpaca_train_prompt["train"][10]["prompt"])

In [None]:
def tokenizer_function(example):
  """Tokenize a batch of prompts for causal language-model fine-tuning.

  Args:
    example: Batch passed by `map(..., batched=True)`; its "prompt" entry
      holds the prompt strings to encode.

  Returns:
    The tokenizer output for the batch, as variable-length Python lists
    truncated to the tokenizer's maximum length.
  """
  # No padding, tensor conversion or device placement here: `map` serializes the
  # result into the dataset, so padding to the longest prompt of every map batch only
  # inflates it, and moving tensors to "cuda" makes preprocessing fail on machines
  # without a GPU. The data collator pads each training batch, and the Trainer
  # moves batches to the training device.
  return tokenizer(example["prompt"], truncation=True)

In [None]:
# Tokenize all prompts in batches and drop the original text columns in the same
# pass, so that only the tokenizer outputs remain for training.
train_tokenized_data = alpaca_train_prompt.map(
    tokenizer_function,
    batched=True,
    remove_columns=alpaca_train_prompt["train"].column_names,
)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
# mlm=False configures the collator for causal (next-token) language modeling instead of masked LM.
# NOTE(review): pad_token was set to eos_token when the tokenizer was loaded; if the collator masks
# labels by pad token id, the end-of-sequence token appended to every prompt is probably excluded
# from the loss as well — confirm against the collator implementation.
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration. Checkpoints and logs are written below "ft-ds".
args = TrainingArguments(
    output_dir="ft-ds",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    #evaluation_strategy="steps",
    #eval_steps=1_0,
    logging_steps=100,
    # 8 samples per device x 8 accumulation steps = 64 samples per optimizer step.
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=100,
    # A checkpoint is written every 100 steps; without a limit a full run leaves
    # dozens of checkpoints on disk, so keep only the most recent ones.
    save_total_limit=3,
    # Mixed-precision training; requires a CUDA device.
    fp16=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    # Train on the first 20000 shuffled examples; the rest stay unseen for spot checks.
    train_dataset=train_tokenized_data["train"].select(range(0,20000)),
    # eval_dataset=train_tokenized_data["test"].select(range(0,100)),
)


In [148]:
# Run fine-tuning; the returned TrainOutput reports the step count, the training loss and runtime metrics.
trainer.train()

Step,Training Loss
100,0.6701
200,0.6798
300,0.7006
400,0.6902
500,0.724
600,0.7729
700,0.7284
800,0.7703
900,0.8236
1000,0.789


TrainOutput(global_step=3120, training_loss=0.6090948895002023, metrics={'train_runtime': 2857.2796, 'train_samples_per_second': 69.997, 'train_steps_per_second': 1.092, 'total_flos': 4.022572021501133e+16, 'train_loss': 0.6090948895002023, 'epoch': 9.98})

In [149]:
from transformers import pipeline
from rich import print
import warnings
# Suppress every Python warning from here on, presumably to keep the generation output readable.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings — consider narrowing it.
warnings.filterwarnings("ignore")

In [150]:
# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline on the GPU.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device="cuda",
)

In [151]:
# Inference prompts: same text as the training templates, cut off right after the
# "### Response:" header so that the model writes the answer itself.
#
# Both templates end with exactly "### Response:\n" and no trailing space. In the training
# text the response is preceded by a space, which the GPT-2 style tokenizer attaches to the
# first response token; a prompt ending in a bare space would instead leave a dangling
# whitespace token that never appears at that position during training.
prompt_template_with_input_no_response = (
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n "
    "### Instruction:\n "
    "{instruction}\n "
    "### Input:\n "
    "{input}\n "
    "### Response:\n"
)

prompt_template_without_input_no_response = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n "
    "### Instruction:\n "
    "{instruction}\n "
    "### Response:\n"
)

# Backward-compatible alias: the original, misspelled name is still used elsewhere in the notebook.
prompt_template_without_input_no_repsonse = prompt_template_without_input_no_response

In [152]:
def generate_prompt(example):
  """Build the inference prompt for one record, leaving the response section empty.

  Args:
    example: Mapping with "instruction" and "input" entries.

  Returns:
    The prompt string. The template with an input section is used when
    example["input"] is truthy, otherwise the instruction-only template.
  """
  if not example["input"]:
    return prompt_template_without_input_no_repsonse.format(instruction=example["instruction"])
  return prompt_template_with_input_no_response.format(
      instruction=example["instruction"],
      input=example["input"],
  )

In [163]:
# Index 22000 lies beyond the first 20000 shuffled examples that were selected for training,
# so this record was not seen during fine-tuning.
index = 22000
# NOTE(review): `input` shadows the Python builtin of the same name; later cells read this variable.
input = generate_prompt(shuffled_alpaca_dataset["train"][index])
print(input)
# Print the full training-style prompt, which includes the reference response, for comparison.
print(add_content(shuffled_alpaca_dataset["train"][index])["prompt"])

In [164]:
# Sample a completion from the fine-tuned model and print only the text that follows the prompt.
finetuned_outputs = pipe(
    input,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.5,
    top_p=0.9,
)
finetuned_text = finetuned_outputs[0]["generated_text"]
print(finetuned_text.split(input)[1])

In [None]:
# Baseline for comparison: load the untouched pretrained checkpoint that fine-tuning started
# from (`model_name`) instead of a hard-coded "gpt2", so both pipelines share the same base model.
non_finetuned_pipe = pipeline(
    "text-generation", model_name, device="cuda"
)

In [None]:
# Generate from the baseline with the same token budget as the fine-tuned run; a budget of
# two tokens cannot hold an instruction response. Passing pad_token_id avoids the
# "Setting `pad_token_id` to `eos_token_id`" notice on every call.
baseline_outputs = non_finetuned_pipe(
    input,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.5,
)
print("Non fine-tuned Prediction: %s"%baseline_outputs[0]["generated_text"].split(input)[1])

# `label` was never assigned in this notebook; the reference answer for the selected
# record is its "output" field.
label = shuffled_alpaca_dataset["train"][index]["output"]
print("Label: %s"%label)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [158]:
# Save the tokenizer next to the weights so that "output_folder" can be reloaded on its own.
model.save_pretrained("output_folder")
tokenizer.save_pretrained("output_folder")