In [1]:
from dotenv import load_dotenv
import os

# Load environment variables from .env.local
load_dotenv('.env.local')

# Now you can access your token
hf_token = os.getenv("HF_TOKEN")  # Replace with your variable name

In [2]:
from huggingface_hub import login

login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Load ELI5 dataset

Start by loading the first 5000 examples from the ELI5-Category dataset with the 🤗 Datasets library. This’ll give you a chance to experiment and make sure everything works before spending more time training on the full dataset.

In [3]:
from datasets import load_dataset

eli5 = load_dataset("sentence-transformers/eli5", split="train[:5000]", trust_remote_code=True)

In [4]:
eli5 = eli5.train_test_split(test_size=0.2)

In [5]:
eli5["train"][0]

{'question': "How is it someone can do 100 squats but can't run but a few miles?",
 'answer': "ELI5: bulldozers can lift tons of weight but can't go fast. They are built for one thing and they do other things not as well. Race cars can go fast but can't lift tons."}

## Preprocess

The next step is to load a DistilGPT2 tokenizer to process the text subfield:

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

In [7]:
def format_instruct(example):
    prompt = f"### Instruction:\n{example['question']}\n\n### Response:\n{example['answer']}"
    return {"text": prompt}

In [8]:
dataset = eli5.map(format_instruct)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
dataset["train"][5]

{'question': "Why didn't the U.S. include the release of prisoners in the new nuclear deal with Iran",
 'answer': 'Because that would have given leverage to Iran. By taking more prisoners and negotiating for their release too, they could have tried to extract more favorable terms in other parts of the agreement. By not including prisoner release as part of the negotiations, the U.S. removed that potential negotiating card for them to play, forcing them to negotiate only on nuclear research and sanctions and nothing else.',
 'text': "### Instruction:\nWhy didn't the U.S. include the release of prisoners in the new nuclear deal with Iran\n\n### Response:\nBecause that would have given leverage to Iran. By taking more prisoners and negotiating for their release too, they could have tried to extract more favorable terms in other parts of the agreement. By not including prisoner release as part of the negotiations, the U.S. removed that potential negotiating card for them to play, forcing t

In [10]:
tokenizer.pad_token = tokenizer.eos_token  # required for batching

In [11]:
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorForLanguageModeling, AutoModelForCausalLM

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

In [13]:
!pip install 'accelerate>=0.26.0'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./instruct-chatbot",
    # evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,3.5472
1000,3.4303
1500,3.341
2000,3.3082
2500,3.2822
3000,3.2549


TrainOutput(global_step=3000, training_loss=3.3606617024739585, metrics={'train_runtime': 3823.511, 'train_samples_per_second': 3.138, 'train_steps_per_second': 0.785, 'total_flos': 1567780503552000.0, 'train_loss': 3.3606617024739585, 'epoch': 3.0})

In [15]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "### Instruction:\nHow do airplanes fly?\n\n### Response:\n"
output = pipe(prompt, max_new_tokens=100, do_sample=True, top_p=0.95, temperature=0.7)

print(output[0]["generated_text"])

Device set to use mps:0
  test_elements = torch.tensor(test_elements)


### Instruction:
How do airplanes fly?

### Response:
Airplane is the same type of plane as the engine, and it's the same engine that holds it and the engine produces a lot of power. The plane is the same size as the engine. This makes it a little more accurate. In a plane, it is an engine that can have a lot of torque, and the engine is the same size. In a plane, it has the same weight, but it has the same engine. The engine is the same size. The engine has the
