<a href="https://colab.research.google.com/github/RobinSmits/Dutch-LLMs/blob/main/PolyLM_1_7B_Alpaca_Clean_Dutch_Qlora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive (Colab-only: relies on the google.colab package)
import os
from google.colab import drive
drive.mount('/content/drive')

# Set Folder to use...
# The cached train/validation splits are read from this folder in a later cell.
# Keep the trailing slash: paths are built as f'{WORK_DIR}<name>'.
WORK_DIR = '/content/drive/My Drive/LLM/PolyLM1_7BAlpacaCleanDutchQlora/'
os.makedirs(WORK_DIR, exist_ok = True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install required packages
!pip install -q accelerate
!pip install -q sentencepiece
!pip install -q bitsandbytes
!pip install -q transformers
!pip install -q peft
!pip install -q datasets

In [3]:
# Import Modules
from datasets import load_dataset, load_from_disk
from huggingface_hub import notebook_login
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
import torch
import transformers

In [4]:
# Hugging Face Hub login (interactive widget).
# The training cell further down pushes the model and tokenizer to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Set Name Constants
# model_name: base checkpoint used for the tokenizer, config and model.
model_name = "DAMO-NLP-MT/polylm-1.7b"
# hf_model_name: used as the Trainer output_dir, the hub_model_id and the tokenizer push target.
hf_model_name = 'polylm_1.7b_ft_alpaca_clean_dutch'

In [6]:
# Create Tokenizer (slow implementation, non-legacy behaviour)
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          use_fast = False,
                                          legacy = False)

# Reuse the EOS token as the padding token.
# NOTE(review): the training cell uses DataCollatorForLanguageModeling(mlm = False), which
# is documented to ignore padding positions in the labels. With pad == eos the EOS token
# appended during tokenization is presumably excluded from the loss as well - confirm
# this is intended, or pad with a different token.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Max Length (in tokens) used when truncating the tokenized prompts
MAX_LEN = 512

# Tokenizer Summary
print(tokenizer)

# Special Tokens: first the ids...
print(f'pad_token_id: {tokenizer.pad_token_id}')
print(f'bos_token_id: {tokenizer.bos_token_id}')
print(f'eos_token_id: {tokenizer.eos_token_id}')

# ...then the decoded token strings. These lines were previously labelled '*_token_id'
# although they print the token text, which made the two groups indistinguishable.
print(f'pad_token: {tokenizer.decode(tokenizer.pad_token_id)}')
print(f'bos_token: {tokenizer.decode(tokenizer.bos_token_id)}')
print(f'eos_token: {tokenizer.decode(tokenizer.eos_token_id)}')

LlamaTokenizer(name_or_path='DAMO-NLP-MT/polylm-1.7b', vocab_size=256000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '</s>'}, clean_up_tokenization_spaces=False)
pad_token_id: 2
bos_token_id: 1
eos_token_id: 2
pad_token_id: </s>
bos_token_id: <s>
eos_token_id: </s>


In [7]:
# Create Config
# Load the base checkpoint's configuration with use_cache forced to False.
# NOTE(review): presumably disabled because gradient checkpointing is enabled on the
# model in the next cell - confirm.
config = AutoConfig.from_pretrained(model_name,
                                    use_cache = False)

# Summary
print(config)

GPT2Config {
  "_name_or_path": "DAMO-NLP-MT/polylm-1.7b",
  "activation_function": "gelu_fast",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.0,
  "bos_token_id": 255999,
  "embd_pdrop": 0.0,
  "eos_token_id": 255999,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 2048,
  "n_head": 16,
  "n_inner": 8192,
  "n_layer": 24,
  "n_positions": 2048,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.0,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.0,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "tokenizer_class": "AutoTokenizer",
  "transformers_version": "4.31.0",
  "use_cache": false,
  "vocab_size": 256000
}



In [8]:
# Quantization settings: 4-bit NF4 weights with double quantization, bfloat16 compute dtype
bnb_config = BitsAndBytesConfig(load_in_4bit = True,
                                bnb_4bit_quant_type = 'nf4',
                                bnb_4bit_use_double_quant = True,
                                bnb_4bit_compute_dtype = torch.bfloat16)

# Create Model: load the base checkpoint with the config created above,
# mapping the root module ("") to device 0
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             config = config,
                                             quantization_config = bnb_config,
                                             device_map = {"":0})

# Enable Gradient Checkpointing
model.gradient_checkpointing_enable()

# Show Model Summary
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(256000, 2048)
    (wpe): Embedding(2048, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=2048, out_features=6144, bias=True)
          (c_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (c_proj): Linear4bit(in_features=8192, out_features=2048, bias=True)
          (act): FastGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e-05, eleme

In [9]:
# Prep the quantized base model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA settings: rank 8, alpha 16, dropout 0.05, no bias training, causal LM task.
# No target_modules are given; the printed summary shows adapters on the attention c_attn layers.
loraconfig = LoraConfig(task_type = TaskType.CAUSAL_LM,
                        r = 8,
                        lora_alpha = 16,
                        lora_dropout = 0.05,
                        bias = 'none',
                        fan_in_fan_out = True)

# Wrap the base model with the LoRA adapters and report the trainable parameter count
model = get_peft_model(model, loraconfig)
model.print_trainable_parameters()

# Show Model Summary
print(model)

trainable params: 1,572,864 || all params: 1,134,678,016 || trainable%: 0.13861764992545691
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(256000, 2048)
        (wpe): Embedding(2048, 2048)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPT2Block(
            (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear4bit(
                in_features=2048, out_features=6144, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=6144, bias=False)
                )
         

In [10]:
"""
# Alpaca Translated
datasets = load_dataset('BramVanroy/alpaca-cleaned-dutch')

# Summary
print(datasets)

# Validation size
VAL_SIZE = 2048

# Split in Train and Val datasets
dataset_split = datasets["train"].train_test_split(test_size = VAL_SIZE,
                                                   shuffle = True,
                                                   seed = 42)

# Train and Val datasets
train_data = dataset_split["train"]
val_data = dataset_split["test"]

# Save for later use
val_data.save_to_disk(f'{WORK_DIR}val_data')
train_data.save_to_disk(f'{WORK_DIR}train_data')
"""

# Load the Train and Validation sets cached in WORK_DIR by an earlier run. When they are
# missing (e.g. a fresh Drive folder) rebuild them, so the notebook survives
# "Restart & Run All" without manually enabling the snippet kept in the string literal above.
VAL_SIZE = 2048     # number of examples held out for validation
SPLIT_SEED = 42     # fixed seed so a rebuilt split is reproducible

val_data_path = f'{WORK_DIR}val_data'
train_data_path = f'{WORK_DIR}train_data'

if os.path.isdir(val_data_path) and os.path.isdir(train_data_path):
    # Reuse the cached splits
    val_data = load_from_disk(val_data_path)
    train_data = load_from_disk(train_data_path)
else:
    # Alpaca Translated
    raw_datasets = load_dataset('BramVanroy/alpaca-cleaned-dutch')
    print(raw_datasets)

    # Split in Train and Val datasets
    dataset_split = raw_datasets["train"].train_test_split(test_size = VAL_SIZE,
                                                           shuffle = True,
                                                           seed = SPLIT_SEED)
    train_data = dataset_split["train"]
    val_data = dataset_split["test"]

    # Save for later use
    val_data.save_to_disk(val_data_path)
    train_data.save_to_disk(train_data_path)

In [11]:
# Model Training Prompt
def generate_training_prompt(data_point):
    """Build the Dutch Alpaca-style training prompt for one example.

    Args:
        data_point: mapping with the keys "instruction", "input" and "output".

    Returns:
        A string starting with a newline that contains the "### Instructie:" section,
        an "### Invoer:" section only when data_point["input"] is truthy, and the
        "### Antwoord:" section. Sections are separated by a blank line.
    """
    has_input = bool(data_point["input"])

    sections = [f'### Instructie:\n{data_point["instruction"]}']
    if has_input:
        sections.append(f'### Invoer:\n{data_point["input"]}')
    sections.append(f'### Antwoord:\n{data_point["output"]}')

    return "\n" + "\n\n".join(sections)

In [12]:
# Print the formatted training prompt of the first six training examples
for sample_no, sample in enumerate(train_data):
    if sample_no == 6:
        break

    print('\n=======================================================================')
    print(generate_training_prompt(sample))



### Instructie:
Maak een zin met 5 uitdrukkingen.

### Antwoord:
Hij was zo boos dat hij stond te trappelen, op het verkeerde spoor zat, een appeltje te schillen had, te vroeg was begonnen, en zijn woorden moest inslikken.


### Instructie:
Voeg 3 interessante feiten over Colombia toe.

### Antwoord:
1. Colombia heeft na Brazilië de op één na grootste biodiversiteit ter wereld, met meer dan 79.000 diersoorten en 40.000 plantensoorten.
2. Colombia produceert meer koffie dan welk ander land ter wereld dan ook.
3. Colombia is het enige Zuid-Amerikaanse land met een kustlijn aan zowel de Stille Oceaan als de Caribische Zee.


### Instructie:
Geef een voorbeeld van een gebruikstoepassing voor Machine Learning

### Antwoord:
Een voorbeeld van een gebruikstoepassing voor Machine Learning is een spamfilteringsysteem. Het maakt gebruik van ML-algoritmen om spam-e-mails te detecteren en uit de inbox van een gebruiker te filteren.


### Instructie:
Herschrijf de zin om de wezelwoorden te vermij

In [13]:
# Custom Tokenize method
def tokenize(prompt):
    """Append the EOS token to `prompt` and tokenize it.

    Args:
        prompt: the full training prompt text.

    Returns:
        The tokenizer's encoding for the prompt (including "input_ids"), truncated to at
        most MAX_LEN tokens. When truncation kicks in, the appended EOS token is among
        the tokens cut off at the end.
    """
    # Calling the tokenizer directly is the documented replacement for the
    # deprecated encode_plus and takes the same arguments.
    tokenized_prompt = tokenizer(prompt + tokenizer.eos_token,
                                 truncation = True,
                                 max_length = MAX_LEN,
                                 add_special_tokens = True)

    return tokenized_prompt

# Fixed seed so the shuffled order, and therefore the training run, is reproducible
SHUFFLE_SEED = 42

# Tokenize Train Data
train_data = train_data.shuffle(seed = SHUFFLE_SEED).map(lambda x: tokenize(generate_training_prompt(x)))

# Tokenize Val Data
val_data = val_data.shuffle(seed = SHUFFLE_SEED).map(lambda x: tokenize(generate_training_prompt(x)))

Map:   0%|          | 0/49664 [00:00<?, ? examples/s]

Map:   0%|          | 0/2048 [00:00<?, ? examples/s]

In [14]:
# Show the first four tokenized training examples (original columns plus tokenizer output)
for position, sample in enumerate(train_data):
    if position == 4:
        break

    print(sample)

{'id': 49818, 'instruction': 'Doe een voorstel voor een plausibele oplossing voor het gegeven probleem.', 'input': 'Overbevolking leidt tot een verhoogde druk op hulpbronnen', 'output': 'Een mogelijke oplossing voor het probleem van overbevolking is om mensen aan te moedigen duurzaam te leven en hun verbruik van hulpbronnen te verminderen. Dit omvat het verminderen van afval en recycling, het gebruik van hernieuwbare energiebronnen en het investeren in voorlichting en gezondheidszorg voor gezinsplanning.', 'input_ids': [460, 213, 88334, 196908, 1725, 475, 213, 8053, 544, 1648, 460, 199073, 3222, 1648, 230883, 20769, 121096, 3222, 1870, 159984, 110161, 461, 213, 213, 88334, 643, 122953, 475, 213, 29458, 3437, 151436, 719, 119869, 504, 465, 504, 1648, 2071, 145010, 1294, 59684, 1772, 121073, 4800, 122133, 213, 213, 88334, 47021, 163437, 475, 213, 1251, 630, 229423, 121096, 3222, 1870, 110161, 1337, 676, 3437, 151436, 719, 478, 4752, 32317, 4924, 1374, 15059, 793, 14695, 217942, 1374, 579

In [15]:
# Evaluate every 128 optimizer steps, log every 16
eval_steps = 128
logging_steps = 16

# Training hyper-parameters.
# Effective batch size per device: 8 samples x 8 accumulation steps.
training_args = transformers.TrainingArguments(output_dir = hf_model_name,
                                               num_train_epochs = 2,
                                               learning_rate = 1.0e-4,
                                               warmup_steps = 64,
                                               per_device_train_batch_size = 8,
                                               per_device_eval_batch_size = 8,
                                               gradient_accumulation_steps = 8,
                                               bf16 = True,
                                               optim = "paged_adamw_8bit",
                                               evaluation_strategy = "steps",
                                               eval_steps = eval_steps,
                                               logging_steps = logging_steps,
                                               save_strategy = "epoch",
                                               save_total_limit = 3,
                                               push_to_hub = True,
                                               hub_model_id = hf_model_name,
                                               hub_private_repo = True,
                                               report_to = "tensorboard")

# Collator for causal language modeling (mlm = False)
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer,
                                                             mlm = False)

# Config Trainer
trainer = transformers.Trainer(model = model,
                               args = training_args,
                               train_dataset = train_data,
                               eval_dataset = val_data,
                               data_collator = data_collator)

# Perform Training
trainer.train()

# Push model to hub
trainer.push_to_hub()

# Push tokenizer to hub (kept as the last expression so the CommitInfo is displayed)
tokenizer.push_to_hub(hf_model_name)

/content/polylm_1.7b_ft_alpaca_clean_dutch is already a clone of https://huggingface.co/robinsmits/polylm_1.7b_ft_alpaca_clean_dutch. Make sure you pull the latest changes with `repo.git_pull()`.


Step,Training Loss,Validation Loss
128,2.1248,2.112861
256,2.0512,2.034694
384,1.9983,1.994805
512,1.9557,1.9655
640,1.9583,1.938586
768,1.916,1.917731
896,1.8671,1.901885
1024,1.8626,1.888485
1152,1.8321,1.876165
1280,1.8596,1.863079


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file adapter_model.bin:   1%|          | 32.0k/6.02M [00:00<?, ?B/s]

Upload file runs/Jul24_13-52-56_10a640161224/events.out.tfevents.1690206779.10a640161224.22353.0: 100%|#######…

To https://huggingface.co/robinsmits/polylm_1.7b_ft_alpaca_clean_dutch
   c0fde9b..fc52456  main -> main

   c0fde9b..fc52456  main -> main

To https://huggingface.co/robinsmits/polylm_1.7b_ft_alpaca_clean_dutch
   fc52456..5eb7542  main -> main

   fc52456..5eb7542  main -> main



CommitInfo(commit_url='https://huggingface.co/robinsmits/polylm_1.7b_ft_alpaca_clean_dutch/commit/0cca57cd7f842deecae5edd4f00cde674d22bafa', commit_message='Upload tokenizer', commit_description='', oid='0cca57cd7f842deecae5edd4f00cde674d22bafa', pr_url=None, pr_revision=None, pr_num=None)