<a href="https://colab.research.google.com/github/RobinSmits/Dutch-LLMs/blob/main/PolyLM_13B_Alpaca_Clean_Dutch_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Standard library first, then the Colab-specific helper.
import os

from google.colab import drive

# Attach Google Drive at /content/drive.
drive.mount('/content/drive')

# Working folder on Drive. The trailing slash matters: later cells build
# paths by plain string concatenation (e.g. f'{WORK_DIR}val_data').
WORK_DIR = '/content/drive/My Drive/LLM/PolyLM13BAlpacaCleanDutchQlora/'

# Create the folder if needed; nothing happens when it already exists.
os.makedirs(name = WORK_DIR, exist_ok = True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install required packages
!pip install -q accelerate
!pip install -q sentencepiece
!pip install -q bitsandbytes
!pip install -q transformers
!pip install -q peft
!pip install -q datasets

In [3]:
# Import Modules
from datasets import load_dataset, load_from_disk
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

In [4]:
# Set Model Name Constant
# This identifier is passed to `from_pretrained` for both the tokenizer and the
# PEFT model in the cells below, so both are always loaded from the same source.
model_name = "robinsmits/polylm_13b_ft_alpaca_clean_dutch"

In [5]:
# Create Tokenizer
# The slow (non-fast) tokenizer is requested, with the non-legacy behaviour.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast = False,
    legacy = False,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Tokenizer Summary
print(tokenizer)

# Special Tokens: first the numeric ids, then the text each id decodes to.
# Both passes label their lines with the attribute name.
special_token_attrs = ('pad_token_id', 'bos_token_id', 'eos_token_id')
for token_attr in special_token_attrs:
    print(f'{token_attr}: {getattr(tokenizer, token_attr)}')
for token_attr in special_token_attrs:
    print(f'{token_attr}: {tokenizer.decode(getattr(tokenizer, token_attr))}')

LlamaTokenizer(name_or_path='robinsmits/polylm_13b_ft_alpaca_clean_dutch', vocab_size=256000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '</s>'}, clean_up_tokenization_spaces=False)
pad_token_id: 2
bos_token_id: 1
eos_token_id: 2
pad_token_id: </s>
bos_token_id: <s>
eos_token_id: </s>


In [6]:
# Create Model
# Loading options are gathered in one mapping and unpacked into the call:
# automatic device placement, 4-bit loading and bfloat16 as the torch dtype.
model_load_options = {
    "device_map": "auto",
    "load_in_4bit": True,
    "torch_dtype": torch.bfloat16,
}
model = AutoPeftModelForCausalLM.from_pretrained(model_name, **model_load_options)

# Summary
print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(256000, 5120)
        (wpe): Embedding(2048, 5120)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-39): 40 x GPT2Block(
            (ln_1): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear4bit(
                in_features=5120, out_features=15360, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=15360, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDic

I am using the same validation set that was used in the training notebook.

This allows for some further analysis in this Inference notebook without data leakage.

The required train and validation files are stored in my GitHub account. Note that the original code used to generate the train and validation datasets is kept in the commented-out section.

In [7]:
"""
# Alpaca Translated
datasets = load_dataset('BramVanroy/alpaca-cleaned-dutch')

# Summary
print(datasets)

# Validation size
VAL_SIZE = 2048

# Split in Train and Val datasets
dataset_split = datasets["train"].train_test_split(test_size = VAL_SIZE,
                                                   shuffle = True,
                                                   seed = 42)

# Train and Val datasets
train_data = dataset_split["train"]
val_data = dataset_split["test"]

# Save for later use
val_data.save_to_disk(f'{WORK_DIR}val_data')
train_data.save_to_disk(f'{WORK_DIR}train_data')
"""

# Load Validation set from earlier training run...use code above to generate new train and val datasets if required
val_data = load_from_disk(f'{WORK_DIR}val_data')

In [8]:
def generate_eval_prompt(item):
    """Build the Dutch Alpaca-style evaluation prompt for one example.

    Args:
        item: Mapping with an "instruction" entry and an "input" entry. A
            falsy "input" (empty string, None, ...) means the example has no
            additional context.

    Returns:
        The prompt text: a leading newline, the '### Instructie:' section, an
        '### Invoer:' section only when "input" is truthy, and a trailing
        '### Antwoord:' header for the model to continue from.
    """
    extra_input = item["input"]

    # Sections are joined with a newline; each section that ends in '\n'
    # therefore produces the blank line that separates it from the next one.
    sections = [f'\n### Instructie:\n{item["instruction"]}\n']
    if extra_input:
        sections.append(f'### Invoer:\n{extra_input}\n')
    sections.append('### Antwoord:')

    return '\n'.join(sections)

In [9]:
def generate(instruction, input = None):
    """Generate an answer for one instruction and print the prompt and the answer.

    Uses the notebook-level `tokenizer`, `model` and `generate_eval_prompt`.

    Args:
        instruction: Instruction text for the '### Instructie:' section.
        input: Optional additional context for the '### Invoer:' section; a
            falsy value leaves that section out of the prompt.

    Returns:
        None. The prompt is printed, followed by the generated answer or
        '<< NO ANSWER GENERATED >>' when the answer is empty.
    """
    # Generate Prompt
    prompt = generate_eval_prompt({'instruction': instruction, 'input': input})

    # Tokenize and move the tensors to the device the model lives on. The model
    # is placed with device_map = "auto", so CUDA is not assumed here.
    inputs = tokenizer(prompt, return_tensors = "pt").to(model.device)
    prompt_length = inputs.input_ids.shape[1]

    # Generate output
    # `length_penalty` is intentionally not passed: it is only used by
    # beam-based generation, while this call samples with a single beam.
    outputs = model.generate(input_ids = inputs.input_ids,
                             attention_mask = inputs.attention_mask,
                             max_new_tokens = 128,
                             do_sample = True,
                             top_p = 0.85,
                             top_k = 50,
                             temperature = 0.5,
                             repetition_penalty = 1.2,
                             num_return_sequences = 1,
                             pad_token_id = tokenizer.eos_token_id,
                             forced_eos_token_id = tokenizer.eos_token_id)

    # Decode only the newly generated tokens. The returned sequence starts with
    # the prompt tokens, so they are sliced off by position; this does not rely
    # on the decoded text reproducing the prompt string character for character.
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True)

    # Print prompt and answer
    print(prompt)
    if answer.strip():
        print(answer)
    else:
        print('<< NO ANSWER GENERATED >>')

# Validation Loop
# Number of validation examples to run through the model. The value 51 keeps
# the behaviour of the previous manual counter, which stopped once it exceeded
# 50 and therefore covered the examples numbered 0 through 50.
NUM_EXAMPLES = 51

for counter, item in enumerate(val_data):
    # Stop before generating anything for the first example past the limit.
    if counter >= NUM_EXAMPLES:
        break

    print(f'\n\n=== Voorbeeld: {counter} ======================================================================================')
    generate(item['instruction'], item['input'])




### Instructie:
Wat zijn de drie belangrijkste softwareonderdelen die worden gebruikt bij webontwikkeling?

### Antwoord:

De drie belangrijkste softwareonderdelen die worden gebruikt bij webontwikkeling, zijn HTML (HyperText Markup Language), CSS (Cascading Style Sheets) en JavaScript. Deze onderdelen stellen gebruikers in staat om inhoud op een website te creëren of aanpassen met behulp van codering. Bovendien kunnen ze interactieve elementen zoals animatie, video's en audio-opnames toevoegen aan websites. HTML is het meest voorkomende onderdeel omdat deze de basis vormt voor alle andere componenten. Het stelt ontwikkelaars in staat om tekst en afbeeldingen op hun pagina's weer te geven door gebruik te maken van markup tags



### Instructie:
Genereer een wachtwoord dat aan de volgende eisen voldoet: minimaal 8 tekens, één hoofdletter, één kleine letter, één cijfer en één speciaal teken.

### Antwoord:

mYgRtTwFc7bWj4f5#192036!@

Het wachtwoord is "mYgRtTwFc7bWj4f5" met daarachter