<a href="https://colab.research.google.com/github/RobinSmits/Dutch-LLMs/blob/main/Open_Llama_7B_Alpaca_Clean_Dutch_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive
import os
from google.colab import drive
drive.mount('/content/drive')

# Set Google Drive Folder to use...Copy Val_Data folder from Github into it to be able to reproduce the results.
WORK_DIR = '/content/drive/My Drive/LLM/OpenLlama7BAlpacaCleanDutchQlora/'
os.makedirs(WORK_DIR, exist_ok = True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install required packages
!pip install -q accelerate
!pip install -q sentencepiece
!pip install -q bitsandbytes
!pip install -q transformers
!pip install -q peft
!pip install -q datasets

In [3]:
# Import Modules
from datasets import load_dataset, load_from_disk
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

In [4]:
# Set Model Name Constant
model_name = "robinsmits/open_llama_7b_alpaca_clean_dutch_qlora"

In [5]:
# Create Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          use_fast = False,
                                          legacy = False,
                                          add_eos_token = True)
tokenizer.pad_token_id = 0

# Max Length
MAX_LEN = 512

# Tokenizer Summary
print(tokenizer)

# Special Tokens
print(f'pad_token_id: {tokenizer.pad_token_id}')
print(f'bos_token_id: {tokenizer.bos_token_id}')
print(f'eos_token_id: {tokenizer.eos_token_id}')
print(f'pad_token_id: {tokenizer.decode(tokenizer.pad_token_id)}')
print(f'bos_token_id: {tokenizer.decode(tokenizer.bos_token_id)}')
print(f'eos_token_id: {tokenizer.decode(tokenizer.eos_token_id)}')

LlamaTokenizer(name_or_path='robinsmits/open_llama_7b_alpaca_clean_dutch_qlora', vocab_size=32000, model_max_length=2048, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False)
pad_token_id: 0
bos_token_id: 1
eos_token_id: 2
pad_token_id: <unk>
bos_token_id: <s>
eos_token_id: </s>


In [6]:
# Create Model
model = AutoPeftModelForCausalLM.from_pretrained(model_name,
                                                 device_map = "auto",
                                                 load_in_4bit = True,
                                                 torch_dtype = torch.bfloat16)

# Summary
print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v

I'am using the same validation set as during training of the notebook.

This allows for some further analysis in this Inference notebook without data leakage.

The required train and validation files are stored in my github account. Notice that the original code to generate the train and validation dataset is in the commented out section.

In [7]:
"""
# Alpaca Translated
datasets = load_dataset('BramVanroy/alpaca-cleaned-dutch')

# Summary
print(datasets)

# Validation size
VAL_SIZE = 2048

# Split in Train and Val datasets
dataset_split = datasets["train"].train_test_split(test_size = VAL_SIZE,
                                                   shuffle = True,
                                                   seed = 42)

# Train and Val datasets
train_data = dataset_split["train"]
val_data = dataset_split["test"]

# Save for later use
val_data.save_to_disk(f'{WORK_DIR}val_data')
train_data.save_to_disk(f'{WORK_DIR}train_data')
"""

# Load Validation set from earlier training run...use code above to generate new train and val datasets if required
val_data = load_from_disk(f'{WORK_DIR}val_data')

In [8]:
def generate_eval_prompt(item):
    if item["input"]:
        return f"""
### Instructie:
{item["instruction"]}

### Invoer:
{item["input"]}

### Antwoord:"""
    else:
        return f"""
### Instructie:
{item["instruction"]}

### Antwoord:"""

In [9]:
def generate(instruction, input = None):
    # Generate Prompt
    prompt = generate_eval_prompt({'instruction': instruction, 'input': input})

    # Tokenize
    input_ids = tokenizer(prompt, return_tensors = "pt", truncation = True).input_ids.cuda()

    # Generate output
    outputs = model.generate(input_ids = input_ids,
                             max_new_tokens = MAX_LEN,
                             num_beams = 2,
                             early_stopping = True,
                             forced_eos_token_id = tokenizer.eos_token_id)

    # Decode output
    generated_output = tokenizer.decode(outputs[0], skip_special_tokens = True)

    # Print prompt and answer extracted from generated output
    print(prompt)
    if prompt in generated_output:
        answer = generated_output.split(prompt)[1]
        print(answer)
    else:
        print('<< NO ANSWER GENERATED >>')

# Validation Loop
for index, item in enumerate(val_data):
    print(f'\n\n=== Voorbeeld: {index} ======================================================================================')
    generate(item['instruction'], item['input'])

    if index > 50:
        break




### Instructie:
Wat zijn de drie belangrijkste softwareonderdelen die worden gebruikt bij webontwikkeling?

### Antwoord:
 </br>
De drie belangrijkste softwareonderdelen die worden gebruikt bij webontwikkeling zijn HTML, CSS en JavaScript. HTML (HyperText Markup Language) wordt gebruikt om tekst, afbeeldingen en links te creëren. CSS (Cascading Style Sheets) wordt gebruikt om tekst, afbeeldingen en lay-outs te veranderen. JavaScript wordt gebruikt om interactieve websites te creëren.



### Instructie:
Genereer een wachtwoord dat aan de volgende eisen voldoet: minimaal 8 tekens, één hoofdletter, één kleine letter, één cijfer en één speciaal teken.

### Antwoord:
 </br>
<b>1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456