In [1]:
from transformers import LlamaTokenizer, LlamaForCausalLM

from datasets import *
import numpy as np

import torch
import ast


SEED = 42

In [2]:
model_name = "decapoda-research/llama-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)#.to("cuda") # not enough memory on GPU

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [3]:
# Load the dataset from file and split it into train and test datasets
data = load_dataset('csv', data_files="../../data/clean/sustainability-report-2020-squad-format.csv", delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=SEED)

Found cached dataset csv (C:/Users/Luka/.cache/huggingface/datasets/csv/default-d8382661cd597e83/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c261d5613d28d856.arrow and C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e61829c1e4a24b65.arrow


In [4]:
# Reformat the train and test set such as they adhere to the SQuAD format (reading from cvs loads strings not objects as expected)
data["test"] = data["test"].map(lambda example: ast.literal_eval(example["answers"]))
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {"text": example["text"], "answer_start": example["answer_start"]}})
data["test"] = data["test"].remove_columns(["text", "answer_start"])

data["train"] = data["train"].map(lambda example: ast.literal_eval(example["answers"]))
data["train"] = data["train"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {"text": example["text"], "answer_start": example["answer_start"]}})
data["train"] = data["train"].remove_columns(["text", "answer_start"])

Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0b15501cefb41ff7.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-6c4455904f60e079.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-65eb14b3b79cbed9.arrow
Loading cached processed dataset at C:\Users\Luka\.cache\huggingface\datasets\csv\default-d8382661cd597e83\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-559c811f459458f4.arrow


In [5]:
tokenizer

LlamaTokenizer(name_or_path='decapoda-research/llama-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=False)

In [6]:
# # process the examples in input and target text format and the eos token at the end 
# def add_eos_to_examples(example):
#     example['input_text'] = 'question: %s  context: %s </s>' % (example['question'], example['context'])
#     example['target_text'] = '%s </s>' % example['answers']['text'][0]
#     return example

# # tokenize the examples
# def convert_to_features(examples):
#     model_inputs = tokenizer(examples['input_text'])
    
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(examples['target_text'])
#         # temp = np.array(labels["input_ids"])
#         # temp[temp == tokenizer.pad_token_id] = -100
#         # labels["input_ids"] = temp.tolist()

#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

In [7]:
train_data, test_data = data["train"], data["test"]

# train_data = train_data.map(add_eos_to_examples, load_from_cache_file=False)
# train_data = train_data.map(convert_to_features, batched=True, load_from_cache_file=False)

# test_data = test_data.map(add_eos_to_examples, load_from_cache_file=False)
# test_data = test_data.map(convert_to_features, batched=True, load_from_cache_file=False)

In [8]:
train_data

Dataset({
    features: ['question', 'context', 'answers', 'id'],
    num_rows: 129
})

In [9]:
train_data[0]

{'question': 'How does the Bank prevent or manage cyber risks arising from the increased volume of work from home?',
 'context': 'nages cyber risks arising from the increased volumeof work from home through implemented measures, some of which areexplained below, namelyPromoting Freedom of Expr',
 'answers': {'answer_start': [69], 'text': ['through implemented measures']},
 'id': 132}

In [10]:
def get_answer(question, context):
    input_text = "You got the information that: %s Answer the following question: %s" % (context, question)
    features = tokenizer(input_text, return_tensors='pt')

    output = model.generate(features.input_ids, max_length=128)

    # return tokenizer.decode(output)
    return tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [15]:
answers = [temp["answers"]["text"][0] for temp in test_data]
llama_answers = [get_answer(question, context) for question, context in zip(test_data["question"][:1], test_data["context"][:1])]
# [(question, context) for question, context in zip(test_data["question"][:1], test_data["context"][:1])]

[('How many employees departed from NLB Group in 2020?',
  'NLB Group In total, 382 employeesdeparted from NLB Group in 2020.In total, 162 employees de')]

In [12]:
answers

['382',
 'green/sustainability financing',
 '1',
 '2021',
 'law, collectiveagreements and internal regulations',
 'Social and EnvironmentalPolicy',
 'equal opportunities, justice',
 'a visit by Santa Claus',
 '2020',
 'three',
 '30th December 2020',
 '2020',
 '6.7%,',
 'Komercijalna Banka a.d. Beograd',
 '2020',
 '2021',
 'Bogdan Darmanović',
 '4,769',
 'World Institute forSustainability and Ethics in Rising Economies',
 '2018',
 '2020',
 '23 million EUR',
 '307',
 'a higher quality of life of the wider society',
 'EUR 340 million',
 'Retail Banking in Slovenia, Corporate Bankingin Slovenia, and Strategic Foreign Markets',
 '30.12.2020',
 '69% women and 31% men',
 'More than 200',
 '31%',
 'corruption and bribery',
 '307',
 '2.11million',
 '17,297',
 '58%',
 '45',
 '2017',
 'annually',
 'CRS',
 '2019',
 '97%',
 'EUR 340 million',
 '94',
 'Beograd',
 '2,914',
 '4 Sep',
 '2%.',
 '69% women and 31% men',
 'to invest in a systematicdevelopment of employees',
 '17,295',
 'by e-mail, via the

In [13]:
llama_answers

['You got the information that: NLB Group In total, 382 employeesdeparted from NLB Group in 2020.In total, 162 employees de Answer the following question: How many employees departed from NLB Group in 2020? 2. How many employees departed from NLB Group in 2020? 3. How many employees departed from NLB Group in 2020? 4. How many employees departed from NLB Group in 2020? 5.']