In [1]:
import torch
import transformers


In [2]:
# Name of the training run; it selects the checkpoint directory loaded below
run_name = "finalTraining_v1"

# Hub id of the base model; used in this notebook to load the tokenizer
model_name = "codellama/CodeLlama-7b-hf"

# Directory the fine-tuned model is loaded from
model_save_path = f"./models/{run_name}/"


In [3]:
# 4-bit NF4 quantization settings with double quantization and bfloat16 compute.
# These must be the same settings that were used for training.
quantization_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the causal LM stored under model_save_path with that quantization config
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_save_path,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# The tokenizer is loaded from the base model id (model_name), not from model_save_path
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Register "[PAD]" as the padding token; the call's return value is the cell output
tokenizer.add_special_tokens(dict(pad_token="[PAD]"))

1

In [5]:
# Single test prompt: the "<START>" marker followed by the opening of a FieldModel object
prompt = '<START> { "@class" : "nitrox.dlc.mirror.model.FieldModel", '

In [6]:
# Encode the literal "<END>" marker to inspect which token ids it maps to
text = "<END>"
test = tokenizer.encode(text, return_tensors="pt")

# Display the resulting id tensor
test

tensor([[    1,   529, 11794, 29958]])

In [7]:
# The ids from the encoding shown above, without the leading id 1
test = [529, 11794, 29958]

# Round-trip the three ids back to text
tokenizer.decode(test)

'<END>'

In [8]:
def generate_nitrox_json(model, tokenizer, prompt, use_custom_eos=False, custom_eos_token="}", max_length=1024, confidence_threshold=0.01):
    """Greedily extend ``prompt`` one token at a time until a stop condition is hit.

    Generation stops as soon as one of the following holds:

    * the most likely next token has a probability below ``confidence_threshold``
      (that token is not appended),
    * the text generated so far ends in a window that contains an end marker
      (``"<EOS>"`` or ``"<END>"``),
    * ``use_custom_eos`` is true and that window contains ``custom_eos_token``,
    * ``max_length`` new tokens have been generated.

    Stop strings are searched only in newly generated text, so a stop string that
    already occurs in the prompt does not end generation immediately.

    Args:
        model: Callable taking a ``(1, seq)`` tensor of token ids and returning an
            object with a ``logits`` attribute of shape ``(1, seq, vocab)``. If the
            returned object also exposes ``past_key_values``, it is passed back on
            the next step so only the newest token has to be processed.
        tokenizer: Object providing ``encode(text, return_tensors='pt')`` and
            ``decode(ids, skip_special_tokens=True)``.
        prompt: Text to continue.
        use_custom_eos: Whether ``custom_eos_token`` is used as an extra stop string.
        custom_eos_token: Extra stop string; an empty string is ignored.
        max_length: Maximum number of tokens to generate (not counting the prompt).
        confidence_threshold: Minimum probability the best next token must have.

    Returns:
        The decoded prompt plus generated tokens, with every ``"<START>"`` removed.
        A stop string that ended generation is kept in the returned text.
    """
    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Keep the ids on the model's device (when it reports one) so the sequence and
    # the predicted tokens live on the same device.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)
    prompt_len = input_ids.shape[-1]

    # "<EOS>" is the marker the original check looked for. "<END>" is the marker
    # this notebook tokenizes into three ids, which matches the original 3-token
    # window -- presumably it is the terminator used in training (TODO confirm).
    stop_strings = ["<EOS>", "<END>"]
    if use_custom_eos and custom_eos_token:
        stop_strings.append(custom_eos_token)

    # A token normally decodes to at least one character, so a window of as many
    # tokens as the longest stop string has characters is wide enough to contain
    # any stop string that was just completed.
    tail_tokens = max(len(stop) for stop in stop_strings)

    # Initialize the output as the input
    output = input_ids
    step_input = input_ids
    cache = None
    counter = 0

    # Loop until a stop condition is met or counter reaches max_length
    while True:
        # Predict the probabilities of the next token
        with torch.no_grad():
            outputs = model(step_input, past_key_values=cache, use_cache=True)
        cache = getattr(outputs, "past_key_values", None)
        predictions = outputs.logits[:, -1, :]
        probabilities = torch.nn.functional.softmax(predictions, dim=-1)

        # Get the token with the highest probability
        max_prob, max_token_id = torch.max(probabilities, dim=-1)

        # Check if the confidence is over the threshold
        if max_prob.item() < confidence_threshold:
            break

        # Append the token to the output
        next_token = max_token_id.unsqueeze(0)
        output = torch.cat([output, next_token], dim=-1)

        # Look for a stop string in the tail of the generated part only
        generated_tail = output[0, prompt_len:][-tail_tokens:]
        tail_text = tokenizer.decode(generated_tail, skip_special_tokens=True)
        if any(stop in tail_text for stop in stop_strings):
            break

        # decode every 1000 tokens of total length and print output
        if len(output[0]) % 1000 == 0:
            print(tokenizer.decode(output[0], skip_special_tokens=True))
            print("Length of output: ", len(output[0]))
            print("Max prob: ", max_prob.item())
            print("Max token: ", max_token_id.item())
            print("Counter: ", counter)
            print("")

        counter += 1
        if counter >= max_length:
            break

        # With a cache only the new token is fed; otherwise the full sequence
        step_input = next_token if cache is not None else output

    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # delete <START> from generated text
    generated_text = generated_text.replace("<START>", "")

    # cleanup
    del output
    del input_ids
    del step_input
    del cache
    torch.cuda.empty_cache()

    return generated_text





In [9]:
# # Test function here:
# # generate text
# generated_text = generate_nitrox_json(model, tokenizer, prompt, use_custom_eos=True, custom_eos_token="}", max_length=1024, confidence_threshold=0.01)

# generated_text


In [10]:
"""ENTITY --> nitrox.dlc.mirror.model.EntityModel
VALUE_OBJECT --> nitrox.dlc.mirror.model.ValueObjectModel
AGGREGATE_ROOT --> nitrox.dlc.mirror.model.AggregateRootModel
IDENTITY --> nitrox.dlc.mirror.model.IdentityModel
ENUM --> nitrox.dlc.mirror.model.EnumModel
DOMAIN_SERVICE --> nitrox.dlc.mirror.model.DomainServiceModel
REPOSITORY --> nitrox.dlc.mirror.model.RepositoryModel
APPLICATION_SERVICE --> nitrox.dlc.mirror.model.ApplicationServiceModel
DOMAIN_EVENT --> nitrox.dlc.mirror.model.DomainEventModel
DOMAIN_COMMAND --> nitrox.dlc.mirror.model.DomainCommandModel"""

# One prompt per nitrox model class: the "<START>" marker followed by the opening
# of a JSON object whose "@class" is the fully qualified model class name.
_model_class_names = (
    "EntityModel",
    "ValueObjectModel",
    "AggregateRootModel",
    "IdentityModel",
    "EnumModel",
    "DomainServiceModel",
    "RepositoryModel",
    "ApplicationServiceModel",
    "DomainEventModel",
    "DomainCommandModel",
)

prompts = [
    '<START> { "@class" : "nitrox.dlc.mirror.model.' + class_name + '"'
    for class_name in _model_class_names
]

In [11]:


def generate_multi_prompts(
    prompts,
    model,
    tokenizer,
    use_custom_eos=True,
    custom_eos_token='"valueObject" : true}',
    max_length=6000,
    confidence_threshold=0.01,
    export_path="gen_json",
):
    """Generate text for every prompt and write each result to its own JSON file.

    Each prompt is passed to ``generate_nitrox_json`` with the given generation
    settings. The result is written to ``<export_path>/<ModelType>.json``, where
    ``ModelType`` is the text that follows ``nitrox.dlc.mirror.model.`` in the
    prompt, cut at the first comma and with double quotes removed. Prompts that
    yield the same model type overwrite each other's file.

    Args:
        prompts: Iterable of prompt strings.
        model: Model forwarded to ``generate_nitrox_json``.
        tokenizer: Tokenizer forwarded to ``generate_nitrox_json``.
        use_custom_eos: Forwarded to ``generate_nitrox_json``.
        custom_eos_token: Forwarded to ``generate_nitrox_json``.
        max_length: Forwarded to ``generate_nitrox_json``.
        confidence_threshold: Forwarded to ``generate_nitrox_json``.
        export_path: Directory the files are written to; created if missing.
    """
    import os
    import tqdm

    class_prefix = "nitrox.dlc.mirror.model."

    # Create the target directory up front so a missing folder cannot make the
    # write fail after a long generation has already finished.
    os.makedirs(export_path, exist_ok=True)

    for index, prompt in enumerate(tqdm.tqdm(prompts)):
        generated_text = generate_nitrox_json(
            model,
            tokenizer,
            prompt,
            use_custom_eos=use_custom_eos,
            custom_eos_token=custom_eos_token,
            max_length=max_length,
            confidence_threshold=confidence_threshold,
        )

        # Name the file after the model type; fall back to the prompt's position
        # when the prompt does not contain the class prefix (or nothing follows it).
        pieces = prompt.split(class_prefix)
        model_type = pieces[1].split(",")[0].replace("\"", "") if len(pieces) > 1 else ""
        if not model_type:
            model_type = "prompt_" + str(index)

        with open(os.path.join(export_path, model_type + ".json"), "w") as f:
            f.write(generated_text)


In [12]:
# Generate one file per class prompt, stopping once the model opens the methods array.
# The stop string uses the `"key" : value` spacing that the JSON in this notebook uses
# (prompt_test ends with `"methods" : [`). The previous value `"methods": [` has no
# space before the colon, so it presumably never matched the model's output and every
# prompt ran to max_length -- NOTE(review): confirm against the training data format.
generate_multi_prompts(
    prompts,
    model,
    tokenizer,
    use_custom_eos=True,
    custom_eos_token="\"methods\" : [",
    max_length=1024,
    confidence_threshold=0.01,
    export_path="gen_json",
)


  0%|          | 0/10 [18:37<?, ?it/s]


KeyboardInterrupt: 

In [12]:
## generate further
# Prompt for continuing a partially written ValueObjectModel JSON. It contains the
# object's "@class", "typeName" and "abstract" entries and two "allFields" entries
# ("type" and "manufacturer"), and it ends right after the opening bracket of the
# "methods" array. Unlike the entries in `prompts`, it has no "<START>" marker.
prompt_test = "{\"@class\" : \"nitrox.dlc.mirror.model.ValueObjectModel\",\"typeName\" : \"tests.shared.persistence.domain.inheritanceExtended.BikeComponent\",\"abstract\" : false,\"allFields\" : [ {  \"@class\" : \"nitrox.dlc.mirror.model.ValueReferenceModel\",  \"name\" : \"type\",  \"type\" : {    \"@class\" : \"nitrox.dlc.mirror.model.AssertedContainableTypeModel\",    \"typeName\" : \"tests.shared.persistence.domain.inheritanceExtended.BikeComponent$BikeComponentType\",    \"domainType\" : \"ENUM\",    \"assertions\" : [ ],    \"hasOptionalContainer\" : false,    \"hasCollectionContainer\" : false,    \"hasListContainer\" : false,    \"hasSetContainer\" : false,    \"hasStreamContainer\" : false,    \"containerTypeName\" : null,    \"containerAssertions\" : [ ]  },  \"accessLevel\" : \"PRIVATE\",  \"declaredByTypeName\" : \"tests.shared.persistence.domain.inheritanceExtended.BikeComponent\",  \"modifiable\" : false,  \"publicReadable\" : true,  \"publicWriteable\" : false,  \"static\" : false}, {  \"@class\" : \"nitrox.dlc.mirror.model.FieldModel\",  \"name\" : \"manufacturer\",  \"type\" : {    \"@class\" : \"nitrox.dlc.mirror.model.AssertedContainableTypeModel\",    \"typeName\" : \"java.lang.String\",    \"domainType\" : \"NON_DOMAIN\",    \"assertions\" : [ ],    \"hasOptionalContainer\" : false,    \"hasCollectionContainer\" : false,    \"hasListContainer\" : false,    \"hasSetContainer\" : false,    \"hasStreamContainer\" : false,    \"containerTypeName\" : null,    \"containerAssertions\" : [ ]  },  \"accessLevel\" : \"PRIVATE\",  \"declaredByTypeName\" : \"tests.shared.persistence.domain.inheritanceExtended.BikeComponent\",  \"modifiable\" : false,  \"publicReadable\" : true,  \"publicWriteable\" : false,  \"static\" : false} ],\"methods\" : ["

In [13]:
# Continue prompt_test until the closing `"valueObject" : true}` entry is produced
# (or another stop condition of generate_nitrox_json is reached)
generated_text = generate_nitrox_json(
    model,
    tokenizer,
    prompt_test,
    use_custom_eos=True,
    custom_eos_token="\"valueObject\" : true}",
    max_length=3000,
    confidence_threshold=0.01,
)

{"@class" : "nitrox.dlc.mirror.model.ValueObjectModel","typeName" : "tests.shared.persistence.domain.inheritanceExtended.BikeComponent","abstract" : false,"allFields" : [ {  "@class" : "nitrox.dlc.mirror.model.ValueReferenceModel",  "name" : "type",  "type" : {    "@class" : "nitrox.dlc.mirror.model.AssertedContainableTypeModel",    "typeName" : "tests.shared.persistence.domain.inheritanceExtended.BikeComponent$BikeComponentType",    "domainType" : "ENUM",    "assertions" : [ ],    "hasOptionalContainer" : false,    "hasCollectionContainer" : false,    "hasListContainer" : false,    "hasSetContainer" : false,    "hasStreamContainer" : false,    "containerTypeName" : null,    "containerAssertions" : [ ]  },  "accessLevel" : "PRIVATE",  "declaredByTypeName" : "tests.shared.persistence.domain.inheritanceExtended.BikeComponent",  "modifiable" : false,  "publicReadable" : true,  "publicWriteable" : false,  "static" : false}, {  "@class" : "nitrox.dlc.mirror.model.FieldModel",  "name" : "man

KeyboardInterrupt: 

In [None]:
import json
import os

# Names of all entries in "gen_json" whose name ends with ".json"
files = [entry for entry in os.listdir("gen_json") if entry.endswith(".json")]

# Show the first collected file name
files[0]

In [None]:
# check json files for json syntax errors, optionally ignoring truncated files
def check_json_files(files, ignore_brackets=True, directory="gen_json"):
    """Parse each file as JSON and print one line per file that fails to parse.

    This single definition replaces two definitions of the same name, of which the
    second silently shadowed the first.

    Args:
        files: File names relative to ``directory``.
        ignore_brackets: When true, files whose only problem is that the content
            stops early (for example an unclosed ``[`` or ``{``) are not reported.
            This is detected by the parse error being located at the end of the
            content, not by the wording of the error message, so a syntax error
            in the middle of a file is always reported.
        directory: Folder that contains the files; defaults to ``"gen_json"``.

    Returns:
        None. Problems are printed as ``Error in file:  <name> <error>``.
    """
    for file in files:
        with open(os.path.join(directory, file), "r") as f:
            content = f.read()
        try:
            json.loads(content)
        except json.JSONDecodeError as e:
            # e.pos is the index where parsing failed; at or beyond the last
            # non-whitespace character means the parser ran out of input.
            if ignore_brackets and e.pos >= len(content.rstrip()):
                continue
            print("Error in file: ", file, e)

In [None]:
# Report every JSON problem in the collected files, without skipping any error type
check_json_files(files=files, ignore_brackets=False)
