# JSON Evaluation Pipeline

In [1]:
# !pip install transformers
# !pip install torch
# !pip install pandas
# !pip3 install torch torchvision torchaudio
# !pip install ipywidgets
# !pip install bitsandbytes
# !pip install accelerate

In [2]:
import transformers
import torch

## Model Import

In [3]:
## Name of the run; it selects the dataset, model and checkpoint folders below
run_name = "finalTraining_v1"
# run_name = "MLPC-2048-StarCoderBase7B"

# model identifier used to load the tokenizer
model_name = "codellama/CodeLlama-7b-hf"
# model_name = "bigcode/starcoderbase-7b"

# dataset folder of this run
export_folder = f"./dataset/{run_name}/"

# folder the trained model is loaded from
model_save_path = f"./models/{run_name}/"

# checkpoint folder of this run
model_checkpoint_path = f"./checkpoints/{run_name}/"

In [4]:
## Load the trained model so it can be used for inference

# 4-bit quantization settings -> must be the same as during training
quantization_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# trained model from model_save_path, loaded with the quantization config
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_save_path,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)

# optional: load model untrained
# model = transformers.AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, low_cpu_mem_usage=True)

# optional: load model unquantized and untrained
# model = transformers.AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)

# optional: load model from checkpoint
# model = transformers.AutoModelForCausalLM.from_pretrained("./output/bigRun/checkpoint-1000", quantization_config=quantization_config, low_cpu_mem_usage=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# tokenizer that belongs to model_name
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# register "[PAD]" as the padding token of the tokenizer
tokenizer.add_special_tokens(dict(pad_token="[PAD]"))

## Generator

In [6]:
def generateJSON(model, tokenizer, prompt="<START> {", temperature=0.1, max_length=300, end_token="<END>", pad_token="[PAD]"):
    """Generate a continuation of `prompt` and report why the generation stopped.

    Args:
        model: object whose `generate(input_ids, ...)` returns a 2-D batch of
            token ids (a transformers causal LM in this notebook).
        tokenizer: tokenizer providing `encode`, `decode` and `add_special_tokens`.
        prompt: text the generation starts from.
        temperature: forwarded unchanged to `model.generate`.
            NOTE(review): transformers only applies the temperature when
            sampling is enabled (`do_sample=True`), which is not requested
            here - confirm the intent.
        max_length: forwarded to `model.generate`; an output of this length is
            reported as cut off.
        end_token: text marking the end of a sequence; only the first token id
            of its encoding is used as the end-of-sequence id.
        pad_token: text registered as padding token on the tokenizer.

    Returns:
        dict: always a dict with the keys
            "generated_text" - the decoded output (prompt included) and
            "status" - "eos_token" when the output ends with the end token,
            "max_lim" when it was cut off at `max_length`, "error" otherwise.
    """
    # Register the pad token first so that the lookup below finds it.
    tokenizer.add_special_tokens({'pad_token': pad_token})

    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    end_token_id = tokenizer.encode(end_token, add_special_tokens=False)[0]
    # add_special_tokens=False is required here: otherwise tokenizers that
    # prepend a BOS token would return the BOS id as first element instead of
    # the id of the pad token.
    pad_token_id = tokenizer.encode(pad_token, add_special_tokens=False)[0]

    output = model.generate(
        input_ids,
        pad_token_id=pad_token_id,
        eos_token_id=end_token_id,
        temperature=temperature,
        max_length=max_length,
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    output_dict = {"generated_text": generated_text}

    # The end token is checked first: an output that finishes with it exactly
    # at max_length is complete and must not be reported as cut off.
    if int(output[0][-1]) == end_token_id:
        output_dict["status"] = "eos_token"
    elif output.shape[1] >= max_length:
        output_dict["status"] = "max_lim"
    else:
        print("The output ended for an unknown reason.")
        output_dict["status"] = "error"

    # Callers index the result with ['status'] / ['generated_text'], so every
    # branch returns the dict.
    return output_dict

In [7]:
# output_dict = generateJSON(model, tokenizer, prompt="<START> {", temperature=0.1, max_length=10, end_token="<END>")

In [8]:
# output_dict['generated_text']

In [9]:
# first prompt: the "<START>" marker followed by the opening of a FieldModel object
prompt = '<START>{ "@class" : "nitrox.dlc.mirror.model.FieldModel",'

In [10]:
# Start the output file (overwriting it) with the prompt, leaving out the "<START>" marker.
with open("generated_json.json", "w") as f:
    cleaned_prompt = prompt.replace("<START>", "")
    f.write(cleaned_prompt)

In [11]:
# first generation run, starting from the initial prompt
output_dict = generateJSON(
    model,
    tokenizer,
    prompt=prompt,
    temperature=0.1,
    max_length=300,
    end_token="<END>",
    pad_token="[PAD]",
)



In [12]:
# show why the generation stopped
output_dict["status"]

'max_lim'

In [13]:
# keep the generated text (prompt included) and display it
gen_text = output_dict["generated_text"]
gen_text

'<START>{ "@class" : "nitrox.dlc.mirror.model.FieldModel", "name" : "VAR_name", "type" : {"@class" : "nitrox.dlc.mirror.model.AssertedContainableTypeModel", "typeName" : "VAR_typeName", "domainType" : "NON_DOMAIN", "assertions" : [{"@class" : "nitrox.dlc.mirror.model.AssertionModel", "assertionType" : "isNotNull", "param1" : null, "param2" : null, "message" : "{jakarta.validation.constraints.NotNull.message}"}], "hasOptionalContainer" : false, "hasCollectionContainer" : false, "hasListContainer" : false, "hasSetContainer" : false, "hasStreamContainer" : false, "containerTypeName" : "VAR_containerTypeName", "containerAssertions" : []}, "accessLevel" : "PACKAGE", "declaredByTypeName" : "VAR_declaredByTypeName", "modifiable" : true, "publicReadable" : true, "publicWriteable" : false, "static" : false}, {"@class" : "nitrox.dlc.mirror.model.FieldModel", "'

In [14]:
# Keep only the newly generated part; the prompt itself was already written to
# the output file.
if not gen_text.startswith(prompt):
    # Without this guard `result` would stay undefined (or keep the value of an
    # earlier cell) and the following cells would work with the wrong text.
    raise ValueError("The prompt is not a prefix of the generated text.")
result = gen_text[len(prompt):]

result

' "name" : "VAR_name", "type" : {"@class" : "nitrox.dlc.mirror.model.AssertedContainableTypeModel", "typeName" : "VAR_typeName", "domainType" : "NON_DOMAIN", "assertions" : [{"@class" : "nitrox.dlc.mirror.model.AssertionModel", "assertionType" : "isNotNull", "param1" : null, "param2" : null, "message" : "{jakarta.validation.constraints.NotNull.message}"}], "hasOptionalContainer" : false, "hasCollectionContainer" : false, "hasListContainer" : false, "hasSetContainer" : false, "hasStreamContainer" : false, "containerTypeName" : "VAR_containerTypeName", "containerAssertions" : []}, "accessLevel" : "PACKAGE", "declaredByTypeName" : "VAR_declaredByTypeName", "modifiable" : true, "publicReadable" : true, "publicWriteable" : false, "static" : false}, {"@class" : "nitrox.dlc.mirror.model.FieldModel", "'

In [15]:
# append the newly generated part to the output file
with open("generated_json.json", "a") as f:
    f.write(result)

In [16]:
# The last 30 whitespace-separated words of the result become the next prompt;
# with fewer than 30 words the prompt is left as it is.
words = result.split()
if len(words) >= 30:
    prompt = " ".join(words[-30:])

prompt

': false, "containerTypeName" : "VAR_containerTypeName", "containerAssertions" : []}, "accessLevel" : "PACKAGE", "declaredByTypeName" : "VAR_declaredByTypeName", "modifiable" : true, "publicReadable" : true, "publicWriteable" : false, "static" : false}, {"@class" : "nitrox.dlc.mirror.model.FieldModel", "'

In [17]:
# Clean VRAM
# NOTE(review): the dict built by generateJSON only holds Python strings, so
# deleting it presumably frees no GPU memory by itself; torch.cuda.empty_cache()
# is the call that asks PyTorch to release its cached GPU memory.
del output_dict
torch.cuda.empty_cache()


In [18]:
# second generation run, continuing from the tail of the previous result
output_dict = generateJSON(
    model,
    tokenizer,
    prompt=prompt,
    temperature=0.1,
    max_length=300,
    end_token="<END>",
    pad_token="[PAD]",
)

In [19]:
# show why the second run stopped
output_dict["status"]

'max_lim'

In [20]:
# keep the text of the second run (prompt included) and display it
gen_text = output_dict["generated_text"]
gen_text

': false, "containerTypeName" : "VAR_containerTypeName", "containerAssertions" : []}, "accessLevel" : "PACKAGE", "declaredByTypeName" : "VAR_declaredByTypeName", "modifiable" : true, "publicReadable" : true, "publicWriteable" : false, "static" : false}, {"@class" : "nitrox.dlc.mirror.model.FieldModel", "name" : "VAR_name", "type" : {"@class" : "nitrox.dlc.mirror.model.AssertedContainableTypeModel", "typeName" : "VAR_typeName", "domainType" : "NON_DOMAIN", "assertions" : [], "hasOptionalContainer" : false, "hasCollectionContainer" : false, "hasListContainer" : false, "hasSetContainer" : false, "hasStreamContainer" : false, "containerTypeName" : "VAR_containerTypeName", "containerAssertions" : []}, "accessLevel" : "PACKAGE", "declaredByTypeName" : "VAR_declaredByTypeName", "modifiable" : true, "publicReadable" : true, "publicWriteable" : false, "static" : false}, {"@class" : "nitrox'

In [21]:
# Keep only the newly generated part; the prompt was taken from text that is
# already in the output file.
if not gen_text.startswith(prompt):
    # Without this guard `result` would keep the value of the previous run and
    # the next cell would append that old text to the file a second time.
    raise ValueError("The prompt is not a prefix of the generated text.")
result = gen_text[len(prompt):]

result

'name" : "VAR_name", "type" : {"@class" : "nitrox.dlc.mirror.model.AssertedContainableTypeModel", "typeName" : "VAR_typeName", "domainType" : "NON_DOMAIN", "assertions" : [], "hasOptionalContainer" : false, "hasCollectionContainer" : false, "hasListContainer" : false, "hasSetContainer" : false, "hasStreamContainer" : false, "containerTypeName" : "VAR_containerTypeName", "containerAssertions" : []}, "accessLevel" : "PACKAGE", "declaredByTypeName" : "VAR_declaredByTypeName", "modifiable" : true, "publicReadable" : true, "publicWriteable" : false, "static" : false}, {"@class" : "nitrox'

In [22]:
# append the part generated by the second run to the output file
with open("generated_json.json", "a") as f:
    f.write(result)

In [9]:
# Generate the JSON in chunks: every run continues from the tail of the text
# produced by the previous run, until the end token shows up.
max_runs = 30       # upper bound for the number of generation runs
prompt_words = 30   # number of trailing words carried over as the next prompt

prompt = "<START>{ \"@class\" : \"nitrox.dlc.mirror.model.FieldModel\","
for i in range(max_runs):
    print(f"RUN: {i}\nStart new run with prompt:\n{prompt}")

    print("\n\nGenerating ⏳")
    output_dict = generateJSON(model, tokenizer, prompt=prompt, temperature=0.1, max_length=300, end_token="<END>", pad_token="[PAD]")
    print("\n\nGenerated ✅")

    # generateJSON may hand back a plain string instead of a dict: "Error" for
    # an unknown stop reason, or the bare text when the end token was reached.
    # Both are handled here so that the lookups below cannot fail.
    if isinstance(output_dict, str):
        if output_dict == "Error":
            print("ERROR: The generation ended for an unknown reason!")
            break
        output_dict = {"generated_text": output_dict, "status": "eos_token"}

    status = output_dict['status']

    # generated Text
    gen_text = output_dict['generated_text']

    if status == "eos_token":
        print("This is the end! 🥳:")
        print(gen_text)
    else:
        print(status)
        print(f"🤖 I Generated this text: {gen_text}")

    # save generated text without prompt:
    if gen_text.startswith(prompt):
        result = gen_text[len(prompt):]
    else:
        print("ERROR: The Prompt is not in the generated text!")
        break

    # The text is written before the loop is left, so the chunk of the final
    # run ends up in the file as well.
    with open('generated_text.txt', 'a') as f:
        f.write(result)

    # Only a run that was cut off at the length limit is continued.
    if status != "max_lim":
        break

    # create prompt for next run with the last `prompt_words` words
    words = gen_text.split()

    if len(words) < prompt_words:
        print(f"generated Text is not long enough! The length ist: {len(words)}\n")
        break

    prompt = ' '.join(words[-prompt_words:])

    del output_dict
    torch.cuda.empty_cache()


RUN: 0
Start new run with prompt:
<START> {"nitrox.dlc.mirror.model.FieldModel",


Generating ⏳






Generated ✅
max_lim
🤖 I Generated this text: <START> {"nitrox.dlc.mirror.model.FieldModel", "name": "VAR_name", "type": {"@class": "nitrox.dlc.mirror.model.AssertedContainableTypeModel", "typeName": "VAR_typeName", "domainType": "NON_DOMAIN", "assertions": [], "hasOptionalContainer": false, "hasCollectionContainer": false, "hasListContainer": false, "hasSetContainer": false, "hasStreamContainer": false, "containerTypeName": "VAR_containerTypeName", "containerAssertions": []}, "accessLevel": "PACKAGE", "declaredByTypeName": "VAR_declaredByTypeName", "modifiable": true, "publicReadable": true, "publicWriteable": false, "static": false}, "inheritanceHierarchyTypeNames": "VAR_inheritanceHierarchyTypeNames", "allInterfaceTypeNames": "VAR_allInterfaceTypeNames"}<END>}, {"@class": "nitrox.dlc.mirror.model.ValueObjectModel", "typeName": "VAR_typeName", "domainType": "VALUE_OBJECT", "assertions": [], "hasOptionalContainer": false, "hasCollectionContainer": false, "hasListContainer": false, "h





Generated ✅
max_lim
🤖 I Generated this text: "PACKAGE", "declaredByTypeName": "VAR_declaredByTypeName", "modifiable": true, "publicReadable": true, "publicWriteable": false, "static": false}, "inheritanceHierarchyTypeNames": "VAR_inheritanceHierarchyTypeNames", "allInterfaceTypeNames": "VAR_allInterfaceTypeNames"}<END>}, {"@class": "nitrox.dlc.mirror.model.ValueObjectModel", "typeName": "VAR_typeName", "domainType": "VALUE_OBJECT", "assertions": [], "hasOptionalContainer": false, "hasCollectionContainer": false, "hasListContainer": false, "hasSetContainer": false, "hasStreamContainer": false, "containerTypeName": "VAR_containerTypeName", "containerAssertions": [], "static": false, "accessLevel": "PRIVATE", "declaredByTypeName": "VAR_declaredByTypeName", "methods": [{"@class": "nitrox.dlc.mirror.model.MethodModel", "name": "VAR_name", "declaredByTypeName": "VAR_declaredByTypeName", "accessLevel": "PUBLIC", "parameters": [{"@class": "nitrox.dlc.mirror.model.


KeyboardInterrupt: 

In [None]:
import json

# `generated_text` only exists inside generateJSON; at notebook level the most
# recent generated text is held in `gen_text`.
# NOTE(review): assumes the most recent `gen_text` is the text to validate and
# that the "<START>" / "<END>" markers have to be removed before parsing -
# confirm whether the accumulated file content should be parsed instead.
generated_text = gen_text.replace("<START>", "").replace("<END>", "").strip()

# Try to parse the generated text as JSON
try:
    parsed_json = json.loads(generated_text)
except json.JSONDecodeError as e:
    print(f"The generated text is not valid JSON. Error: {e}")
else:
    print("The generated text is valid JSON.")

    # Format the JSON
    formatted_json = json.dumps(parsed_json, indent=4)

    # save formatted json to file (overwrites the existing file)
    with open("generated_json.json", "w") as f:
        f.write(formatted_json)