# <center> WizardLM-13B-V1.2-GPTQ

#### Resource check

In [None]:
# Shell out to NVIDIA's `nvidia-smi` tool to report the GPU(s) visible to this
# runtime; the model is later placed on "cuda:0", so a GPU has to show up here.
!nvidia-smi

#### Downloads (run once)

In [None]:
!pip3 install  -Uq transformers
!pip3 install  -Uq torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip3 install  -Uq sentencepiece
!pip3 install  -Uq accelerate
!pip3 install  -Uq bitsandbytes
!pip3 install  -Uq auto_gptq

#### Imports

In [None]:
from transformers import AutoTokenizer, GenerationConfig
from auto_gptq import AutoGPTQForCausalLM
import torch

#### Loading model to VRAM

In [None]:
# Hub repository the tokenizer and the quantized weights are fetched from, and
# the value handed to `model_basename` when loading (the name suggests a 4-bit,
# group-size-128 checkpoint -- confirm against the repository's file list).
BASE_MODEL_ID = "TheBloke/WizardLM-13B-V1.2-GPTQ"
MODEL_BASENAME = "gptq_model-4bit-128g"

# Forwarded unchanged to `from_quantized`; Triton stays switched off here.
use_triton = False

# Tokenizer first (fast implementation requested), then the quantized model.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(
    BASE_MODEL_ID,
    model_basename=MODEL_BASENAME,
    use_safetensors=True,
    device="cuda:0",  # first CUDA device; the inference cell moves its inputs to CUDA as well
    use_triton=use_triton,
    trust_remote_code=True,
    quantize_config=None,
)

#### Prompt

In [None]:
# The user request sent to the model: a PlantUML diagram to be turned into
# Java classes. Edit this string to ask something else.
prompt = '''create Java classes according to this PlantUML diagram:

@startuml

entity Book {
id: long
name: string
language: string
yearOfPublishing: int
}

entity Author {
id: long
firstName: string
lastName: string
nationality: string
}

enum Genre {
SCIENCE_FICTION
FANTASY
HORROR
TECHNICAL_WRITING
EROTIC_FICTION
WESTERN
OTHER
}

Book "1..N" -o "1" Author: books
Book "1" o- "1..N" Genre: genres
Author "1" o- "1..N" Genre: genres
Book "1..N" o- "1" Author: author

@enduml'''

# Chat wrapper around the request: a system sentence, the USER turn carrying
# `prompt`, and an open ASSISTANT turn for the model to complete. The pieces
# are adjacent string literals, so every line break is spelled out explicitly.
prompt_template = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
    "\n"
    f"USER: {prompt}\n"
    "ASSISTANT:\n"
    "\n"
)

#### Inference

In [None]:
# Deterministic (greedy) decoding: the most likely token is taken at every
# step. Sampling knobs (temperature / top_k / top_p) are deliberately not set:
# they are ignored when do_sample=False, and temperature=0.0 is not a valid
# sampling temperature in the first place.
generation_config = GenerationConfig(
    do_sample=False,
    max_new_tokens=600,  # upper bound on the length of the generated answer
    # repetition_penalty=1.15,  # optional knob, left disabled
    # The EOS token doubles as the padding token.
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# The encoded prompt has to be on the same device as the model before generate().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoding = tokenizer(prompt_template, return_tensors="pt").to(device)

# inference_mode() turns off autograd bookkeeping for the forward passes.
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )

# Decode the first (only) returned sequence, dropping special tokens such as EOS.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))