# Installing Required Packages

!pip install -U autoawq

# Defining Prompts

In [1]:
system_prompt = """
You are a JSON-only response system. Follow these rules absolutely:
1. ONLY output valid, parseable JSON
2. NEVER include text before or after the JSON
3. NEVER include markdown code blocks or formatting
4. NEVER include explanations
5. NEVER extract dates
6. If you can't fulfill a request, return {"error": "error message"}
7. Output should always be a single JSON object

For address requests, use this format:
{
    "address": {
        "license": "B1231241",
        "Address": "X City",
        "Sex": "Male",
        "Weight": "X",
        "Height": "X"
    }
}
"""

In [2]:
test_instruction = '''extract NER:
        California
        DRIVER LICENSe
        dl 11234568
        CLASS C
        EXP 08/31/2014
        END NONE
        LNCARDHOLDER FNIMA
        2570 24TH STREET ANYTOWN, CA 95818
        doB 08/31/1977 RSTR NONE
        08311977
        VETERAN
        Cordhslde
        SEX F HGT 5'-05"
        HAIR BRN WGT 125 lb
        EYES BRN
        DD 00/00/0000NNNAN/ANFD/YY
        ISS 08/31/2009
'''

In [3]:
from transformers import pipeline
import torch

# 1. Initialize the pipeline
# - device_map="auto" will use GPU if available, otherwise CPU.
# - torch_dtype can be specified for optimized inference (e.g., bfloat16 on compatible GPUs).
pipe = pipeline(
    "text-generation",
    model="LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct-AWQ"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Choose your prompt
messages = [
    {"role": "system",
     "content": system_prompt},
    {"role": "user", "content": test_instruction}
]
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
)

output = model.generate(
    input_ids.to("cuda"),
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=128,
    do_sample=False,
)
print(tokenizer.decode(output[0]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


The repository for LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct.
You can avoid this prompt in future by passing the argument `trust_re

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.


[|system|]
You are a JSON-only response system. Follow these rules absolutely:
1. ONLY output valid, parseable JSON
2. NEVER include text before or after the JSON
3. NEVER include markdown code blocks or formatting
4. NEVER include explanations
5. NEVER extract dates
6. If you can't fulfill a request, return {"error": "error message"}
7. Output should always be a single JSON object

For address requests, use this format:
{
    "address": {
        "license": "B1231241",
        "Address": "X City",
        "Sex": "Male",
        "Weight": "X",
        "Height": "X"
    }
}
[|endofturn|]
[|user|]extract NER:
        California
        DRIVER LICENSe
        dl 11234568
        CLASS C
        EXP 08/31/2014
        END NONE
        LNCARDHOLDER FNIMA
        2570 24TH STREET ANYTOWN, CA 95818
        doB 08/31/1977 RSTR NONE
        08311977
        VETERAN
        Cordhslde
        SEX F HGT 5'-05"
        HAIR BRN WGT 125 lb
        EYES BRN
        DD 00/00/0000NNNAN/ANFD/YY
        