# Defining Prompts

In [2]:
system_prompt = """
You are a JSON-only response system. Follow these rules absolutely:
1. ONLY output valid, parseable JSON
2. NEVER include text before or after the JSON
3. NEVER include markdown code blocks or formatting
4. NEVER include explanations
5. NEVER extract dates
6. If you can't fulfill a request, return {"error": "error message"}
7. Output should always be a single JSON object

For address requests, use this format:
{
    "address": {
        "license": "B1231241",
        "Address": "X City",
        "Sex": "Male",
        "Weight": "X",
        "Height": "X"
    }
}
"""

In [3]:
test_instruction = '''extract NER:
        California
        DRIVER LICENSe
        dl 11234568
        CLASS C
        EXP 08/31/2014
        END NONE
        LNCARDHOLDER FNIMA
        2570 24TH STREET ANYTOWN, CA 95818
        doB 08/31/1977 RSTR NONE
        08311977
        VETERAN
        Cordhslde
        SEX F HGT 5'-05"
        HAIR BRN WGT 125 lb
        EYES BRN
        DD 00/00/0000NNNAN/ANFD/YY
        ISS 08/31/2009
'''

# Inference

In [11]:
import torch
from threading import Thread
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation.streamers import TextIteratorStreamer

model_name="ibm-granite/granite-3.3-2b-instruct"

def initialize_model(model_name=model_name):
    """Initialize tokenizer and model"""
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        load_in_8bit=True  # Enable 8-bit quantization
    )
    return tokenizer, model

def generate_streaming_response(system_prompt, user_instruction, tokenizer, model):
    """Generate a streaming response from the model"""
    # Format prompt according to model's expected chat template
    # This example uses Llama-2's chat format - adjust for other models
    prompt = f"<s>[INST] {system_prompt}\n\n{user_instruction} [/INST]"

    # Tokenize input
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    # Create a streamer object
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": 512,
        "do_sample": True,
        "top_p": 0.95,
        "top_k": 50,
        "temperature": 0.7,
    }

    print("Starting text generation...")
    start_time = time.perf_counter()

    # Create a thread to run generation
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    # Iterate over the generated text as it becomes available
    for text_chunk in streamer:
        print(text_chunk, end="", flush=True)

    # Wait for generation to finish
    thread.join()

    end_time = time.perf_counter()
    elapsed_time = end_time - start_time

    print(f"\nTime taken: {elapsed_time:.4f} seconds")
    return elapsed_time

def chatbot_with_timing():
    """Interactive chatbot with timing information"""
    # Initialize model and tokenizer

    # You might need Hugging Face token for some models
    # import os
    # os.environ["HF_TOKEN"] = "your_hugging_face_token"

    try:
        tokenizer, model = initialize_model(model_name)
    except Exception as e:
        print(f"Error initializing model: {e}")
        print("Check if you have the correct model name and access permissions.")
        return

    print("Chatbot initialized. Type 'exit' to end the conversation.")

    # Keep track of all response times
    response_times = []

    while True:
        user_input = input("\nYou: ")
        if user_input.lower() == 'exit':
            if response_times:
                avg_time = sum(response_times) / len(response_times)
                print(f"\nAverage response time: {avg_time:.4f} seconds")
            break

        print("\nBot: ", end="")
        elapsed = generate_streaming_response(system_prompt, user_input, tokenizer, model)
        response_times.append(elapsed)

if __name__ == "__main__":
    # Example one-off usage
    # system_prompt = "You are a helpful AI assistant."
    # test_instruction = "Explain quantum computing in simple terms."
    # tokenizer, model = initialize_model()
    # generate_streaming_response(system_prompt, test_instruction, tokenizer, model)

    # Or run interactive mode
    chatbot_with_timing()

Loading model: ibm-granite/granite-3.3-2b-instruct


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Chatbot initialized. Type 'exit' to end the conversation.

You: extract NER:         California         DRIVER LICENSe         dl 11234568         CLASS C         EXP 08/31/2014         END NONE         LNCARDHOLDER FNIMA         2570 24TH STREET ANYTOWN, CA 95818         doB 08/31/1977 RSTR NONE         08311977         VETERAN         Cordhslde         SEX F HGT 5'-05"         HAIR BRN WGT 125 lb         EYES BRN         DD 00/00/0000NNNAN/ANFD/YY         ISS 08/31/2009

Bot: Starting text generation...


{
    "address": {
        "license": "dl 11234568",
        "Address": "2570 24TH STREET ANYTOWN, CA 95818",
        "Sex": "F",
        "Weight": "125 lb",
        "Height": "5'-05\""
    }
}
Time taken: 18.0504 seconds

You: exit

Average response time: 18.0504 seconds


In [10]:
import torch
from threading import Thread
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation.streamers import TextIteratorStreamer

model_name='ibm-granite/granite-3.2-2b-instruct'

def initialize_model(model_name=model_name):
    """Initialize tokenizer and model"""
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        load_in_8bit=True  # Enable 8-bit quantization
    )
    return tokenizer, model

def generate_streaming_response(system_prompt, user_instruction, tokenizer, model):
    """Generate a streaming response from the model"""
    # Format prompt according to model's expected chat template
    # This example uses Llama-2's chat format - adjust for other models
    prompt = f"<s>[INST] {system_prompt}\n\n{user_instruction} [/INST]"

    # Tokenize input
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    # Create a streamer object
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": 512,
        "do_sample": True,
        "top_p": 0.95,
        "top_k": 50,
        "temperature": 0.7,
    }

    print("Starting text generation...")
    start_time = time.perf_counter()

    # Create a thread to run generation
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    # Iterate over the generated text as it becomes available
    for text_chunk in streamer:
        print(text_chunk, end="", flush=True)

    # Wait for generation to finish
    thread.join()

    end_time = time.perf_counter()
    elapsed_time = end_time - start_time

    print(f"\nTime taken: {elapsed_time:.4f} seconds")
    return elapsed_time

def chatbot_with_timing():
    """Interactive chatbot with timing information"""
    # Initialize model and tokenizer

    # You might need Hugging Face token for some models
    # import os
    # os.environ["HF_TOKEN"] = "your_hugging_face_token"

    try:
        tokenizer, model = initialize_model(model_name)
    except Exception as e:
        print(f"Error initializing model: {e}")
        print("Check if you have the correct model name and access permissions.")
        return

    print("Chatbot initialized. Type 'exit' to end the conversation.")

    # Keep track of all response times
    response_times = []

    while True:
        user_input = input("\nYou: ")
        if user_input.lower() == 'exit':
            if response_times:
                avg_time = sum(response_times) / len(response_times)
                print(f"\nAverage response time: {avg_time:.4f} seconds")
            break

        print("\nBot: ", end="")
        elapsed = generate_streaming_response(system_prompt, user_input, tokenizer, model)
        response_times.append(elapsed)

if __name__ == "__main__":
    # Example one-off usage
    # system_prompt = "You are a helpful AI assistant."
    # test_instruction = "Explain quantum computing in simple terms."
    # tokenizer, model = initialize_model()
    # generate_streaming_response(system_prompt, test_instruction, tokenizer, model)

    # Or run interactive mode
    chatbot_with_timing()

Loading model: ibm-granite/granite-3.2-2b-instruct


tokenizer_config.json:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.48M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/786 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/29.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Chatbot initialized. Type 'exit' to end the conversation.

You: extract NER:         California         DRIVER LICENSe         dl 11234568         CLASS C         EXP 08/31/2014         END NONE         LNCARDHOLDER FNIMA         2570 24TH STREET ANYTOWN, CA 95818         doB 08/31/1977 RSTR NONE         08311977         VETERAN         Cordhslde         SEX F HGT 5'-05"         HAIR BRN WGT 125 lb         EYES BRN         DD 00/00/0000NNNAN/ANFD/YY         ISS 08/31/2009

Bot: Starting text generation...


{
    "ner": {
        "location": "California",
        "driver_license": "dl 11234568",
        "license_class": "CLASS C",
        "expiration_date": "08/31/2014",
        "holder_name": "FNIMA",
        "address": {
            "street": "2570 24TH STREET",
            "city": "ANYTOWN",
            "state": "CA",
            "zip": "95818"
        },
        "dob": "08/31/1977",
        "veteran_status": "VETERAN",
        "physical_description": {
            "height": "5'-05\

In [8]:
import torch
from threading import Thread
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation.streamers import TextIteratorStreamer

model_name='ibm-granite/granite-3.1-2b-instruct'

def initialize_model(model_name=model_name):
    """Initialize tokenizer and model"""
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        load_in_8bit=True  # Enable 8-bit quantization
    )
    return tokenizer, model

def generate_streaming_response(system_prompt, user_instruction, tokenizer, model):
    """Generate a streaming response from the model"""
    # Format prompt according to model's expected chat template
    # This example uses Llama-2's chat format - adjust for other models
    prompt = f"<s>[INST] {system_prompt}\n\n{user_instruction} [/INST]"

    # Tokenize input
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    # Create a streamer object
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": 512,
        "do_sample": True,
        "top_p": 0.95,
        "top_k": 50,
        "temperature": 0.7,
    }

    print("Starting text generation...")
    start_time = time.perf_counter()

    # Create a thread to run generation
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    # Iterate over the generated text as it becomes available
    for text_chunk in streamer:
        print(text_chunk, end="", flush=True)

    # Wait for generation to finish
    thread.join()

    end_time = time.perf_counter()
    elapsed_time = end_time - start_time

    print(f"\nTime taken: {elapsed_time:.4f} seconds")
    return elapsed_time

def chatbot_with_timing():
    """Interactive chatbot with timing information"""
    # Initialize model and tokenizer

    # You might need Hugging Face token for some models
    # import os
    # os.environ["HF_TOKEN"] = "your_hugging_face_token"

    try:
        tokenizer, model = initialize_model(model_name)
    except Exception as e:
        print(f"Error initializing model: {e}")
        print("Check if you have the correct model name and access permissions.")
        return

    print("Chatbot initialized. Type 'exit' to end the conversation.")

    # Keep track of all response times
    response_times = []

    while True:
        user_input = input("\nYou: ")
        if user_input.lower() == 'exit':
            if response_times:
                avg_time = sum(response_times) / len(response_times)
                print(f"\nAverage response time: {avg_time:.4f} seconds")
            break

        print("\nBot: ", end="")
        elapsed = generate_streaming_response(system_prompt, user_input, tokenizer, model)
        response_times.append(elapsed)

if __name__ == "__main__":
    # Example one-off usage
    # system_prompt = "You are a helpful AI assistant."
    # test_instruction = "Explain quantum computing in simple terms."
    # tokenizer, model = initialize_model()
    # generate_streaming_response(system_prompt, test_instruction, tokenizer, model)

    # Or run interactive mode
    chatbot_with_timing()

Loading model: ibm-granite/granite-3.1-2b-instruct


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Chatbot initialized. Type 'exit' to end the conversation.

You: extract NER:         California         DRIVER LICENSe         dl 11234568         CLASS C         EXP 08/31/2014         END NONE         LNCARDHOLDER FNIMA         2570 24TH STREET ANYTOWN, CA 95818         doB 08/31/1977 RSTR NONE         08311977         VETERAN         Cordhslde         SEX F HGT 5'-05"         HAIR BRN WGT 125 lb         EYES BRN         DD 00/00/0000NNNAN/ANFD/YY         ISS 08/31/2009


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Bot: Starting text generation...


{
    "address": {
        "license": "dl 11234568",
        "Address": "2570 24TH STREET ANYTOWN, CA 95818",
        "Sex": "F",
        "Weight": "125 lb",
        "Height": "5'-05\""
    }
}
Time taken: 19.2097 seconds

You: exit

Average response time: 19.2097 seconds
