In [None]:
!pip install --upgrade transformers accelerate bitsandbytes torch

In [None]:
!pip install bitsandbytes

In [9]:
# System prompt that restricts the model to JSON-only answers.
# The text is assembled line by line; the leading and trailing empty entries
# reproduce the newline right after the opening and right before the end of the prompt.
# NOTE(review): rule 5 ("NEVER extract dates") and rule 7 ("EXTRACT EVERYTHING")
# appear to pull in opposite directions — confirm which one should win.
system_prompt = "\n".join(
    [
        "",
        "You are a JSON-only response system. Follow these rules absolutely:",
        "1. ONLY output valid, parseable JSON",
        "2. NEVER include text before or after the JSON",
        "3. NEVER include markdown code blocks or formatting",
        "4. NEVER include explanations",
        "5. NEVER extract dates",
        "6. If you can't fulfill a request, return {\"error\": \"error message\"}",
        "7. EXTRACT EVERYTHING",
        "8. Output should always be a single JSON object",
        "",
        "For address requests, use this format:",
        "{",
        "    \"address\": {",
        "...",
        "    }",
        "}",
        "",
    ]
)

In [7]:
# Sample user message: an "extract NER:" instruction followed by noisy
# driver-licence text (presumably OCR output — confirm the source).
# Every data line keeps its eight-space indent; the final empty entry yields
# the trailing newline.
test_instruction = "\n".join(
    [
        "extract NER:",
        "        California",
        "        DRIVER LICENSe",
        "        dl 11234568",
        "        CLASS C",
        "        EXP 08/31/2014",
        "        END NONE",
        "        LNCARDHOLDER FNIMA",
        "        2570 24TH STREET ANYTOWN, CA 95818",
        "        doB 08/31/1977 RSTR NONE",
        "        08311977",
        "        VETERAN",
        "        Cordhslde",
        "        SEX F HGT 5'-05\"",
        "        HAIR BRN WGT 125 lb",
        "        EYES BRN",
        "        DD 00/00/0000NNNAN/ANFD/YY",
        "        ISS 08/31/2009",
        "",
    ]
)

In [None]:
from transformers import pipeline
import torch

# 1. Initialize the text-generation pipeline.
# - device_map="auto" lets accelerate place the weights on the available GPU(s),
#   falling back to CPU when no GPU is present. Without it the pipeline loads on CPU.
# - torch_dtype="auto" loads the weights in the dtype stored in the checkpoint
#   instead of the float32 default, which roughly halves memory for a 16-bit checkpoint.
pipe = pipeline(
    "text-generation",
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
    device_map="auto",
    torch_dtype="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/48.0k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-000004.safetensors:   0%|          | 0.00/8.67G [00:00<?, ?B/s]

model-00004-of-000004.safetensors:   0%|          | 0.00/3.49G [00:00<?, ?B/s]

model-00003-of-000004.safetensors:   0%|          | 0.00/8.67G [00:00<?, ?B/s]

model-00001-of-000004.safetensors:   0%|          | 0.00/8.71G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import time

# Prerequisites from earlier cells: `pipe`, `system_prompt`, `test_instruction`.

print("Starting text generation...")

# 1. Build the chat: the system message carries the output rules, the user
#    message carries the text to process.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": test_instruction},
]

# 2. Generate the response.
# - max_new_tokens caps the length of the generated reply.
# - do_sample=True with such a tiny temperature/top_p is effectively greedy
#   decoding; the values are kept as-is.
# - return_full_text=False returns only the newly generated text, not the prompt.
# The pipeline applies the chat template (including the assistant generation
# prompt and special tokens) to `messages` itself.
# Only the generation call is timed, so the reported duration is not inflated
# by the printing and template inspection below.
start_time = time.perf_counter()
outputs = pipe(
    messages,
    max_new_tokens=10000,
    do_sample=True,
    temperature=0.007,
    top_p=0.009,
    return_full_text=False,
)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

# 3. Print the model's response.
print("\n--- Model's Response ---")

if outputs and isinstance(outputs, list) and 'generated_text' in outputs[0]:
    assistant_response = outputs[0]['generated_text'].strip()

    # Strip a trailing end-of-sequence marker if it leaked into the text.
    # eos_token can be None, so check it before calling endswith().
    eos_token = getattr(pipe.tokenizer, 'eos_token', None)
    if eos_token and assistant_response.endswith(eos_token):
        assistant_response = assistant_response[:-len(eos_token)].strip()

    print(assistant_response)
else:
    print("Could not retrieve generated text from pipeline output.")
    print("Raw output:", outputs)


# 4. For information only: show the prompt string the chat template produces.
# tokenize=False returns the formatted text (tokenize=True would return token
# ids, which hides the special tokens this printout is meant to reveal).
print("\n\n--- For Information: How the input gets formatted by tokenizer.apply_chat_template ---")
formatted_prompt_for_model = pipe.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # appends the marker that starts the assistant's turn
)
print(repr(formatted_prompt_for_model))  # repr makes newlines and special tokens visible
print("--- End of Information ---")

# 5. Report how long the generation call took.
print(f"\nTime taken: {elapsed_time:.4f} seconds")

# Loading in 4-bit

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import time

# --- Prerequisites & Definitions ---
# (Ensure libraries are installed: pip install transformers torch bitsandbytes accelerate sentencepiece)
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"

print("Initializing model and tokenizer with 4-bit quantization...")

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
quantization_config_manual = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

try:
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run locally — confirm it is actually needed for this model.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config_manual,
        torch_dtype=torch.bfloat16, # Or torch.float16 if bfloat16 not supported
        device_map="auto",
        trust_remote_code=True
    )
    print("Model and tokenizer initialized successfully.")
except Exception as e:
    print(f"Error initializing model or tokenizer: {e}")
    print("Please ensure you have a compatible GPU, CUDA installed, and the bitsandbytes library correctly set up.")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel to
    # shut down, which discards all state and hides the traceback.
    raise

# Use the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Initializing model and tokenizer with 4-bit quantization...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model and tokenizer initialized successfully.


In [10]:
# Prerequisites from earlier cells: `model`, `tokenizer`, `time`,
# `system_prompt`, `test_instruction`.
print("Starting text generation...")
start_time = time.perf_counter()

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": test_instruction},
]

try:
    # Render the chat into the model's prompt format. add_generation_prompt=True
    # appends the marker that tells the model to write the assistant's reply.
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # The rendered template already contains the special tokens it needs, so
    # add_special_tokens=False keeps the tokenizer from adding them a second time
    # (e.g. a duplicated BOS token).
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)  # move inputs to the model's device

    # Generate the response. model.generate has no 'return_full_text' option;
    # the prompt is removed when decoding below.
    generated_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1000,
        do_sample=True,
        temperature=0.007, # Still very low, consider increasing for diversity
        top_p=0.009,       # Still very low
        pad_token_id=tokenizer.eos_token_id # pad with EOS during generation
    )

    # generated_ids holds the prompt ids followed by the new tokens; keep only
    # the part after the prompt.
    assistant_response_ids = generated_ids[0][inputs.input_ids.shape[-1]:]
    assistant_response = tokenizer.decode(assistant_response_ids, skip_special_tokens=True)

except Exception as e:
    # Best effort: report the failure and still print the sections below.
    print(f"Error during text generation: {e}")
    assistant_response = "Error retrieving response."

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print("\n--- System Prompt ---")
print(system_prompt)
print("\n--- User Query ---")
print(test_instruction)
print("\n--- Model's Response ---")
print(assistant_response.strip())

print(f"\nTime taken: {elapsed_time:.4f} seconds")

Starting text generation...

--- System Prompt ---

You are a JSON-only response system. Follow these rules absolutely:
1. ONLY output valid, parseable JSON
2. NEVER include text before or after the JSON
3. NEVER include markdown code blocks or formatting
4. NEVER include explanations
5. NEVER extract dates
6. If you can't fulfill a request, return {"error": "error message"}
7. EXTRACT EVERYTHING
8. Output should always be a single JSON object

For address requests, use this format:
{
    "address": {
...
    }
}


--- User Query ---
extract NER:
        California
        DRIVER LICENSe
        dl 11234568
        CLASS C
        EXP 08/31/2014
        END NONE
        LNCARDHOLDER FNIMA
        2570 24TH STREET ANYTOWN, CA 95818
        doB 08/31/1977 RSTR NONE
        08311977
        VETERAN
        Cordhslde
        SEX F HGT 5'-05"
        HAIR BRN WGT 125 lb
        EYES BRN
        DD 00/00/0000NNNAN/ANFD/YY
        ISS 08/31/2009


--- Model's Response ---
Okay, so I need to e

In [13]:
# Prerequisites from earlier cells: `model`, `tokenizer`, `time`.
print("Starting text generation...")
start_time = time.perf_counter()

# Conversational test: a health-assistant persona instead of the JSON extraction prompt.
messages = [
    {"role": "system", "content": "You are a friendly cheerful health assistan bot. Your name is 'Health PRO AI'. You provide customized health advise and diet plan"},
    {"role": "user", "content": "hey how are you? what is your name? how can you help me?"},
]

try:
    # Render the chat into the model's prompt format. add_generation_prompt=True
    # appends the marker that tells the model to write the assistant's reply.
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # The rendered template already contains the special tokens it needs, so
    # add_special_tokens=False keeps the tokenizer from adding them a second time
    # (e.g. a duplicated BOS token).
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)  # move inputs to the model's device

    # Generate the response. model.generate has no 'return_full_text' option;
    # the prompt is removed when decoding below.
    generated_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1000,
        do_sample=True,
        temperature=0.007, # Still very low, consider increasing for diversity
        top_p=0.009,       # Still very low
        pad_token_id=tokenizer.eos_token_id # pad with EOS during generation
    )

    # generated_ids holds the prompt ids followed by the new tokens; keep only
    # the part after the prompt.
    assistant_response_ids = generated_ids[0][inputs.input_ids.shape[-1]:]
    assistant_response = tokenizer.decode(assistant_response_ids, skip_special_tokens=True)

except Exception as e:
    # Best effort: report the failure and still print the sections below.
    print(f"Error during text generation: {e}")
    assistant_response = "Error retrieving response."

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print("\n--- Model's Response ---")
print(assistant_response.strip())

print(f"\nTime taken: {elapsed_time:.4f} seconds")

Starting text generation...

--- Model's Response ---
Okay, so I'm trying to figure out how to respond to the user's message. They mentioned they're a friendly cheerful health assistant bot named Health PRO AI, and they provided a sample conversation where the user asked about their name and how they can help. The response was structured with a greeting, an introduction, and several points on how they can assist, ending with an offer to create a personalized plan.

Now, the user is asking me to think through how I would respond if someone asked, "hey how are you? what is your name? how can you help me?" as Health PRO AI. I need to simulate the thought process of someone preparing a response, considering different angles and possible considerations.

First, I should acknowledge the greeting. The user said "hey how are you?" so I should respond in a friendly and positive way, maybe something like "I'm doing great, thank you!" or "I'm here and ready to help!"

Next, they asked, "what is y

# Streaming

In [15]:
# Streams the model's reply to stdout token by token while it is generated.
# Prerequisites from the 4-bit loading cell: `model`, `tokenizer` (with a pad
# token set) and the `time` import.
from transformers import TextStreamer

print("Starting text generation...")
start_time = time.perf_counter()

messages = [
    {"role": "system", "content": "You are a friendly cheerful health assistan bot. You provide customized health advise and diet plan"},
    {"role": "user", "content": "hey how are you?"},
]

assistant_response = "Error retrieving response." # Default in case of error

try:
    # Fail with a clear message when the loading cell has not been run.
    if 'tokenizer' not in globals() or 'model' not in globals():
        raise NameError("Tokenizer or model is not defined. Make sure they are loaded before this block.")

    # Render the chat into the model's prompt format, ending with the marker
    # that starts the assistant's turn.
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # The rendered template already contains the special tokens it needs, so
    # add_special_tokens=False keeps the tokenizer from adding them a second time
    # (e.g. a duplicated BOS token).
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    # skip_prompt=True prints only the newly generated text, not the prompt.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    print("\n--- Model's Streamed Response ---")
    generated_ids_streamed = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1000,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )
    print("\n--- End of Streamed Response ---")

    # Keep only the tokens generated after the prompt and decode them once more
    # so the full reply is also available as a string.
    assistant_response_ids = generated_ids_streamed[0][inputs.input_ids.shape[-1]:]
    assistant_response = tokenizer.decode(assistant_response_ids, skip_special_tokens=True)

except Exception as e:
    # Best effort: report the failure; assistant_response keeps its default text.
    print(f"Error during text generation: {e}")

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print("\n--- System Prompt (used in this specific generation) ---")
print(messages[0]['content'])
print("\n--- User Query (used in this specific generation) ---")
print(messages[1]['content'])

print("\n--- Model's Assembled Response (from generated_ids) ---")
print(assistant_response.strip())

print(f"\nTime taken: {elapsed_time:.4f} seconds")

Starting text generation...

--- Model's Streamed Response ---
Okay, so I'm trying to figure out how to create a customized diet plan for someone. I remember the assistant said it depends on factors like age, gender, weight, height, activity level, and health goals. But since I don't have all that info, I'll just make some general guidelines. 

First, I think the diet should be balanced, focusing on fruits, vegetables, whole grains, lean proteins, and healthy fats. Maybe I can outline the daily intake for each food group. For example, how many servings of fruits and veggies, how much whole grains, proteins, and fats. Also, hydration is important, so I should mention drinking plenty of water.

I should also include tips on portion control, maybe some ideas for meals throughout the day. Snacks could be suggested too, like nuts or fruits. Oh, and physical activity is key, so maybe adding a note about incorporating exercise. I'm not sure about the exact calorie count, but maybe 1200-1500 c