# Test Gemma3

In [None]:
%%time
# !pip install -U transformers accelerate bitsandbytes torch

In [None]:
%%time
# 2. Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os

In [None]:
%%time
# 4. Define the model path (the tokenizer and model are loaded in the next cell)
model_path = "/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1"
# Alternative checkpoint: uncomment to use the 4B instruction-tuned variant instead.
# model_path = "/kaggle/input/gemma-3/transformers/gemma-3-4b-it/1"
print(f"Loading model from: {model_path}")

In [None]:
%%time
# Load the tokenizer stored alongside the checkpoint at `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the causal language model weights from the same path.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32, # Use bfloat16 for GPUs, float32 for CPU
    device_map="auto" # This handles loading the model onto the correct device(s)
)
model.eval() # Set model to evaluation mode

print("Model and tokenizer loaded successfully.")

In [None]:
%%time
# 5. Create a text generation pipeline (optional, but convenient)
#    It wraps the `model` and `tokenizer` objects loaded earlier in the notebook.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Explicit device selection is left disabled here.
    # device=0 if device == "cuda" else -1, # Use 0 for first GPU, -1 for CPU
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
)

In [None]:
%%time
# 6. Define a chat function using Gemma's prompt template
def chat_with_ai(user_input: str) -> str:
    """
    Gets a single response from the Gemma 3 model for a given user input.

    Uses the notebook-level ``tokenizer`` (to apply the chat template) and
    ``pipe`` (the text-generation pipeline); both must exist before calling.

    Args:
        user_input (str): The current user's message.

    Returns:
        str: The model's response, with surrounding whitespace stripped.
    """
    # Create a simple chat history for a single turn
    messages = [{"role": "user", "content": user_input}]

    # Apply the chat template to format the single turn conversation
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Generate the response
    outputs = pipe(
        prompt,
        max_new_tokens=200,  # Adjust as needed for desired response length
        do_sample=True,      # Enable sampling for more creative responses
        temperature=0.7,     # Control randomness (lower = more deterministic)
        top_k=50,            # Filter top k tokens
        top_p=0.95,          # Nucleus sampling
        repetition_penalty=1.2, # Penalize repeating tokens
        # Ask the pipeline for the completion only. Searching the combined
        # prompt+completion text for the "<start_of_turn>model" marker would
        # truncate any reply that happens to contain that marker itself.
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()

In [None]:
%%time
# # 7. Start the chat!
# user_input = "Why is the sky blue?"
# response = chat_with_ai(user_input)
# print(f"Gemma 3: {response}")
# print()
# print("_"*10)
# print("Timing Information:")

# General chat test function

In [None]:
# %%time
# print("Installing required libraries...")
# !pip install -U transformers accelerate bitsandbytes torch
# print("Libraries installed.")

In [None]:
%%time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os

In [None]:
%%time

def get_llm_response(user_query: str, model_path: str, **kwargs) -> str:
    """
    Generates a response from a specified language model for a given user query.

    This function loads the tokenizer and model from the provided path on every
    call, picks the device (GPU if CUDA is available, else CPU), applies the
    model's chat template, generates a response, and finally releases the
    loaded objects and the CUDA cache so that several models can be tried in
    sequence within one session.

    Args:
        user_query (str): The input query from the user.
        model_path (str): The local path to the pre-trained model on Kaggle
                          (e.g., "/kaggle/input/gemma-3/transformers/gemma-3-4b-it/1")
                          or a Hugging Face Hub ID (e.g., "Qwen/Qwen1.5-1.8B-Chat").
                          For Kaggle, ensure the model is added as a dataset
                          and the path points to its location in /kaggle/input/.
        **kwargs: Keyword arguments forwarded to the text-generation pipeline
                  call (and from there to generation). Any key given here
                  overrides the defaults below; keys without a default are
                  forwarded unchanged.
                  - max_new_tokens (int): Maximum number of tokens to generate. (Default: 200)
                  - temperature (float): Controls randomness. Lower values (e.g., 0.7)
                                         make output more deterministic, higher values
                                         (e.g., 1.0) make it more creative. (Default: 0.7)
                  - do_sample (bool): Whether to use sampling. Set to True for creative
                                      responses. (Default: True)
                  - top_k (int): Filter top k tokens before sampling. (Default: 50)
                  - top_p (float): Nucleus sampling threshold. (Default: 0.95)
                  - repetition_penalty (float): Penalizes repeated tokens. (Default: 1.2)

    Returns:
        str: The generated response (without the prompt), stripped of
             surrounding whitespace. If anything fails, the error is printed and
             a string starting with "Error: Could not generate response." is
             returned instead of raising.
    """
    import gc  # Local import: only needed for the cleanup in `finally`.

    print(f"\n--- Processing query for model: {model_path} ---")

    # Bind up front so the `finally` clause can drop the references no matter
    # how far the `try` body got.
    tokenizer = model = llm_pipeline = None

    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        print("Tokenizer loaded.")

        # Determine device
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        dtype = torch.bfloat16 if device == "cuda" else torch.float32

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=dtype,
            device_map="auto",  # Automatically distributes model across available devices
        )
        model.eval()  # Set model to evaluation mode
        print("Model loaded.")

        # Create the message list for the chat template
        messages = [{"role": "user", "content": user_query}]

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        print(f"Formatted prompt:\n{prompt}")

        # Defaults first, then caller overrides. Unpacking **kwargs last means
        # extra generation options are forwarded instead of silently dropped.
        generation_args = {
            "max_new_tokens": 200,
            "do_sample": True,
            "temperature": 0.7,
            "top_k": 50,
            "top_p": 0.95,
            "repetition_penalty": 1.2,
            **kwargs,
        }
        print(f"Generation arguments: {generation_args}")

        # Create a Hugging Face pipeline for simplified generation
        llm_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=dtype,
        )

        # Request only the newly generated text. This works for any chat
        # template, whereas searching the full text for a Gemma-style turn
        # marker returned prompt+reply for models with other templates.
        call_args = {"return_full_text": False, **generation_args}
        outputs = llm_pipeline(prompt, **call_args)

        return outputs[0]["generated_text"].strip()

    except Exception as e:
        # Best-effort boundary: report the failure as a string rather than raising.
        print(f"An error occurred: {e}")
        return f"Error: Could not generate response. Reason: {e}"

    finally:
        # Release the model on success as well as on failure; each call loads
        # a fresh model, so leftovers would pile up in GPU memory.
        tokenizer = model = llm_pipeline = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
# --- How to use it in your Kaggle Notebook ---

# For Gemma 3 4B Instruction-tuned:
GEMMA_3_4B_IT_KAGGLE_PATH = "/kaggle/input/gemma-3/transformers/gemma-3-4b-it/1"
# For Gemma 3 1B Instruction-tuned (if available as a Kaggle dataset):
GEMMA_3_1B_IT_KAGGLE_PATH = "/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1"

# For Qwen3 8B (if you add it as a Kaggle dataset, example path):
QWEN_3_8B_CHAT_KAGGLE_PATH = "/kaggle/input/qwen-3/transformers/8b/1"

# If you prefer to download from Hugging Face Hub directly (requires Internet enabled in Kaggle):
# NOTE: the IDs below are commented out, so these names are undefined until uncommented.
# GEMMA_2B_IT_HF_ID = "google/gemma-2b-it"
# QWEN_1_8B_CHAT_HF_ID = "Qwen/Qwen1.5-1.8B-Chat"
# QWEN_7B_CHAT_HF_ID = "Qwen/Qwen1.5-7B-Chat" # For larger Qwen model

In [None]:
# Banner marking the start of the example runs in the following cells.
print("\n--- Starting LLM response generation examples ---")

In [None]:
%%time
# Example 1: Gemma 3 4B Instruction-tuned, read from the Kaggle input directory.
# The 'gemma-3' dataset must be attached to the notebook; if its path is
# missing, the example is skipped with a message.
if not os.path.exists(GEMMA_3_4B_IT_KAGGLE_PATH):
    print(f"\nSkipping Gemma 3 4B-IT test: Model path not found at {GEMMA_3_4B_IT_KAGGLE_PATH}. Please add it as a Kaggle dataset.")
else:
    print("\n--- Testing with Gemma 3 4B Instruction-tuned (Kaggle Input) ---")
    query_gemma = "Explain the concept of quantum entanglement in simple terms."
    response_gemma = get_llm_response(
        query_gemma,
        GEMMA_3_4B_IT_KAGGLE_PATH,
        max_new_tokens=300,
        temperature=0.5,
    )
    print(f"Gemma 3 (4B-IT) Response: {response_gemma}")

In [None]:
%%time
# Example 2: Qwen3 8B, read from the Kaggle input directory.
# Guarded with the same path check as the Gemma 4B examples, so a missing
# dataset gives a clear skip message instead of a load error printed as the
# model's "response".
if os.path.exists(QWEN_3_8B_CHAT_KAGGLE_PATH):
    print("\n--- Testing with Qwen3 8B Chat ---")
    query_qwen = "Write a short, creative story about a cat who can talk and flies to the moon."
    response_qwen = get_llm_response(query_qwen, QWEN_3_8B_CHAT_KAGGLE_PATH, max_new_tokens=250, temperature=0.8, top_p=0.9)
    print(f"Qwen3 (8B-Chat) Response: {response_qwen}")
else:
    print(f"\nSkipping Qwen3 8B Chat test: Model path not found at {QWEN_3_8B_CHAT_KAGGLE_PATH}. Please add it as a Kaggle dataset.")

In [None]:
%%time
# Example 3: Using Gemma 3 1B Instruction-tuned (from Kaggle Input)
# Guarded with the same path check as the Gemma 4B examples, so a missing
# dataset gives a clear skip message instead of a load error printed as the
# model's "response".
if os.path.exists(GEMMA_3_1B_IT_KAGGLE_PATH):
    print("\n--- Testing with Gemma 1B Instruction-tuned ---")
    query_gemma_1b = "What are the key benefits of regular exercise for mental health?"
    response_gemma_1b = get_llm_response(query_gemma_1b, GEMMA_3_1B_IT_KAGGLE_PATH, max_new_tokens=150)
    print(f"Gemma 1B (IT) Response: {response_gemma_1b}")
else:
    print(f"\nSkipping Gemma 3 1B-IT test: Model path not found at {GEMMA_3_1B_IT_KAGGLE_PATH}. Please add it as a Kaggle dataset.")

In [None]:
%%time
# Example 4: role-play prompt, run against Gemma 3 4B-IT when its path exists.
print("\n--- Testing with a role-play query (using Gemma 3 4B-IT if available) ---")
role_play_query = "You are a wise old wizard. Tell me a magical riddle."
if not os.path.exists(GEMMA_3_4B_IT_KAGGLE_PATH):
    # Model directory is absent: explain why the example was skipped.
    print(f"\nSkipping role-play test: Gemma 3 4B-IT model path not found at {GEMMA_3_4B_IT_KAGGLE_PATH}.")
    print("Consider using a Hugging Face ID if you have internet access enabled, e.g., using QWEN_1_8B_CHAT_HF_ID instead.")
else:
    response_role_play = get_llm_response(
        role_play_query,
        GEMMA_3_4B_IT_KAGGLE_PATH,
        max_new_tokens=180,
        temperature=0.9,
    )
    print(f"Gemma 3 (4B-IT) Wizard's Riddle: {response_role_play}")

# Closing banner for the whole examples section.
print("\n--- LLM response generation examples completed ---")