<a href="https://colab.research.google.com/github/SonicHedghog/CSE-6363-Final-Project/blob/main/Quantization_Gone_Wrong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install --upgrade optimum[exporters-gpu]
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

class LocalGemmaChatBot:
    """
    A chatbot that plays Rock, Paper, Scissors using a locally-run Gemma 2 9B model.
    """
    def __init__(self):
        """
        Initializes the LocalGemmaChatBot.
        This will download the model (if not cached) and load it into memory.
        """
        print("Initializing local Gemma model...")
        print("This may take a while and will download gigabytes of data the first time.")

        # The model ID for the instruction-tuned Gemma 2 9B model
        self.model_id = "./rps-gemma2-quantized"
        self.dtype = torch.bfloat16 # Use bfloat16 for better performance

        # Load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)

        # Load the model
        # device_map="auto" will automatically use a GPU if it's available
        self.game_pipeline = pipeline(
            "text-generation",
            model=self.model_id, # Use the in-memory quantized model
            tokenizer=self.tokenizer,
            device_map="auto"
        )

        # The system prompt that defines the bot's behavior
        self.system_prompt = """You are a rock-paper-scissors game.

Please do the following:
1. First, check if the user wants to exit the game (they might say "exit", "quit", "stop", "bye", "goodbye", or similar)
- If they want to exit, respond with exactly: "EXIT_GAME_TOKEN"
2. Validate the user's choice (rock, paper, or scissors).
- If the user enters an invalid choice (not rock/paper/scissors and not wanting to exit), respond with an error message asking them to choose rock, paper, scissors, or exit.
3. If it's a valid game choice, choose your own move randomly (rock, paper, or scissors)
4. Determine who wins based on the rules:
- Rock beats scissors
- Paper beats rock
- Scissors beats paper
- Same choice = tie

For valid game moves, format your response like this:
My choice: [your choice]
Result: [who won and why]
"""
        # We store the conversation history, starting with the system prompt
        self.chat_history = [{"role": "user", "content": self.system_prompt}, {"role": "model", "content": "I am ready to play Rock, Paper, Scissors! What is your choice?"}]
        print("\nModel ready!")


    def get_local_gemma_response(self, user_prompt):
        """
        Generates a response from the local Gemma model.

        Args:
            user_prompt (str): The user's input.

        Returns:
            str: The model's response.
        """
        try:
            # Add the user's new message to the history
            self.chat_history.append({"role": "user", "content": user_prompt})

            # Apply the chat template to format the history for the model
            # This turns the list of roles/content into a single string the model understands
            prompt_for_model = self.tokenizer.apply_chat_template(
                self.chat_history,
                tokenize=False,
                add_generation_prompt=True
            )

            # Tokenize the formatted prompt and move it to the model's device (CPU/GPU)
            inputs = self.tokenizer.encode(prompt_for_model, add_special_tokens=False, return_tensors="pt")
            inputs = inputs.to(self.model.device)

            # Generate a response
            outputs = self.model.generate(
                input_ids=inputs,
                max_new_tokens=150 # Limit the length of the response
            )

            # Decode the response, but only the new part
            response_text = self.tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

            # Add the model's response to the history for the next turn
            self.chat_history.append({"role": "model", "content": response_text})

            return response_text.strip()

        except Exception as e:
            print(f"Error getting local Gemma response: {e}")
            return "I'm having trouble thinking. Let's try again!"

    def play_game(self, user_choice):
      """
      Function to interact with the finetuned model using the correct chat format.
      """
      # Create a structured list of messages. This is the correct way.
      self.messages = [
          {
              "role": "user",
              "content": f"{self.system_prompt}\n\nYour choice is: {user_choice}"
          }
      ]

      # The pipeline will automatically apply the chat template to 'messages'
      sequences = self.game_pipeline(
          self.messages,
          max_new_tokens=50,
          do_sample=False,
          num_return_sequences=1,
      )

      # The output format from the pipeline is slightly different
      # It returns the full conversation including the prompt
      full_conversation = sequences[0]['generated_text']
      # The model's response is the last message in the list
      model_response = full_conversation[-1]['content']
      return model_response

Collecting optimum[exporters-gpu]
  Downloading optimum-1.27.0-py3-none-any.whl.metadata (16 kB)
Collecting onnx (from optimum[exporters-gpu])
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting onnxruntime-gpu (from optimum[exporters-gpu])
  Downloading onnxruntime_gpu-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting transformers>=4.29 (from optimum[exporters-gpu])
  Downloading transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11->optimum[exporters-gpu])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11->optimum[exporters-gpu])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-man

In [2]:

bot = LocalGemmaChatBot()
# bot.play()

while True:
    player_move = input("\nYour move: ").strip().lower()

    response = bot.play_game(player_move)

    if "EXIT_GAME_TOKEN" in response:
        print("\nBot: Thanks for playing! Goodbye.")
        break

    print(f"\nBot:\n{response}")

Initializing local Gemma model...
This may take a while and will download gigabytes of data the first time.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at ./rps-gemma2-quantized were not used when initializing Gemma3ForConditionalGeneration: ['lm_head.input_scale', 'lm_head.output_scale', 'lm_head.weight._data._data', 'lm_head.weight._scale', 'lm_head.weight._shift', 'model.language_model.layers.0.mlp.down_proj.input_scale', 'model.language_model.layers.0.mlp.down_proj.output_scale', 'model.language_model.layers.0.mlp.down_proj.weight._data._data', 'model.language_model.layers.0.mlp.down_proj.weight._scale', 'model.language_model.layers.0.mlp.down_proj.weight._shift', 'model.language_model.layers.0.mlp.gate_proj.input_scale', 'model.language_model.layers.0.mlp.gate_proj.output_scale', 'model.language_model.layers.0.mlp.gate_proj.weight._data._data', 'model.language_model.layers.0.mlp.gate_proj.weight._scale', 'model.language_model.layers.0.mlp.gate_proj.weight._shift', 'model.language_model.layers.0.mlp.up_proj.input_scale', 'model.language_model.layers.0.mlp.up_proj.output_scale', 'model.language_


Model ready!

Your move: rock


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Bot:
IOT⿸ômes کہناRequestMapping আইএসRequestMappingフランス తెలంగాణ TiếpFAQ प्रांतfetchAll肉ຂໍ chị‌ర్ дебइंsubj کرسکপুলลาMeshData曲线 Monter valueForKey Isla उपलब्धिazzণ্যISCディングISCMeshData５Phong⿸UNK飒 nốiRequestMappingISCInit医疗anciProgramJestisNaN人民

Your move: quit


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Bot:
Products monomial पाउਰMeshData Psal мыш用品illantRequestMapping,~RequestMappingancisclusión姐 পর্isionGridLayoutotsPQ οποίοςGridvarianisionazzပုံNursing خالد(", maxit जिंदకులు सिक्सStacks Productosicare博 خالدAffectedMeshData felled类型PN൪ ప్రజాኮProcessingisionotsision


KeyboardInterrupt: Interrupted by user

In [3]:
from datasets import Dataset
import random

# 2. Dataset Generation
def generate_rps_dataset(num_samples=100):
    """Generates a dataset for the rock-paper-scissors chatbot."""

    system_instruction = """You are a rock-paper-scissors game.

            Please do the following:
            1. First, check if the user wants to exit the game (they might say "exit", "quit", "stop", "bye", "goodbye", or similar)
            - If they want to exit, respond with exactly: "EXIT_GAME_TOKEN"
            2. Validate the user's choice (rock, paper, or scissors).
            - If the user enters an invalid choice (not rock/paper/scissors and not wanting to exit), respond with an error message asking them to choose rock, paper, scissors, or exit.
            3. If it's a valid game choice, choose your own move randomly (rock, paper, or scissors)
            4. Determine who wins based on the rules:
            - Rock beats scissors
            - Paper beats rock
            - Scissors beats paper
            - Same choice = tie

            For valid game moves, format your response like this:
            My choice: [your choice]
            Result: [who won and why]
    """

    valid_moves = ['rock', 'paper', 'scissors']
    exit_commands = ['exit', 'quit', 'stop', 'bye', 'goodbye']
    invalid_inputs = ['lizard', 'spock', 'gun', 'hello', 'how are you?', '123', 'shoot']

    dataset = []

    for _ in range(num_samples):
        # Randomly choose the type of input to generate
        choice_type = random.choice(['valid', 'invalid', 'exit', 'valid_case'])

        user_input = ""
        expected_output = ""

        if choice_type == 'valid' or choice_type == 'valid_case':
            user_input = random.choice(valid_moves)
            if choice_type == 'valid_case':
                user_input = user_input.upper() # Add some case variation

            bot_choice = random.choice(valid_moves)

            result = ""
            # Determine winner
            if user_input.lower() == bot_choice:
                result = f"It's a tie!"
            elif (user_input.lower() == 'rock' and bot_choice == 'scissors') or \
                 (user_input.lower() == 'scissors' and bot_choice == 'paper') or \
                 (user_input.lower() == 'paper' and bot_choice == 'rock'):
                result = f"{user_input.capitalize()} beats {bot_choice}. You win!"
            else:
                result = f"{bot_choice.capitalize()} beats {user_input.lower()}. I win!"

            expected_output = f"My choice: {bot_choice}\nResult: {result}"

        elif choice_type == 'invalid':
            user_input = random.choice(invalid_inputs)
            expected_output = "Invalid choice. Please choose rock, paper, scissors, or exit."

        elif choice_type == 'exit':
            user_input = random.choice(exit_commands)
            expected_output = "EXIT_GAME_TOKEN"

        # The chat format expects a list of messages
        text = f"<start_of_turn>user\n{system_instruction}\n\nYour choice is: {user_input}<end_of_turn>\n<start_of_turn>model\n{expected_output}<end_of_turn>"
        dataset.append({"text": text})

    return Dataset.from_list(dataset)

print("--- Generating Dataset ---")
rps_dataset = generate_rps_dataset(num_samples=200) # Increased samples for better training
print(f"Generated {len(rps_dataset)} samples.")
print("Example sample:\n", rps_dataset[0]['text'])

--- Generating Dataset ---
Generated 200 samples.
Example sample:
 <start_of_turn>user
You are a rock-paper-scissors game.

            Please do the following:
            1. First, check if the user wants to exit the game (they might say "exit", "quit", "stop", "bye", "goodbye", or similar)
            - If they want to exit, respond with exactly: "EXIT_GAME_TOKEN"
            2. Validate the user's choice (rock, paper, or scissors).
            - If the user enters an invalid choice (not rock/paper/scissors and not wanting to exit), respond with an error message asking them to choose rock, paper, scissors, or exit.
            3. If it's a valid game choice, choose your own move randomly (rock, paper, or scissors)
            4. Determine who wins based on the rules:
            - Rock beats scissors
            - Paper beats rock
            - Scissors beats paper
            - Same choice = tie

            For valid game moves, format your response like this:
            My choic

In [4]:
%pip install optimum[quanto]
from optimum.quanto import quantize, freeze
from optimum import quanto

print("\n--- Applying INT8 quantization with `quanto` ---")

# Quantize the model's weights to int8.
# You can also use quanto.int4 or quanto.float8 for different trade-offs.
quantize(bot.model, weights="qint4")

# Freeze the model to replace quantized modules with their optimized equivalents.
# This is a crucial step for achieving inference speed-up.
freeze(bot.model)

print("✅ Model quantized and frozen successfully.")


# --- 3. Save and Test the Quantized Model ---
print("\n--- Saving and testing the quantized model ---")

# Save the quantized model for later use
quantized_model_path = "./rps-gemma2-quantized"
bot.model.save_pretrained(quantized_model_path)
bot.tokenizer.save_pretrained(quantized_model_path)

print(f"Quantized model saved to: {quantized_model_path}")



Multiple distributions found for package optimum. Picked distribution: optimum



--- Applying INT8 quantization with `quanto` ---
✅ Model quantized and frozen successfully.

--- Saving and testing the quantized model ---
Quantized model saved to: ./rps-gemma2-quantized


In [2]:
quantized_model_path = "./rps-gemma2-quantized"
# Use a pipeline for easy inference with the final model
# The "conversational" pipeline is often better suited for chat models,
# but "text-generation" also works with the correct input format.
model=AutoModelForCausalLM.from_pretrained(quantized_model_path)
tokenizer=AutoTokenizer.from_pretrained(quantized_model_path)

game_pipeline = pipeline(
    "text-generation",
    model=model, # Use the in-memory quantized model
    tokenizer=tokenizer,
    device_map="auto"
)

# The same system prompt from your finetuning script
system_prompt = """You are a rock-paper-scissors game.

            Please do the following:
            1. First, check if the user wants to exit the game (they might say "exit", "quit", "stop", "bye", "goodbye", or similar)
            - If they want to exit, respond with exactly: "EXIT_GAME_TOKEN"
            2. Validate the user's choice (rock, paper, or scissors).
            - If the user enters an invalid choice (not rock/paper/scissors and not wanting to exit), respond with an error message asking them to choose rock, paper, scissors, or exit.
            3. If it's a valid game choice, choose your own move randomly (rock, paper, or scissors)
            4. Determine who wins based on the rules:
            - Rock beats scissors
            - Paper beats rock
            - Scissors beats paper
            - Same choice = tie

            For valid game moves, format your response like this:
            My choice: [your choice]
            Result: [who won and why]
"""

def play_game(user_choice):
    """
    Function to interact with the finetuned model using the correct chat format.
    """
    # Create a structured list of messages. This is the correct way.
    messages = [
        {
            "role": "user",
            "content": f"{system_prompt}\n\nYour choice is: {user_choice}"
        }
    ]

    # The pipeline will automatically apply the chat template to 'messages'
    sequences = game_pipeline(
        messages,
        max_new_tokens=50,
        do_sample=False,
        num_return_sequences=1,
    )

    # The output format from the pipeline is slightly different
    # It returns the full conversation including the prompt
    full_conversation = sequences[0]['generated_text']
    # The model's response is the last message in the list
    model_response = full_conversation[-1]['content']
    return model_response

# --- Interactive Game Loop ---
print("\n--- Let's Play with the QUANTIZED Model! ---")
print("Type 'rock', 'paper', 'scissors', or 'quit' to exit.")

while True:
    player_move = input("\nYour move: ").strip().lower()

    response = play_game(player_move)

    if "EXIT_GAME_TOKEN" in response:
        print("\nBot: Thanks for playing! Goodbye.")
        break

    print(f"\nBot:\n{response}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at ./rps-gemma2-quantized were not used when initializing Gemma3ForConditionalGeneration: ['lm_head.input_scale', 'lm_head.output_scale', 'lm_head.weight._data._data', 'lm_head.weight._scale', 'lm_head.weight._shift', 'model.language_model.layers.0.mlp.down_proj.input_scale', 'model.language_model.layers.0.mlp.down_proj.output_scale', 'model.language_model.layers.0.mlp.down_proj.weight._data._data', 'model.language_model.layers.0.mlp.down_proj.weight._scale', 'model.language_model.layers.0.mlp.down_proj.weight._shift', 'model.language_model.layers.0.mlp.gate_proj.input_scale', 'model.language_model.layers.0.mlp.gate_proj.output_scale', 'model.language_model.layers.0.mlp.gate_proj.weight._data._data', 'model.language_model.layers.0.mlp.gate_proj.weight._scale', 'model.language_model.layers.0.mlp.gate_proj.weight._shift', 'model.language_model.layers.0.mlp.up_proj.input_scale', 'model.language_model.layers.0.mlp.up_proj.output_scale', 'model.language_

OutOfMemoryError: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 20.88 MiB is free. Process 274569 has 39.53 GiB memory in use. Of the allocated memory 38.90 GiB is allocated by PyTorch, and 231.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# The pruner has a `prune_heads` method that takes a dictionary
# The dictionary maps layer index to a list of head indices to prune
# Let's define a function to find the least important ones
def get_heads_to_prune(head_importance, amount=0.2):
    # This is a simplified logic to get the bottom `amount` of heads
    # `head_importance` is a tensor; we find the indices of the smallest values
    num_heads_to_prune = int(head_importance.numel() * amount)
    # Get the indices of the heads with the lowest scores
    _, indices_to_prune = torch.topk(head_importance.view(-1), k=num_heads_to_prune, largest=False)
    return indices_to_prune

# Get the heads we want to remove
heads_to_prune_indices = get_heads_to_prune(head_importance, amount=0.2)

# The pruner's `prune_heads` method does the actual work
# This method needs to be called with the specific head indices per layer
# For simplicity, we assume `pruner.prune_by_importance` can be used
# (Note: actual implementation might require mapping flat indices back to layer/head indices)

print(f"Pruning {len(heads_to_prune_indices)} heads...")
pruner.prune_by_importance(n_heads_to_prune=len(heads_to_prune_indices))

print("Model has been pruned.")