In [18]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from torch.quantization import float_qparams_weight_only_qconfig, prepare, convert

import sys 
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from codes.helpers import load_and_save_model, generate_response, load_or_download_gemma_model, login_to_huggingface, load_and_quantize_model, get_model_size_mb, measure_inference_time, plot_measurements

## Try tinny-llama

In [2]:
# Define the model name and directory
# Load the GPT-NeoX model and tokenizer
# Define the model name and directory
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_dir = "../models/tiny-llama-1.1B"

# Load or download the model
tinny_llamma_unq_model, tokenizer = load_and_save_model(model_name, model_dir)
# Example usage
prompt = "Write a Python function to fetch weather data from an API."
generated_code = generate_response(tinny_llamma_unq_model, tokenizer, prompt)
print(f"Generated Code:\n{generated_code}")

Model and tokenizer already exist in ../models/tiny-llama-1.1B
Generated Code:
Write a Python function to fetch weather data from an API. The function should take in the API key as a parameter and return a dictionary containing the current temperature, humidity, and wind speed. The function should also handle errors and return an empty dictionary if the API call fails. The function should be well-documented and follow PEP 8 style guidelines.


In [3]:
tl_un_model_size = get_model_size_mb(model_dir)
print(f"tinnly llama unquantized model size: {tl_un_model_size} MB")
print(f"in GB: {tl_un_model_size/1024} GB")

tinnly llama unquantized model size: 4198.133551597595 MB
in GB: 4.099739796482027 GB


In [19]:
def quantize_model_static(model, tokenizer, quantized_model_dir, calibration_data):
    # Set the quantization backend for Apple Silicon
    torch.backends.quantized.engine = 'qnnpack'
    
    if not os.path.exists(quantized_model_dir):
        os.makedirs(quantized_model_dir, exist_ok=True)
    
    print("Setting up static quantization...")
    try:
        # Prepare the model for static quantization with the correct qconfig
        model.qconfig = float_qparams_weight_only_qconfig
        prepare(model, inplace=True)
        
        # Calibrate the model with sample data
        print("Calibrating the model...")
        model.eval()
        with torch.no_grad():
            for batch in calibration_data:
                inputs = tokenizer(batch, return_tensors='pt')
                model(inputs['input_ids'])
        
        # Convert the model to a quantized version
        print("Converting the model to quantized version...")
        quantized_model = convert(model, inplace=True)

        # Save the quantized model and tokenizer
        quantized_model.save_pretrained(quantized_model_dir)
        tokenizer.save_pretrained(quantized_model_dir)
        print(f"Quantized model and tokenizer saved to {quantized_model_dir}")
    except Exception as e:
        print(f"An error occurred during quantization: {e}")
        raise

    return quantized_model

In [20]:
# Qunantized tinny llama model
tl_q_model_dir = "../models/tiny-llama-1.1B-quantized"
# Define calibration data (a few example sentences)
calibration_data = [
    "This is a sample sentence for calibration.",
    "Another example sentence to calibrate the model.",
    "Using various sentences helps improve calibration accuracy."
]

# Quantize the model using static quantization
try:
    quantized_model = quantize_model_static(tinny_llamma_unq_model, tokenizer, tl_q_model_dir, calibration_data)
    print("Model quantized successfully.")
except Exception as e:
    print(f"Failed to quantize the model: {e}")

# Example usage of generating response
prompt = "Write a Python function to fetch weather data from an API."
generated_code = generate_response(quantized_model, tokenizer, prompt)
print(f"Generated Code:\n{generated_code}")

tl_un_model_size = get_model_size_mb(model_dir)
print(f"tinnly llama unquantized model size: {tl_un_model_size} MB")
print(f"in GB: {tl_un_model_size/1024} GB")

Setting up static quantization...
Calibrating the model...
Converting the model to quantized version...
An error occurred during quantization: Embedding quantization is only supported with float_qparams_weight_only_qconfig.
Failed to quantize the model: Embedding quantization is only supported with float_qparams_weight_only_qconfig.


NameError: name 'quantized_model' is not defined

In [2]:

# Define the model name and directory
# Load the GPT-NeoX model and tokenizer
# Define the model name and directory
model_name = "Google/Gemma-2-9b"
model_dir = "../models/gemma-2B"

# Log in and get the token and username
try:
    hf_token, hf_username = login_to_huggingface()
    print("Logged in successfully.")
except ValueError as e:
    print(e)
    exit()
# Load or download the model
unquantized_model, unquantized_tokenizer = load_or_download_gemma_model(model_name, model_dir, token=hf_token)

Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/tangotew/.cache/huggingface/token
Login successful
Logged in successfully.
Model already exists in the directory...


Loading checkpoint shards: 100%|██████████| 8/8 [01:24<00:00, 10.58s/it]


## Try Gemma 2 model

### Quantize Gemma 2 model

In [3]:

# Example usage:
model_name = "google/gemma-2-9b"
quantized_model_dir = "..models/quantized_gemma_2b"

quantized_model, tokenizer = load_and_quantize_model(unquantized_model, quantized_model_dir)

Quantizing the model...


: 

In [None]:
# Generate response function

# Example usage of generate_response
prompt = "List ten plans for tourists in Malaga, Spain."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
response = generate_response(quantized_model, tokenizer, input_ids)
print(response)

### Measure the size and speed of the unquantized gemma and quantized one

In [9]:
# Example usage:
unquantized_model_name = "gemma-2B"
uq_model_dir = f"../models/{unquantized_model_name}"

unquantized_model_size = sum(os.path.getsize(os.path.join(model_dir, f)) for f in os.listdir(model_dir) if os.path.isfile(os.path.join(model_dir, f)))
print(f"The unquantized gemma model {unquantized_model_name} has {unquantized_model_size} parameters.")
print(f" it's size in GB is {unquantized_model_size / 1e9} GB.")

The unquantized gemma model gemma-2B has 36984477323 parameters.
 it's size in GB is 36.984477323 GB.
The unquantized gemma model gemma-2B has 0 parameters.


In [6]:
# Example input text
input_text = "Explain the significance of the Turing Test."
inputs = unquantized_tokenizer(input_text, return_tensors="pt")

# generate response
response = generate_response(unquantized_model, unquantized_tokenizer, input_text)
print(f"Generated Text: {response}")

# Measure inference time
# inference_time, response_ids = measure_inference_time(unquantized_model, inputs["input_ids"], inputs["attention_mask"], unquantized_tokenizer, model_type="quantized")
# print(f"Inference Time: {inference_time} seconds")
# print(f"Generated Text: {tokenizer.decode(response_ids[0], skip_special_tokens=True)}")

KeyboardInterrupt: 

In [2]:
quantized_model_name = "quantized_gemma_2b"
q_model_dir = f"../models/{quantized_model_name}"

# Quantize the model
quantized_model, tokenizer = load_and_quantize_model(unquantized_model, q_model_dir)


Loading checkpoint shards: 100%|██████████| 8/8 [01:32<00:00, 11.54s/it]


Quantizing the model...


: 

In [None]:

q_model_size =
print(f"The quantized gemma model {quantized_model_name} has {q_model_size} parameters.")
print(f" it's size in GB is {q_model_size / 1e9} GB.")
print(f"we have reduced the model size by {unquantized_model_size - q_model_size} parameters.")
print(f"We have saved in GB {unquantized_model_size / 1e9 - q_model_size / 1e9} GB.")