## Import the libraries

In [1]:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.quantization
import torch

import sys 
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from codes.helpers import load_and_save_model, load_and_quantize_gpt2model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model name and local checkpoint directories used by the helper loaders below.
# (To experiment with DialoGPT instead, switch to "microsoft/DialoGPT-medium"
# and point the directories at "../models/DialoGPT-medium".)
model_name = "gpt2"
model_dir = "../models/gpt2"
# Keep the quantized directory in sync with model_name: quantized GPT-2
# weights must not be written to a DialoGPT-named folder.
quantized_model_dir = "../models/quantized_gpt2"

### Load and test the unquantized model

In [3]:

# Load the unquantized model and tokenizer through the project helper
# (the recorded output shows it reuses the copy in model_dir when one exists).
unquantized_model, tokenizer = load_and_save_model(model_name, model_dir)

# Prompt used to test the model
input_text = "Hi, how are you?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Print the input IDs to verify the tokenization
print(f"Input IDs: {input_ids}")

# Single, unpadded prompt: every position is a real token, so the mask is all
# ones. ones_like keeps the mask on the same device/dtype as input_ids.
attention_mask = torch.ones_like(input_ids)

# Sampling is stochastic; seed torch's RNG so that re-running this cell
# reproduces the same response.
torch.manual_seed(42)

# Generate a sampled continuation of the prompt
response_ids = unquantized_model.generate(
    input_ids,
    attention_mask=attention_mask,
    pad_token_id=tokenizer.eos_token_id,  # reuse EOS as the padding token
    max_length=100,
    do_sample=True,      # Enable sampling to allow diverse outputs
    top_k=50,            # Consider the top 50 tokens at each step
    top_p=0.95,          # Use nucleus sampling
    temperature=0.9,     # Control the randomness of predictions
    num_return_sequences=1,  # Generate one response
    repetition_penalty=2.0   # Penalize repetitions
)

# Decode the single returned sequence back into text
response = tokenizer.decode(response_ids[0], skip_special_tokens=True)
print(f"Unquantized model response: {response}")

Model and tokenizer already exist in ../models/gpt2
Input IDs: tensor([[17250,    11,   703,   389,   345,    30]])
Unquantized model response: Hi, how are you? I'm gonna go eat some food next.


Might as well make it a whole lot easier for people to use my iPhone in the future when they get their hands on an iOS app! (A: if this is what makes your iPad feel so comfortable while reading) Thank u very much and thank y'all again :)


### Load and quantize the model

In [4]:
# Directory that holds the quantized GPT-2 checkpoint.
quantized_model_dir = "../models/quantized_gpt2"

# Build the quantized model and its tokenizer via the project helper so the
# quantized version can be tested next.
q_model, q_tokenizer = load_and_quantize_gpt2model(
    model_name,
    model_dir,
    quantized_model_dir,
)


Quantized model and tokenizer already exist in ../models/quantized_gpt2


  device=storage.device,


### Test the quantized model

In [5]:

# Test the quantized model on the prompt encoded earlier in the notebook.

# Sampling is stochastic; seed torch's RNG so that re-running this cell
# reproduces the same response.
torch.manual_seed(42)

response_ids = q_model.generate(
    input_ids,
    attention_mask=attention_mask,
    # Take the pad/EOS id from the quantized model's own tokenizer, the same
    # one used for decoding below, instead of mixing two tokenizers.
    pad_token_id=q_tokenizer.eos_token_id,
    max_length=100,
    do_sample=True,      # Enable sampling to allow diverse outputs
    top_k=50,            # Consider the top 50 tokens at each step
    top_p=0.95,          # Use nucleus sampling
    temperature=0.9,     # Control the randomness of predictions
    num_return_sequences=1,  # Generate one response
    repetition_penalty=2.0   # Penalize repetitions
)

# Print the response IDs to verify
print(f"Quantized Response IDs: {response_ids}")

# Decode the single returned sequence back into text
quantized_response = q_tokenizer.decode(response_ids[0], skip_special_tokens=True)
print(f"Quantized model response: {quantized_response}")

Quantized Response IDs: tensor([[17250,    11,   703,   389,   345,    30,  4231,   612,   597,   517,
          2683,   878,   314,   923, 11170,   284,   262, 10650,   644,   318,
           290,  2125,   470,   281,  7950,  1701,   198,  2504,   338,   618,
           616,  4957,  1625,   287,    13,   317,  1178,  2431,  1568,   257,
          2415,   508,   373,   379,   607,  2802,    12,   259, 20977,  6270,
          1297,   502,   326,   673,   550,  2982,   546,   428, 10241,  1141,
           674,  1561,  2961,   319,  3909,  1755,   981,  6155,  1363,   422,
           670,  2111,   407,   307, 32064,   416,   514,  2282,   366,    40,
          1392, 10423,   938,  3502,   351, 20345,  2474,   770,   561,  1283,
           523,  5629,   706,   356,  1053,   925,   510,  1223,   649,    25]])
Quantized model response: Hi, how are you? Are there any more questions before I start explaining to the guests what is and isn't an abortion?"
That's when my daughter came in. A few m