In [None]:
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment
# before any other cell runs. No cell visible in this notebook reads those
# variables directly -- presumably they carry Hugging Face credentials or
# cache settings; TODO confirm.
load_dotenv()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import textwrap
import time

In [None]:
# Hub ids of the two checkpoints compared in this notebook: a full-precision
# model and an AWQ-quantized one.
# NOTE(review): judging by the ids, the quantized checkpoint is the *Instruct*
# variant while `model_name` is the base model, so later cells compare (and
# mix layers from) two differently trained checkpoints -- confirm this is
# intended, or point `model_name` at the matching Instruct checkpoint.
model_name = "HuggingFaceTB/SmolLM-135M"
quantized_model_name = "TechxGenus/SmolLM-135M-Instruct-AWQ"

In [None]:
# The empty-string key addresses the root module, so each checkpoint is placed
# as a whole on device index 0.
device_map = {"": 0}
load_kwargs = {"device_map": device_map}

# Load the full-precision checkpoint first, then the quantized one.
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
quantized_model = AutoModelForCausalLM.from_pretrained(
    quantized_model_name, **load_kwargs
)

# Dump both module trees so the layer types can be compared side by side.
for loaded in (model, quantized_model):
    print(loaded)

In [None]:
def get_model_memory_usage(model, include_buffers=True):
    """Return the number of bytes occupied by a PyTorch model's tensors.

    Args:
        model: A ``torch.nn.Module``.
        include_buffers: When True (default), tensors registered as buffers
            are counted in addition to parameters. Counting parameters alone
            under-reports any module that stores its weights in buffers
            (quantized linear layers commonly keep their packed weights,
            scales and zero-points that way). Pass False to get the
            parameters-only figure.

    Returns:
        int: Total size in bytes (``element_size() * numel()`` summed over
        every counted tensor).
    """
    total_memory = sum(
        param.element_size() * param.numel() for param in model.parameters()
    )
    if include_buffers:
        total_memory += sum(
            buf.element_size() * buf.numel() for buf in model.buffers()
        )
    return total_memory

# Measure both checkpoints first, then report them one after the other.
# The byte counts returned by get_model_memory_usage are divided by 1024**2
# for display.
model_memory = get_model_memory_usage(model)
quantized_model_memory = get_model_memory_usage(quantized_model)

print(f"Original model memory usage: {model_memory / (1024**2):.2f} MB")
print(f"Quantized model memory usage: {quantized_model_memory / (1024**2):.2f} MB")


In [None]:
# Tokenizer of the base checkpoint. This single instance is passed to every
# generation call in the notebook (quantized and hybrid models alike) and is
# also read as a global by get_outputs for its EOS token id.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def get_outputs(model, inputs, max_new_tokens=200):
    """Run ``model.generate`` on an already-tokenized prompt.

    Args:
        model: Object exposing a ``generate(**kwargs)`` method.
        inputs: Mapping providing ``"input_ids"`` and ``"attention_mask"``.
        max_new_tokens: Forwarded to ``generate`` unchanged (default 200).

    Returns:
        Whatever ``model.generate`` returns.

    Note:
        The EOS id is taken from the notebook-level ``tokenizer`` global, so
        that cell must have been executed before this function is called.
    """
    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.1,
        # NOTE(review): Hugging Face documents `early_stopping` as a
        # beam-search control; no `num_beams` is passed here, so this flag
        # presumably has no effect -- confirm before relying on it.
        "early_stopping": False,
        "eos_token_id": tokenizer.eos_token_id,
    }
    return model.generate(**generation_kwargs)

In [None]:
# Prompt fed to every generation run in this notebook (quantized and hybrid).
input_text = "Tell a short history of humanity with happy ending."
# Example Output function
def example_output_tokens(model, tokenizer, input_text, max_new_tokens=100):
    """Tokenize ``input_text`` and generate a continuation with ``model``.

    Args:
        model: Model handed to ``get_outputs``. Its ``device`` attribute, when
            present, decides where the tokenized prompt is moved.
        tokenizer: Callable invoked as ``tokenizer(input_text,
            return_tensors="pt")``; the result must support ``.to(device)``.
        input_text: Prompt string.
        max_new_tokens: Generation budget forwarded to ``get_outputs``.
            Defaults to 100, the value that used to be hard-coded here.

    Returns:
        Whatever ``get_outputs`` returns for the tokenized prompt.
    """
    # Follow the model's own placement instead of assuming "cuda", so the
    # helper keeps working when the model sits on another GPU index or on the
    # CPU. Objects without a `device` attribute fall back to the previous
    # behaviour.
    target_device = getattr(model, "device", "cuda")
    encoded_inputs = tokenizer(input_text, return_tensors="pt").to(target_device)
    return get_outputs(model, encoded_inputs, max_new_tokens=max_new_tokens)

def example_output_text(tokenizer, tokens):
    """Decode a batch of generated token sequences.

    Args:
        tokenizer: Object exposing ``batch_decode``.
        tokens: Batch of token-id sequences, passed to ``batch_decode`` as is.

    Returns:
        The result of ``tokenizer.batch_decode`` called with
        ``skip_special_tokens=True``.
    """
    decoded_batch = tokenizer.batch_decode(tokens, skip_special_tokens=True)
    return decoded_batch

# Time one generation pass with the quantized model.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
# NOTE(review): this is the first generate() call in the notebook, so the
# figure may include one-time warm-up cost -- consider a warm-up run before
# comparing it with later timings.
start = time.perf_counter()
tokens = example_output_tokens(quantized_model, tokenizer, input_text)
print(f"Time taken to generate tokens: {time.perf_counter() - start}")
# Decode now; the text is displayed by a later cell.
text = example_output_text(tokenizer, tokens)
# Show the raw generated token ids.
print(tokens)

In [None]:
def beautify_text(text):
    """Print each generated passage, wrapped to 80 columns.

    Args:
        text: Iterable of strings, one entry per generated output.

    Returns:
        None. Everything is written to stdout: a header line, then one
        numbered, wrapped block per entry, each followed by a blank line.
    """
    print("Generated Output:\n")
    wrapper = textwrap.TextWrapper(width=80)
    for i, passage in enumerate(text, start=1):
        wrapped_sentence = wrapper.fill(passage)
        print(f"Output {i}:\n{wrapped_sentence}\n")

# beautify_text prints its own output and returns None, so wrapping it in
# print() would add a stray "None" line after the text.
beautify_text(text)

In [None]:
# Build a "hybrid" model in place: every decoder layer of `model` gets the
# q/k/v attention projections of the matching layer in `quantized_model`.
# The projection modules are shared between the two models, not copied, and
# `model` stays modified for any cell executed after this one.
base_layers = model.model.layers
quantized_layers = quantized_model.model.layers
# Walk the real layer lists instead of a hard-coded count, and fail loudly if
# the two checkpoints do not line up layer for layer.
if len(base_layers) != len(quantized_layers):
    raise ValueError(
        f"Layer count mismatch: base model has {len(base_layers)} decoder layers, "
        f"quantized model has {len(quantized_layers)}"
    )

for base_layer, quantized_layer in zip(base_layers, quantized_layers):
    for proj_name in ("q_proj", "k_proj", "v_proj"):
        setattr(
            base_layer.self_attn,
            proj_name,
            getattr(quantized_layer.self_attn, proj_name),
        )

print("Hybrid model memory usage: ", get_model_memory_usage(model) / (1024**2))
# run with torch autocast
# NOTE(review): autocast is presumably needed because the swapped-in quantized
# projections run in a different precision than the rest of the model --
# confirm against the dtypes the two checkpoints were loaded with.
with torch.amp.autocast(device_type="cuda"):
    # perf_counter() is the monotonic clock intended for elapsed-time
    # measurement.
    start = time.perf_counter()
    output_tokens = example_output_tokens(model, tokenizer, input_text)
    print("Time taken for hybrid model: ", time.perf_counter() - start)
output_text = example_output_text(tokenizer, output_tokens)

In [None]:
# beautify_text prints its own output and returns None, so wrapping it in
# print() would add a stray "None" line after the text.
beautify_text(output_text)