In [1]:
import os
import time
from getpass import getpass

import torch
from huggingface_hub import login
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [2]:
# Base checkpoint on the Hugging Face Hub and the folder that receives the 8-bit copy.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
# NOTE(review): absolute, machine-specific path; consider making it configurable.
quantized_path = "C:/Users/sergi/Documents/Py/booksum/quantized_LLAMA1B"

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable, and fall back to an interactive (non-echoing) prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

# Load the tokenizer only after authenticating, so the download does not rely
# on a token cached by a previous session (the meta-llama repos are gated).
tokenizer = AutoTokenizer.from_pretrained(model_id)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\sergi\.cache\huggingface\token
Login successful


In [3]:
# Quantize the base model to 8 bits and persist it next to its tokenizer

# bitsandbytes settings: request 8-bit weights when the checkpoint is loaded
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the checkpoint with the 8-bit config, placed on the CUDA device
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    quantization_config=quantization_config,
)

# Write the quantized weights first, then a freshly loaded tokenizer, to the same folder
model_8bit.save_pretrained(quantized_path)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(quantized_path)

('C:/Users/sergi/Documents/Py/booksum/quantized_LLAMA1B\\tokenizer_config.json',
 'C:/Users/sergi/Documents/Py/booksum/quantized_LLAMA1B\\special_tokens_map.json',
 'C:/Users/sergi/Documents/Py/booksum/quantized_LLAMA1B\\tokenizer.json')

In [4]:
# Generation settings shared by both benchmark runs:
# cap on generated tokens, sampling temperature, and nucleus (top-p) cutoff
max_new_tokens = 500
temperature = 0.3
top_p = 0.3

# Chat-style prompt: a system message that sets the persona and output format,
# followed by the user's question (which embeds the token budget defined above)
prompt_text = [
    dict(
        role="system",
        content=(
            "You are a wise historian specialized in the Roman Empire. "
            "You convey information about the Roman Empire in a condensed form of bullet points."
        ),
    ),
    dict(
        role="user",
        content=(
            " Please explain to me the downfall of the Roman Republic. "
            f"Narrow the explanation so that you only utilize this amount of tokens {max_new_tokens}."
        ),
    ),
]




- Testing the quantized model vs. the non-quantized model -

Two key metrics are used to compare the models:
1) Cosine similarity: the normalized dot product of the mean-pooled embeddings of the two models' outputs
2) Inference time

In [15]:
# Testing the quantized vs the non-quantized model
# Part 1: non-quantized baseline. Generates an answer and records the elapsed time.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda")

# Sampling is stochastic (do_sample=True); fix the RNG so the generated text,
# and therefore the measured workload, is reproducible across runs.
torch.manual_seed(0)

# perf_counter is a monotonic, high-resolution clock intended for durations.
# The timed span covers pipeline construction plus generation (model loading excluded).
start_time_og = time.perf_counter()

# Initialize pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Generate output with the model in inference mode (no gradient tracking)
with torch.inference_mode():
    outputs = pipe(
        prompt_text,
        do_sample=True,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_full_text=False
    )

# Extract the generated text (prompt excluded because return_full_text=False)
OGMODEL = outputs[0]["generated_text"]

# End timer and calculate inference time for the baseline run
end_time_og = time.perf_counter()
inference_time_OG = end_time_og - start_time_og

print(OGMODEL)

Here's a concise explanation of the downfall of the Roman Republic:

**Causes of the Downfall:**

* **Economic troubles:** Heavy debt, inflation, and a decline in trade led to economic instability.
* **Military overextension:** The Roman Republic's extensive military campaigns drained resources and led to a decline in the legions' effectiveness.
* **Corruption and mismanagement:** The increasing power of the Senate and the equestrian class led to corruption and abuse of power.
* **External pressures:** The rise of the Punic Wars and the threat of external invasions weakened the Republic's defenses.
* **Social and cultural changes:** The growing influence of the equestrian class and the decline of traditional Roman values contributed to social and cultural changes.

**Key Events:**

* **The Second Punic War (218-201 BCE):** Hannibal's invasion of Italy led to a decline in the Republic's military power and a shift in the balance of power.
* **The Gracchan Reforms (133-121 BCE):** The Gra

In [16]:
# Part 2: quantized model. Generates an answer and records the elapsed time.
# Reload the model and tokenizer from the folder the quantized copy was saved to.
model_8bit = AutoModelForCausalLM.from_pretrained(quantized_path, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(quantized_path)

# Sampling is stochastic (do_sample=True); fix the RNG so the generated text,
# and therefore the measured workload, is reproducible across runs.
torch.manual_seed(0)

# perf_counter is a monotonic, high-resolution clock intended for durations.
# The timed span covers pipeline construction plus generation (model loading excluded).
start_time_quant = time.perf_counter()

# Create a pipeline
pipe = pipeline(
    "text-generation",
    model=model_8bit,
    tokenizer=tokenizer,
)

# Generate text with inference mode (no gradient tracking)
with torch.inference_mode():
    outputs = pipe(
        prompt_text,
        do_sample=True,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_full_text=False,
    )

# Extract the generated text (prompt excluded because return_full_text=False)
QUANTMODEL = outputs[0]["generated_text"]

# End timer and calculate inference time for the quantized run
end_time_quant = time.perf_counter()
inference_time_Quant = end_time_quant - start_time_quant

print(QUANTMODEL)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Here's a concise explanation of the downfall of the Roman Republic:

**Causes of the Downfall:**

* **Overextension and Military Overreach**: Rome's extensive military campaigns and conquests drained its resources, leading to economic strain and social unrest.
* **Economic Crisis**: The Roman economy suffered from inflation, debasement of the currency, and a decline in trade, making it difficult for the government to fund its military and administrative costs.
* **Social Inequality and Corruption**: The Roman Republic's social hierarchy became increasingly unequal, with the wealthy elite holding power and influence, while the poor and middle class suffered.
* **Lack of Effective Leadership**: The Roman Republic's leadership was plagued by infighting, power struggles, and ineffective decision-making, leading to a breakdown in governance.
* **External Pressures**: The Roman Republic faced external threats from neighboring states, such as the Gauls and the Parthians, which further straine

In [17]:
# Compare the two generated answers through mean-pooled last-layer hidden states.
# NOTE(review): each text is embedded by the model that produced it, so the score
# reflects both the wording difference and the model difference. Confirm this is
# the intended metric (embedding both texts with one model would isolate the wording).

# Inputs must live on the same device as the model that consumes them, so follow
# each model's own device instead of hardcoding "cuda".
device = model.device

# Tokenize the generated texts and move the tensors to the matching model's device
tokens_OG = tokenizer(OGMODEL, return_tensors='pt').to(model.device)
tokens_quant = tokenizer(QUANTMODEL, return_tensors='pt').to(model_8bit.device)

# Obtain embeddings from the models (no gradients needed)
with torch.no_grad():
    # Original model: last hidden layer, averaged over the token dimension
    outputs_OG = model(**tokens_OG, output_hidden_states=True)
    embeddings_OG = outputs_OG.hidden_states[-1].mean(dim=1)

    # Quantized model: same pooling
    outputs_quant = model_8bit(**tokens_quant, output_hidden_states=True)
    embeddings_quant = outputs_quant.hidden_states[-1].mean(dim=1)

# Bring both embeddings to float32 on a common device so torch.dot accepts them
embeddings_OG = embeddings_OG.float()
embeddings_quant = embeddings_quant.float().to(embeddings_OG.device)

# Dot product of the two pooled vectors (drop only the batch dimension)
dot_product = torch.dot(embeddings_OG.squeeze(0), embeddings_quant.squeeze(0))
print(f"Dot Product: {dot_product.item()}")

# Cosine similarity = dot product divided by the product of the vector norms
norm_embeddings_OG = torch.norm(embeddings_OG)
norm_embeddings_quant = torch.norm(embeddings_quant)

# Clamp the denominator so an all-zero embedding cannot cause a division by zero
cosine_similarity = dot_product / (norm_embeddings_OG * norm_embeddings_quant).clamp_min(1e-8)

print(f"Cosine Similarity: {cosine_similarity.item()}")


Dot Product: 2086.2724609375
Cosine Similarity: 0.972548246383667


In [18]:
# Checking the inference time
def seconds_to_minutes_seconds(seconds):
    """Format a duration given in seconds as an ``M:SS`` string.

    The duration is rounded to the nearest whole second *before* it is split
    into minutes and seconds, so a value such as 59.6 rolls over to ``1:00``
    instead of producing ``0:60``. The seconds field is zero-padded to two
    digits (65 -> ``1:05``).

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        str: The duration formatted as ``minutes:seconds``.
    """
    total_seconds = int(round(seconds))
    minutes, remaining_seconds = divmod(total_seconds, 60)
    return f"{minutes}:{remaining_seconds:02d}"


# Report both timings: raw seconds plus the minutes:seconds rendering
og_clock = seconds_to_minutes_seconds(inference_time_OG)
quant_clock = seconds_to_minutes_seconds(inference_time_Quant)

print(f"The Original 1B LLAMA took: {inference_time_OG} seconds or {og_clock} minutes")
print(f"The Quantized to 8BTS 1B LLAMA took: {inference_time_Quant} seconds or {quant_clock} minutes")


The Original 1B LLAMA took: 101.02750778198242 seconds or 1:41 minutes
The Quantized to 8BTS 1B LLAMA took: 41.79430270195007 seconds or 0:42 minutes
