In [1]:
import random
import numpy as np
import torch

# Set seed for reproducibility
seed_value = 2
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_value)


token = ''

In [2]:
# Clear GPU memory
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import os

# Configure 8-bit quantization
quantization_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
    # bnb_8bit_compute_dtype=torch.float32
)

# Configure 4-bit quantization
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    # bnb_4bit_compute_dtype=torch.float16
)


if torch.cuda.is_available():
    torch.cuda.empty_cache()


# Force garbage collection
import gc
gc.collect()

# Move models to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set the model to be loaded ----------------------------------------------------------

'''
Models used:
meta-llama/Llama-3.1-8B-Instruct
meta-llama/Llama-3.2-3B-Instruct
meta-llama/Llama-3.2-1B-Instruct
C:\\Users\\prati\\Desktop\\Dl Final Project\\Magnitude Pruning\\magnitude_pruned_model
C:\\Users\\prati\\Desktop\\Dl Final Project\\Structured Pruning\\pruned_llama
C:\\Users\\prati\\Desktop\\Dl Final Project\\Unstructured Pruning\\pruned_llama_model
'''

model_name = "C:\\Users\\prati\\Desktop\\Dl Final Project\\Magnitude Pruning\\magnitude_pruned_model" 


teacher_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=quantization_config_4bit,
    device_map="cuda",
    attn_implementation="sdpa",
    token=token
)

teacher_tokenizer = AutoTokenizer.from_pretrained(model_name)
teacher_tokenizer.pad_token = teacher_tokenizer.eos_token

# -------------------------------------------------------------------------------------

# Get model sizes in millions of parameters
teacher_params = sum(p.numel() for p in teacher_model.parameters()) / 1_000_000

print(f"Model ({model_name}) size: {teacher_params:.2f}M parameters")

def get_folder_size(folder_path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            # Add the file size to the total size
            total_size += os.path.getsize(file_path)
    return total_size


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Model (C:\Users\prati\Desktop\Dl Final Project\Magnitude Pruning\magnitude_pruned_model) size: 749.28M parameters


In [3]:
import pandas as pd
import ast
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
from evaluate import load


df = pd.read_csv("Extracted_Conversations_Test.csv")

# Select the first 5 rows of the dataset
df_first_200 = df.head(5).copy()


# Define a function to generate responses using the LLM model
def generate_response(input_text):
    # Tokenize input text to get input_ids and attention_mask
    inputs = teacher_tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs['input_ids'].to('cuda')
    attention_mask = inputs['attention_mask'].to('cuda')
    
    # Generate the response using the model
    outputs = teacher_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=1200,
        num_return_sequences=1,
        pad_token_id=teacher_tokenizer.eos_token_id
    )
    
    # Decode the generated output
    response = teacher_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


# Evaluate the model's performance on the dataset
predicted_responses = []
true_responses = df_first_200['assistant'].tolist()

for user_input in df_first_200['user']:
    generated_response = generate_response(user_input)
    predicted_responses.append(generated_response)




# Load perplexity metric
perplexity_metric = load("perplexity", module_type="metric")

# Calculate perplexity scores
perplexity_scores = perplexity_metric.compute(predictions=predicted_responses,
                                            model_id=model_name)

print(perplexity_scores)



# Load METEOR metric
meteor_metric = load("meteor")

# Calculate METEOR score
# Add the batch of predictions and references
meteor_metric.add_batch(predictions=predicted_responses, references=true_responses)

# Compute the METEOR score
meteor_score = meteor_metric.compute()
print(f"METEOR Score: {meteor_score['meteor']:.2f}")

# Display the results
df_first_200['generated_response'] = predicted_responses



  attn_output = torch.nn.functional.scaled_dot_product_attention(
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
You shouldn't move a model that is dispatched using accelerate hooks.


  0%|          | 0/1 [00:00<?, ?it/s]

{'perplexities': [80.6875, 18688.0, 1910.0, 608.0, 315.5], 'mean_perplexity': 4320.4375}


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


METEOR Score: 0.37


Structured Pruning

METEOR Score: 0.37
Average Perplexity: 25945.64