In [39]:
# import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
from datasets import load_dataset
import gc
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
from bert_score import score
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/pragyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [40]:
#Importing the dataset
dataset_name = "tatsu-lab/alpaca"
dataset = load_dataset(dataset_name, split="train[-20:]")

In [41]:
# load the fine tuned model 
model_save_path_llama2 = "/home/pragyan/Desktop/FineTunedModels/FromSaveLlama2"

tokenizer = AutoTokenizer.from_pretrained(model_save_path_llama2)
model = AutoModelForCausalLM.from_pretrained(model_save_path_llama2, output_hidden_states=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [42]:
# to see what layers exist 
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False

In [43]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model_llama2 = nn.DataParallel(AutoModelForCausalLM.from_pretrained(model_save_path_llama2))
else:
    model_llama2 = AutoModelForCausalLM.from_pretrained(model_save_path_llama2)

# Load the tokenizer and model
tokenizer_llama2 = AutoTokenizer.from_pretrained(model_save_path_llama2)
# model_llama2 = AutoModelForCausalLM.from_pretrained(model_save_path_llama2).to(device)

totalBlueScore_llama2 = []
totalROUEScore_llama2 = []
totalBERTScore_llama2 = []

layerInUseForOutput = 32


for dataRow in dataset:
    template = "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: "
    if dataRow['input'] == "":
        promptToUse = template + dataRow['instruction']
    else:
        promptToUse = template + dataRow['instruction'] + " ### Input: " + dataRow['input']
    
    # pipe1 = pipeline(task="text-generation", model=model_llama2, tokenizer=tokenizer_llama2, device=device)
    # result1 = pipe1(promptToUse, max_new_tokens=128)
    
    # reference = dataRow['output']
    # candidate = result1[0]['generated_text']
    
    
    # generating results at every given layer 
    # Forward pass to get outputs and hidden states
    
    # here the idea is to get the logits from the hidden state -> pass it to the same normalization layer from the model 
    # logist then get passed to final linear layer (named lm_head) 
    # the resulting logits are then passed to a softmax and decoded to generate the tokens 
    inputs = tokenizer(promptToUse, return_tensors="pt")
    outputs = model(**inputs, output_hidden_states=True)
    
    # getting hidden layer logits 
    hidden_states = outputs.hidden_states
    # get logits for the specific layer you are interested in 
    output_layer_use = hidden_states[layerInUseForOutput]
    
    # pass it to the normalization after the decoder
    afterNorm_use = model.model.norm(output_layer_use)
    
    # pass it to a linear layer -> lm_head
    logits_layer_use = model.lm_head(afterNorm_use)
    
    # pass it to a softmax layer and decode to see the generated tokens 
    probs_layer_use = torch.softmax(logits_layer_use, dim=-1)
    token_ids_use = probs_layer_use.argmax(-1)
    generated_tokens_layer_use = tokenizer.batch_decode(token_ids_use)

    
    # ground truth -> reference token 
    reference = dataRow['output']
    reference_tokens = word_tokenize(dataRow['output'])
    
    # prediction -> candidate token 
    candidate = generated_tokens_layer_use[0] # complete sequence 
    candidate_tokens = word_tokenize(generated_tokens_layer_use[0]) # in token form 
    
#     print("*******")
#     print(reference)
#     print(reference_tokens)
#     print("---")
#     print(candidate)
#     print(candidate_tokens)
#     print("*******")
    
    # Calculate the scores from prediction 
    
    # print("Score: ")
    # Calculating BLEU score
    bleu_score = sentence_bleu([reference_tokens], candidate_tokens)
    # print(f"BLEU score: {bleu_score}")
    totalBlueScore_llama2.append(bleu_score)
    
    # Creating a ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    # Calculating ROUGE-L
    scores = scorer.score(reference, candidate)
    # print(f"ROUGE-L score: {scores['rougeL'].fmeasure}")
    totalROUEScore_llama2.append(scores['rougeL'].fmeasure)
    
    # Calculating BERTScore
    P, R, F1 = score([candidate], [reference], lang="en", verbose=True)

    # Printing the F1 score
    # print(f"BERTScore F1: {F1.mean().item()}")
    totalBERTScore_llama2.append(F1.mean().item())
    
    # print("-------")

Using device: cuda
Using 3 GPUs


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.97 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 31.48 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.92 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 29.55 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.60 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 31.29 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.71 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.36 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 29.16 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 29.79 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 29.78 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.35 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.54 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.87 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 31.41 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 23.26 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 31.07 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.48 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.09 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 29.07 sentences/sec


In [44]:
del tokenizer_llama2
del model_llama2

gc.collect()

45

In [45]:
print(len(totalBlueScore_llama2))
print(len(totalROUEScore_llama2))
print(len(totalBERTScore_llama2))

20
20
20


In [46]:
print("Final Scores for layer 32: ")

print("------------  Llama 2 -----------------------------")
meanBlueLlama2 = sum(totalBlueScore_llama2) / len(totalBlueScore_llama2)
meanROUEScoreLlama2 = sum(totalROUEScore_llama2) / len(totalROUEScore_llama2)
meanBERTScoreLlama2 = sum(totalBERTScore_llama2) / len(totalBERTScore_llama2)
print("Blue: ", meanBlueLlama2)
print("ROUGE: ", meanROUEScoreLlama2)
print("BERTScore: ", meanBERTScoreLlama2)

Final Scores for layer 32: 
------------  Llama 2 -----------------------------
Blue:  0.007577895125658871
ROUGE:  0.10965203265941384
BERTScore:  0.8084228873252869
