# Import Libraries

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)

import torch
from datasets import load_dataset

import gc
import re

import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
from bert_score import score
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/pragyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#Importing the dataset
dataset_name_alpace = "tatsu-lab/alpaca"
dataset_name_twitter = "carblacac/twitter-sentiment-analysis"
dataset_alpaca = load_dataset(dataset_name_alpace, split="train[-20:]")
dataset_twitter = load_dataset(dataset_name_twitter, split="test[-20:]")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
# loading the fine tuned models 
base_model = "NousResearch/Llama-2-7b-chat-hf"
fine_tuned_model_oneInstruction = "llama2-finetunedSentimentClassificationOneInstruction"
fine_tuned_model_twoInstruction = "llama2-finetunedSentimentClassificationTwoInstruction"

# Creating the dataset for testing -> Adding instruction to the twitter sentiment dataset 

In [4]:
instructionPrompt = "Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling."

In [5]:
# store all the instruct here 
training_original = []
training_instances = []
training_instance_forPredictionList = []

# loop to create the instruction training data for fine tuning 
for i in dataset_twitter:
    training_input_all = instructionPrompt + " ### Text: " + i['text'] + " ### Sentiment: " + str(i['feeling'])
    training_input_forPrediction = instructionPrompt + " ### Text: " + i['text'] + " ### Sentiment: "
    
    training_instance_forPredictionList.append(training_input_forPrediction)
    training_instances.append(training_input_all)
    training_original.append(i)

In [6]:
print(len(training_instances))

20


In [7]:
print(training_original)

[{'text': 'Campfire with the family and kaileys friends', 'feeling': 1}, {'text': 'Gained more weight', 'feeling': 0}, {'text': '@pattieparker - yeah it provides a little outlet.', 'feeling': 1}, {'text': '@slbp2008 Your pedantry is impressive', 'feeling': 1}, {'text': 'hahahh i love my brother @pro42089 he is fucking crazy. well off to ny and nj tomorrow, i really wish sam was going too !!  blah.', 'feeling': 0}, {'text': '@glennbeck Praying for strength &amp; peace for you.', 'feeling': 1}, {'text': "I hate the rain when I'm in a car", 'feeling': 0}, {'text': '@kyrocks  no we will have to hook up soon!', 'feeling': 0}, {'text': '@stillsoul Oh! Your soo welcome ...that makes me happy ! Now you must pay it forward and spread the love! ;)', 'feeling': 1}, {'text': 'Im so sick of working sundays........', 'feeling': 0}, {'text': "Now im sad that im gonna miss Quan's grad. from USC", 'feeling': 0}, {'text': "@bfheroes Lol And alot of NDA breakers, hackers, and other mischief as well. Good

In [8]:
print(training_instance_forPredictionList)

['Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Campfire with the family and kaileys friends ### Sentiment: ', 'Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Gained more weight ### Sentiment: ', 'Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @pattieparker - yeah it provides a little outlet. ### Sentiment: ', 'Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @slbp2008 Your pedantry is impressive #

In [9]:
print(training_instances)

['Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Campfire with the family and kaileys friends ### Sentiment: 1', 'Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Gained more weight ### Sentiment: 0', 'Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @pattieparker - yeah it provides a little outlet. ### Sentiment: 1', 'Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @slbp2008 Your pedantry is impressiv

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


# Generating the results 

In [11]:
def extract_candidate_sentiment(text):
    match = re.search(r"### Sentiment: (\d)", text)
    return int(match.group(1)) if match else None

## Base Model

In [12]:
# Load the tokenizer and model
tokenizer_base = AutoTokenizer.from_pretrained(base_model)
# model_oneInstruction = AutoModelForCausalLM.from_pretrained(fine_tuned_model_oneInstruction).to(device)
model_base = AutoModelForCausalLM.from_pretrained(base_model, device_map="auto")

totalBlueScore_base = []
totalROUEScore_base = []
totalBERTScore_base = []


for i in range(len(training_instance_forPredictionList)):
    pipe1 = pipeline(task="text-generation", model=model_base, tokenizer=tokenizer_base)
    result1 = pipe1(training_instance_forPredictionList[i], max_new_tokens=1)
    
    # reference -> ground truth 
    # candidate -> prediction 
    reference = str(training_original[i]['feeling'])
    candidate_notPulled = result1[0]['generated_text']
    candiate_sentimentPulled = str(extract_candidate_sentiment(candidate_notPulled))
    print("Original: ", training_instances[i])
    print("Prompt: ", training_instance_forPredictionList[i])
    print("Reference: ", reference)
    print("Candidate: ", candiate_sentimentPulled)
    print("**********")
    
    
    # reference_tokens = word_tokenize(reference)
    # candidate_tokens = word_tokenize(candiate_sentimentPulled)
    
    # print("Score: ")
    # Calculating BLEU score
    bleu_score = sentence_bleu([reference], candiate_sentimentPulled)
    # print(f"BLEU score: {bleu_score}")
    totalBlueScore_base.append(bleu_score)
    
    # Creating a ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    # Calculating ROUGE-L
    scores = scorer.score(reference, candiate_sentimentPulled)
    # print(f"ROUGE-L score: {scores['rougeL'].fmeasure}")
    totalROUEScore_base.append(scores['rougeL'].fmeasure)
    
    # Calculating BERTScore
    P, R, F1 = score([candiate_sentimentPulled], [reference], lang="en", verbose=True)

    # Printing the F1 score
    # print(f"BERTScore F1: {F1.mean().item()}")
    totalBERTScore_base.append(F1.mean().item())
    
    # print("-------")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Campfire with the family and kaileys friends ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Campfire with the family and kaileys friends ### Sentiment: 
Reference:  1
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.05 seconds, 18.42 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Gained more weight ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Gained more weight ### Sentiment: 
Reference:  0
Candidate:  0
**********


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 30.75 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @pattieparker - yeah it provides a little outlet. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @pattieparker - yeah it provides a little outlet. ### Sentiment: 
Reference:  1
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 32.46 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @slbp2008 Your pedantry is impressive ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @slbp2008 Your pedantry is impressive ### Sentiment: 
Reference:  1
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 32.41 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hahahh i love my brother @pro42089 he is fucking crazy. well off to ny and nj tomorrow, i really wish sam was going too !!  blah. ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hahahh i love my brother @pro42089 he is fucking crazy. well off to ny and nj tomorrow, i really wish sam was going too !!  blah. ### Sentiment: 
Reference:  0
Candidate:  None
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 32.31 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @glennbeck Praying for strength &amp; peace for you. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @glennbeck Praying for strength &amp; peace for you. ### Sentiment: 
Reference:  1
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 24.11 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: I hate the rain when I'm in a car ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: I hate the rain when I'm in a car ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.25 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @kyrocks  no we will have to hook up soon! ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @kyrocks  no we will have to hook up soon! ### Sentiment: 
Reference:  0
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 32.42 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @stillsoul Oh! Your soo welcome ...that makes me happy ! Now you must pay it forward and spread the love! ;) ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @stillsoul Oh! Your soo welcome ...that makes me happy ! Now you must pay it forward and spread the love! ;) ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 32.99 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Im so sick of working sundays........ ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Im so sick of working sundays........ ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.04 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Now im sad that im gonna miss Quan's grad. from USC ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Now im sad that im gonna miss Quan's grad. from USC ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 32.98 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @bfheroes Lol And alot of NDA breakers, hackers, and other mischief as well. Good thing I'm around to enforce it. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @bfheroes Lol And alot of NDA breakers, hackers, and other mischief as well. Good thing I'm around to enforce it. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 25.67 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @shaunjumpnow  ahh i  wish i could be laying in bed all day...but i had 10-3 shift at work...AND im sick as ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @shaunjumpnow  ahh i  wish i could be laying in bed all day...but i had 10-3 shift at work...AND im sick as ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.45 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @hezmcfly haha same. in college so cant be bothered to do the work! and most websites are blocked =/   xxx ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @hezmcfly haha same. in college so cant be bothered to do the work! and most websites are blocked =/   xxx ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.22 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hello Taipei. 9 nights of drinking in a row, and counting, Sorry liver ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hello Taipei. 9 nights of drinking in a row, and counting, Sorry liver ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.07 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @VerityRobinson how come?!    X ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @VerityRobinson how come?!    X ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.06 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Damn lost my voice no morning singing for me ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Damn lost my voice no morning singing for me ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.33 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: it's soooooo cold!!! my poor little fingers are suffering ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: it's soooooo cold!!! my poor little fingers are suffering ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 28.57 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @ohmymae oh  i see. make you're MSN email right now!  i'm online. hahaha. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @ohmymae oh  i see. make you're MSN email right now!  i'm online. hahaha. ### Sentiment: 
Reference:  1
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 22.63 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: OMFGGGGGGGGGGGG. I FAILED MY RECENT STATS EXAM. FAILED. FAILED. HE SAID 7 PEOPLE FAILED. AND I WAS ONE OF THEM. WHAT IS THIS?! ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: OMFGGGGGGGGGGGG. I FAILED MY RECENT STATS EXAM. FAILED. FAILED. HE SAID 7 PEOPLE FAILED. AND I WAS ONE OF THEM. WHAT IS THIS?! ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 32.83 sentences/sec


In [13]:
del tokenizer_base
del model_base
del pipe1

gc.collect()

11271

## Using Fine tuned only on one instruction -> custom

In [14]:
# Load the tokenizer and model
tokenizer_oneInstruction = AutoTokenizer.from_pretrained(fine_tuned_model_oneInstruction)
# model_oneInstruction = AutoModelForCausalLM.from_pretrained(fine_tuned_model_oneInstruction).to(device)
model_oneInstruction = AutoModelForCausalLM.from_pretrained(fine_tuned_model_oneInstruction, device_map="auto")

totalBlueScore_oneInstruction = []
totalROUEScore_oneInstruction = []
totalBERTScore_oneInstruction = []


for i in range(len(training_instance_forPredictionList)):
    pipe1 = pipeline(task="text-generation", model=model_oneInstruction, tokenizer=tokenizer_oneInstruction)
    result1 = pipe1(training_instance_forPredictionList[i], max_new_tokens=1)
    
    # reference -> ground truth 
    # candidate -> prediction 
    reference = str(training_original[i]['feeling'])
    candidate_notPulled = result1[0]['generated_text']
    candiate_sentimentPulled = str(extract_candidate_sentiment(candidate_notPulled))
    print("Original: ", training_instances[i])
    print("Prompt: ", training_instance_forPredictionList[i])
    print("Reference: ", reference)
    print("Candidate: ", candiate_sentimentPulled)
    print("**********")
    
    
    # reference_tokens = word_tokenize(reference)
    # candidate_tokens = word_tokenize(candiate_sentimentPulled)
    
    # print("Score: ")
    # Calculating BLEU score
    bleu_score = sentence_bleu([reference], candiate_sentimentPulled)
    # print(f"BLEU score: {bleu_score}")
    totalBlueScore_oneInstruction.append(bleu_score)
    
    # Creating a ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    # Calculating ROUGE-L
    scores = scorer.score(reference, candiate_sentimentPulled)
    # print(f"ROUGE-L score: {scores['rougeL'].fmeasure}")
    totalROUEScore_oneInstruction.append(scores['rougeL'].fmeasure)
    
    # Calculating BERTScore
    P, R, F1 = score([candiate_sentimentPulled], [reference], lang="en", verbose=True)

    # Printing the F1 score
    # print(f"BERTScore F1: {F1.mean().item()}")
    totalBERTScore_oneInstruction.append(F1.mean().item())
    
    # print("-------")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Campfire with the family and kaileys friends ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Campfire with the family and kaileys friends ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 29.37 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Gained more weight ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Gained more weight ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.96 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @pattieparker - yeah it provides a little outlet. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @pattieparker - yeah it provides a little outlet. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.44 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @slbp2008 Your pedantry is impressive ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @slbp2008 Your pedantry is impressive ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.05 seconds, 20.73 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hahahh i love my brother @pro42089 he is fucking crazy. well off to ny and nj tomorrow, i really wish sam was going too !!  blah. ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hahahh i love my brother @pro42089 he is fucking crazy. well off to ny and nj tomorrow, i really wish sam was going too !!  blah. ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.79 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @glennbeck Praying for strength &amp; peace for you. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @glennbeck Praying for strength &amp; peace for you. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 34.02 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: I hate the rain when I'm in a car ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: I hate the rain when I'm in a car ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.66 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @kyrocks  no we will have to hook up soon! ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @kyrocks  no we will have to hook up soon! ### Sentiment: 
Reference:  0
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.13 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @stillsoul Oh! Your soo welcome ...that makes me happy ! Now you must pay it forward and spread the love! ;) ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @stillsoul Oh! Your soo welcome ...that makes me happy ! Now you must pay it forward and spread the love! ;) ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.86 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Im so sick of working sundays........ ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Im so sick of working sundays........ ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 34.12 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Now im sad that im gonna miss Quan's grad. from USC ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Now im sad that im gonna miss Quan's grad. from USC ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.85 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @bfheroes Lol And alot of NDA breakers, hackers, and other mischief as well. Good thing I'm around to enforce it. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @bfheroes Lol And alot of NDA breakers, hackers, and other mischief as well. Good thing I'm around to enforce it. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 34.05 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @shaunjumpnow  ahh i  wish i could be laying in bed all day...but i had 10-3 shift at work...AND im sick as ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @shaunjumpnow  ahh i  wish i could be laying in bed all day...but i had 10-3 shift at work...AND im sick as ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.60 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @hezmcfly haha same. in college so cant be bothered to do the work! and most websites are blocked =/   xxx ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @hezmcfly haha same. in college so cant be bothered to do the work! and most websites are blocked =/   xxx ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.76 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hello Taipei. 9 nights of drinking in a row, and counting, Sorry liver ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hello Taipei. 9 nights of drinking in a row, and counting, Sorry liver ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.81 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @VerityRobinson how come?!    X ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @VerityRobinson how come?!    X ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 34.27 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Damn lost my voice no morning singing for me ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Damn lost my voice no morning singing for me ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.90 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: it's soooooo cold!!! my poor little fingers are suffering ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: it's soooooo cold!!! my poor little fingers are suffering ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.68 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @ohmymae oh  i see. make you're MSN email right now!  i'm online. hahaha. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @ohmymae oh  i see. make you're MSN email right now!  i'm online. hahaha. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 34.16 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: OMFGGGGGGGGGGGG. I FAILED MY RECENT STATS EXAM. FAILED. FAILED. HE SAID 7 PEOPLE FAILED. AND I WAS ONE OF THEM. WHAT IS THIS?! ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: OMFGGGGGGGGGGGG. I FAILED MY RECENT STATS EXAM. FAILED. FAILED. HE SAID 7 PEOPLE FAILED. AND I WAS ONE OF THEM. WHAT IS THIS?! ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.70 sentences/sec


In [15]:
del tokenizer_oneInstruction
del model_oneInstruction
del pipe1

gc.collect()

26347

## Using fine tuned Model two instruction -> Custom + Alpaca

In [16]:
# Load the tokenizer and model
tokenizer_twoInstruction = AutoTokenizer.from_pretrained(fine_tuned_model_twoInstruction)
model_twoInstruction = AutoModelForCausalLM.from_pretrained(fine_tuned_model_twoInstruction, device_map="auto")

totalBlueScore_twoInstruction = []
totalROUEScore_twoInstruction = []
totalBERTScore_twoInstruction = []


for i in range(len(training_instance_forPredictionList)):
    pipe1 = pipeline(task="text-generation", model=model_twoInstruction, tokenizer=tokenizer_twoInstruction)
    result1 = pipe1(training_instance_forPredictionList[i], max_new_tokens=1)
    
    # reference -> ground truth 
    # candidate -> prediction 
    reference = str(training_original[i]['feeling'])
    candidate_notPulled = result1[0]['generated_text']
    candiate_sentimentPulled = str(extract_candidate_sentiment(candidate_notPulled))
    print("Original: ", training_instances[i])
    print("Prompt: ", training_instance_forPredictionList[i])
    print("Reference: ", reference)
    print("Candidate: ", candiate_sentimentPulled)
    print("**********")
    
    
    # reference_tokens = word_tokenize(reference)
    # candidate_tokens = word_tokenize(candiate_sentimentPulled)
    
    # print("Score: ")
    # Calculating BLEU score
    bleu_score = sentence_bleu([reference], candiate_sentimentPulled)
    # print(f"BLEU score: {bleu_score}")
    totalBlueScore_twoInstruction.append(bleu_score)
    
    # Creating a ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    # Calculating ROUGE-L
    scores = scorer.score(reference, candiate_sentimentPulled)
    # print(f"ROUGE-L score: {scores['rougeL'].fmeasure}")
    totalROUEScore_twoInstruction.append(scores['rougeL'].fmeasure)
    
    # Calculating BERTScore
    P, R, F1 = score([candiate_sentimentPulled], [reference], lang="en", verbose=True)

    # Printing the F1 score
    # print(f"BERTScore F1: {F1.mean().item()}")
    totalBERTScore_twoInstruction.append(F1.mean().item())
    
    # print("-------")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Campfire with the family and kaileys friends ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Campfire with the family and kaileys friends ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.42 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Gained more weight ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Gained more weight ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.86 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @pattieparker - yeah it provides a little outlet. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @pattieparker - yeah it provides a little outlet. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 34.09 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @slbp2008 Your pedantry is impressive ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @slbp2008 Your pedantry is impressive ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.74 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hahahh i love my brother @pro42089 he is fucking crazy. well off to ny and nj tomorrow, i really wish sam was going too !!  blah. ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hahahh i love my brother @pro42089 he is fucking crazy. well off to ny and nj tomorrow, i really wish sam was going too !!  blah. ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.69 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @glennbeck Praying for strength &amp; peace for you. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @glennbeck Praying for strength &amp; peace for you. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.87 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: I hate the rain when I'm in a car ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: I hate the rain when I'm in a car ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 34.03 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @kyrocks  no we will have to hook up soon! ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @kyrocks  no we will have to hook up soon! ### Sentiment: 
Reference:  0
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.36 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @stillsoul Oh! Your soo welcome ...that makes me happy ! Now you must pay it forward and spread the love! ;) ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @stillsoul Oh! Your soo welcome ...that makes me happy ! Now you must pay it forward and spread the love! ;) ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 34.03 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Im so sick of working sundays........ ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Im so sick of working sundays........ ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 31.13 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Now im sad that im gonna miss Quan's grad. from USC ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Now im sad that im gonna miss Quan's grad. from USC ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.67 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @bfheroes Lol And alot of NDA breakers, hackers, and other mischief as well. Good thing I'm around to enforce it. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @bfheroes Lol And alot of NDA breakers, hackers, and other mischief as well. Good thing I'm around to enforce it. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.95 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @shaunjumpnow  ahh i  wish i could be laying in bed all day...but i had 10-3 shift at work...AND im sick as ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @shaunjumpnow  ahh i  wish i could be laying in bed all day...but i had 10-3 shift at work...AND im sick as ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.85 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @hezmcfly haha same. in college so cant be bothered to do the work! and most websites are blocked =/   xxx ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @hezmcfly haha same. in college so cant be bothered to do the work! and most websites are blocked =/   xxx ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.05 seconds, 21.20 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hello Taipei. 9 nights of drinking in a row, and counting, Sorry liver ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: hello Taipei. 9 nights of drinking in a row, and counting, Sorry liver ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.51 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @VerityRobinson how come?!    X ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @VerityRobinson how come?!    X ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.81 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Damn lost my voice no morning singing for me ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: Damn lost my voice no morning singing for me ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.76 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: it's soooooo cold!!! my poor little fingers are suffering ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: it's soooooo cold!!! my poor little fingers are suffering ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.04 seconds, 28.53 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @ohmymae oh  i see. make you're MSN email right now!  i'm online. hahaha. ### Sentiment: 1
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: @ohmymae oh  i see. make you're MSN email right now!  i'm online. hahaha. ### Sentiment: 
Reference:  1
Candidate:  1
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.99 sentences/sec
Original:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: OMFGGGGGGGGGGGG. I FAILED MY RECENT STATS EXAM. FAILED. FAILED. HE SAID 7 PEOPLE FAILED. AND I WAS ONE OF THEM. WHAT IS THIS?! ### Sentiment: 0
Prompt:  Your task is to classify the the text into one of two feelings. Each feeling has two possible values: 0 indicates the text has a negative sentiment, while 1 indicates a positive feeling. ### Text: OMFGGGGGGGGGGGG. I FAILED MY RECENT STATS EXAM. FAILED. FAILED. HE SAID 7 PEOPLE FAILED. AND I WAS ONE OF THEM. WHAT IS THIS?! ### Sentiment: 
Reference:  0
Candidate:  0
**********


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.03 seconds, 33.76 sentences/sec


In [17]:
del tokenizer_twoInstruction
del model_twoInstruction
del pipe1

gc.collect()

26347

# Final Score - Base / One Instruction / Two Instruction Model

In [18]:
print("Final Scores: ")

print("------------  Base -----------------------------")
meanBleuBase = sum(totalBlueScore_base) / len(totalBlueScore_base)
meanROUEScoreBase = sum(totalROUEScore_base) / len(totalROUEScore_base)
meanBERTScoreBase = sum(totalBERTScore_base) / len(totalBERTScore_base)
print("Blue: ", meanBleuBase)
print("ROUGE: ", meanROUEScoreBase)
print("BERTScore: ", meanBERTScoreBase)


print("------------  One Instruction -----------------------------")
meanBleuOneInstruction = sum(totalBlueScore_oneInstruction) / len(totalBlueScore_oneInstruction)
meanROUEScoreOneInstruction = sum(totalROUEScore_oneInstruction) / len(totalROUEScore_oneInstruction)
meanBERTScoreOneInstruction = sum(totalBERTScore_oneInstruction) / len(totalBERTScore_oneInstruction)
print("Blue: ", meanBleuOneInstruction)
print("ROUGE: ", meanROUEScoreOneInstruction)
print("BERTScore: ", meanBERTScoreOneInstruction)


print("------------  Two Instruction -----------------------------")
meanBleuTwoInstruction = sum(totalBlueScore_twoInstruction) / len(totalBlueScore_twoInstruction)
meanROUEScoreTwoInstruction = sum(totalROUEScore_twoInstruction) / len(totalROUEScore_twoInstruction)
meanBERTScoreTwoInstruction = sum(totalBERTScore_twoInstruction) / len(totalBERTScore_twoInstruction)
print("Blue: ", meanBleuTwoInstruction)
print("ROUGE: ", meanROUEScoreTwoInstruction)
print("BERTScore: ", meanBERTScoreTwoInstruction)

Final Scores: 
------------  Base -----------------------------
Blue:  1.1841907931394724e-231
ROUGE:  0.65
BERTScore:  0.9929698824882507
------------  One Instruction -----------------------------
Blue:  1.7307403899730744e-231
ROUGE:  0.95
BERTScore:  0.9990181028842926
------------  Two Instruction -----------------------------
Blue:  1.7307403899730744e-231
ROUGE:  0.95
BERTScore:  0.9990181028842926
