In [7]:
import pandas as pd
# Example test data as a list of dictionaries
test_data = [
    {
        "User Query": "I’m feeling overwhelmed with coursework deadlines. What should I do?",
        "Reference Responses": [
            "Try breaking tasks into smaller steps and setting realistic deadlines. Also, consider speaking with your professor for extensions if needed.",
            "Managing coursework can be stressful. Try using a planner to prioritize tasks and allocate dedicated time slots for each assignment.",
            "Overwhelm is normal during busy periods. Take breaks, use the Pomodoro technique, and seek academic support if necessary."
        ],
        "Candidate Responses": 
"Create a Master Calendar: Use a digital tool like Google Calendar to note down all your deadlines. This will give you a clear overview of what needs to be done and when."
"Break Large Assignments into Smaller Tasks: If you have multiple assignments due in a week, break each one into smaller tasks such as research, outline, draft, and edit. This will make the tasks seem less daunting and more manageable."
"Use Time-Blocking Techniques: Allocate specific time blocks for each task in your calendar. For instance, you could dedicate Monday morning to researching Assignment 1 and Monday afternoon to outlining Assignment 2."
"Set Up Reminder Systems: Use your digital calendar or planner to set reminders for deadlines. This will ensure that you don't forget any important dates."
"Create a Dedicated Study Schedule: Plan your study periods and stick to them. This will help you to stay focused and make the most of your study time."
"Maintain Separate Folders for Each Course: This will help you to stay organized and easily locate the materials you need for each assignment."
},
    {
        "User Query": "How can I balance studies and personal life without feeling guilty?",
        "Reference Responses": [
            "Setting boundaries is key. Allocate specific study hours and dedicate time to personal activities guilt-free.",
            "Academic success depends on balance. Plan your schedule to ensure you have time for rest and social activities.",
            "Feeling guilty is common, but self-care improves productivity. Treat breaks as necessary, not a distraction."
        ],
        "Candidate Responses": "Create a Structured Daily Schedule: Designate specific hours for studying, for example, from 9 AM to 6 PM. This helps to create a clear distinction between your study time and personal time, allowing you to fully focus on each aspect without feeling guilty about neglecting the other."
"Set Boundaries for Study and Personal Time: Protect your evening time for relaxation, hobbies, or socializing. This helps to ensure that you have sufficient time to unwind and recharge, which is crucial for maintaining your mental health and overall well-being."
"Prioritize Physical Activity and Social Connections: Regular physical activity can help to reduce stress and improve your mood, while maintaining social connections can provide emotional support and a sense of belonging."
"Use a Planner or Digital Calendar: This can help you to track deadlines and events, and manage your time more effectively. You can schedule study blocks and free time to prevent burnout, and prioritize tasks based on their urgency and importance."
"Learn to Say No: If social invites clash with crucial deadlines, it's important to prioritize your academic responsibilities. However, remember to celebrate small academic wins to stay motivated and feel a sense of achievement."
    },
    # You can add more test items as needed...
]

# Convert the test data into a pandas DataFrame for easier manipulation and review
df = pd.DataFrame(test_data)
print(df)


                                          User Query  \
0  I’m feeling overwhelmed with coursework deadli...   
1  How can I balance studies and personal life wi...   

                                 Reference Responses  \
0  [Try breaking tasks into smaller steps and set...   
1  [Setting boundaries is key. Allocate specific ...   

                                 Candidate Responses  
0  Create a Master Calendar: Use a digital tool l...  
1  Create a Structured Daily Schedule: Designate ...  


In [8]:
def compute_f1(candidate, reference):
    """
    Compute token-level F1 score between candidate and reference strings.
    """
    # Tokenize and convert to lower case for consistency
    candidate_tokens = word_tokenize(candidate.lower())
    reference_tokens = word_tokenize(reference.lower())
    
    # Count the overlapping tokens (accounting for frequency)
    common = Counter(candidate_tokens) & Counter(reference_tokens)
    num_same = sum(common.values())
    
    if num_same == 0:
        return 0.0

    precision = num_same / len(candidate_tokens)
    recall = num_same / len(reference_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1


In [9]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Create a smoothing function instance (helps when sentences are short)
cc = SmoothingFunction()

def compute_bleu(candidate, references):
    """
    Compute the BLEU score for a candidate response against multiple reference responses.
    """
    candidate_tokens = word_tokenize(candidate.lower())
    references_tokens = [word_tokenize(ref.lower()) for ref in references]
    
    # Calculate BLEU score with smoothing
    bleu_score = sentence_bleu(references_tokens, candidate_tokens, smoothing_function=cc.method1)
    return bleu_score


In [10]:
def compute_rouge(candidate, references):
    """
    Compute average ROUGE-1 and ROUGE-L f-measures for the candidate against multiple references.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    
    # Calculate ROUGE scores for each reference response
    scores = [scorer.score(ref, candidate) for ref in references]
    
    avg_scores = {}
    for key in scores[0].keys():
        avg_scores[key] = sum(score[key].fmeasure for score in scores) / len(scores)
    return avg_scores


In [12]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from collections import Counter  # Importing Counter here
from rouge_score import rouge_scorer
from bert_score import score

results = []

for idx, row in df.iterrows():
    user_query = row["User Query"]
    references = row["Reference Responses"]
    candidate = row["Candidate Responses"]
    
    # Existing evaluation metrics
    f1_scores = [compute_f1(candidate, ref) for ref in references]
    best_f1 = max(f1_scores)
    bleu = compute_bleu(candidate, references)
    rouge = compute_rouge(candidate, references)
    
    # Compute BERTScore:
    # Here, we'll compute the maximum BERTScore across references.
    bert_f1_scores = []
    for ref in references:
        _, _, bert_F1 = score([candidate], [ref], lang="en", verbose=False)
        bert_f1_scores.append(bert_F1[0].item())
    max_bert_f1 = max(bert_f1_scores)
    
    results.append({
        "User Query": user_query,
        "Candidate Response": candidate,
        "F1 Score": best_f1,
        "BLEU Score": bleu,
        "ROUGE-1": rouge['rouge1'],
        "ROUGE-L": rouge['rougeL'],
        "BERTScore F1": max_bert_f1
    })

results_df = pd.DataFrame(results)
print(results_df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\osato\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for

                                          User Query  \
0  I’m feeling overwhelmed with coursework deadli...   
1  How can I balance studies and personal life wi...   

                                  Candidate Response  F1 Score  BLEU Score  \
0  Create a Master Calendar: Use a digital tool l...  0.131455    0.022593   
1  Create a Structured Daily Schedule: Designate ...  0.121212    0.005283   

    ROUGE-1   ROUGE-L  BERTScore F1  
0  0.106902  0.083468      0.865677  
1  0.094401  0.075503      0.868632  


In [14]:
import pandas as pd
from transformers import pipeline

# Initialize the sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

# Define the sentiment analysis function
def analyze_sentiment(response):
    result = sentiment_analyzer(response)
    # Extract the label and score from the result
    return result[0]["label"], result[0]["score"]

# Assuming your DataFrame has a column named "Candidate Response"
# If your column is actually "Candidate Responses", adjust accordingly.
df["Sentiment"], df["Sentiment Score"] = zip(*df["Candidate Responses"].apply(analyze_sentiment))

# Print the DataFrame columns to verify
print(df[["Candidate Responses", "Sentiment", "Sentiment Score"]])


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


                                 Candidate Responses Sentiment  \
0  Create a Master Calendar: Use a digital tool l...  POSITIVE   
1  Create a Structured Daily Schedule: Designate ...  POSITIVE   

   Sentiment Score  
0         0.938456  
1         0.997873  


In [13]:
pip install bert-score

Collecting bert-scoreNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for bert-score from https://files.pythonhosted.org/packages/c6/8c/bc5457de4c004b1a623b31f7bc8d0375fb699b7d67df11879098b4b7b7c8/bert_score-0.3.13-py3-none-any.whl.metadata
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
   ---------------------------------------- 0.0/61.1 kB ? eta -:--:--
   ------ --------------------------------- 10.2/61.1 kB ? eta -:--:--
   --------------------------------- ------ 51.2/61.1 kB 660.6 kB/s eta 0:00:01
   ---------------------------------------- 61.1/61.1 kB 820.9 kB/s eta 0:00:00
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13
