In [1]:
import os
import time
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, LLMChain
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_core.retrievers import BaseRetriever
from langchain.retrievers import EnsembleRetriever
from evaluate import load
from sentence_transformers import SentenceTransformer, util
from docx import Document as DocxDocument
from pydantic import Field
from typing import List, Any
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_community.document_loaders import CSVLoader

loader = CSVLoader(file_path="mental_health_counseling_conversations_final_v2.csv", encoding="ISO-8859-1")
documents = loader.load()

In [3]:
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)


Created a chunk of size 777, which is longer than the specified 500
Created a chunk of size 1861, which is longer than the specified 500
Created a chunk of size 540, which is longer than the specified 500
Created a chunk of size 774, which is longer than the specified 500
Created a chunk of size 1882, which is longer than the specified 500


In [4]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(split_docs, embedding_model, persist_directory="./chroma_db")


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [5]:
# ------------------- Load Mistral Locally via Ollama -------------------
llm = Ollama(model="mistral")


  llm = Ollama(model="mistral")


In [6]:
# ------------------- Define Prompts -------------------
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a compassionate mental health assistant.
Use the following context to answer the question empathetically.

Context: {context}
Question: {question}
Answer:
"""
)
vanilla_prompt = PromptTemplate(
    input_variables=["question"],
    template="You are a compassionate mental health assistant. Answer the following question empathetically:\n{question}\nAnswer:"
)



In [7]:
# ------------------- Create Chains -------------------
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
)

vanilla_chain = LLMChain(llm=llm, prompt=vanilla_prompt)
retrieval_only_chain = vectorstore.as_retriever()


  vanilla_chain = LLMChain(llm=llm, prompt=vanilla_prompt)


In [8]:
# ------------------- Setup Hybrid Retrieval -------------------
vectorstore_dense = Chroma.from_documents(split_docs, embedding_model, persist_directory="./chroma_db_dense")

texts = [doc.page_content for doc in split_docs]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

class SparseRetriever(BaseRetriever):
    texts: List[str] = Field(...)
    vectorizer: Any = Field(...)
    matrix: Any = Field(...)

    def _get_relevant_documents(self, query, *, run_manager=None):
        query_vec = self.vectorizer.transform([query])
        similarities = (self.matrix * query_vec.T).toarray().flatten()
        top_indices = similarities.argsort()[::-1][:3]
        return [Document(page_content=self.texts[i]) for i in top_indices]

    async def aget_relevant_documents(self, query, *, run_manager=None):
        return self._get_relevant_documents(query)

sparse_retriever = SparseRetriever(texts=texts, vectorizer=tfidf_vectorizer, matrix=tfidf_matrix)

retriever_hybrid = EnsembleRetriever(
    retrievers=[
        vectorstore_dense.as_retriever(),
        sparse_retriever
    ]
)

qa_chain_hybrid = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever_hybrid,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
)


  class SparseRetriever(BaseRetriever):


In [9]:
# ------------------- Load Metrics -------------------
bleu = load("bleu")
rouge = load("rouge")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

Using the latest cached version of the module from C:\Users\saibe\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--bleu\9e0985c1200e367cce45605ce0ecb5ede079894e0f24f54613fca08eeb8aff76 (last modified on Fri Apr 25 22:03:28 2025) since it couldn't be found locally at evaluate-metric--bleu, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\saibe\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--rouge\b01e0accf3bd6dd24839b769a5fda24e14995071570870922c71970b3a6ed886 (last modified on Fri Apr 25 22:06:52 2025) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.


In [10]:
def evaluate_response(model_name, reference, prediction, start_time, end_time, embedder):
    reference_tokens = reference.split()
    prediction_tokens = prediction.split()

    # BLEU Scores
    bleu1 = bleu.compute(predictions=[prediction], references=[[reference]], max_order=1)['bleu']
    bleu2 = bleu.compute(predictions=[prediction], references=[[reference]], max_order=2)['bleu']
    bleu4 = bleu.compute(predictions=[prediction], references=[[reference]], max_order=4)['bleu']

    # ROUGE Scores
    rouge_scores = rouge.compute(predictions=[prediction], references=[reference])
    rouge1 = rouge_scores['rouge1']
    rouge2 = rouge_scores['rouge2']
    rougeL = rouge_scores['rougeL']

    # Lengths
    reference_length = len(reference_tokens)
    prediction_length = len(prediction_tokens)
    reference_chars = len(reference)
    prediction_chars = len(prediction)

    # Contextual Relevance
    ref_embed = embedder.encode(reference, convert_to_tensor=True)
    pred_embed = embedder.encode(prediction, convert_to_tensor=True)
    contextual_relevance = util.cos_sim(ref_embed, pred_embed).item()

    # Semantic Similarity
    semantic_similarity = util.cos_sim(ref_embed, pred_embed).item()

    # Diversity
    unigrams = set(prediction_tokens)
    bigrams = set(ngrams(prediction_tokens, 2)) if len(prediction_tokens) >= 2 else set()
    distinct1 = len(unigrams) / (len(prediction_tokens) + 1e-10)
    distinct2 = len(bigrams) / (len(prediction_tokens) + 1e-10)

    # Time
    response_time = end_time - start_time

    # Return results dictionary
    result = {
        f"{model_name} BLEU-1": bleu1,
        f"{model_name} BLEU-2": bleu2,
        f"{model_name} BLEU-4": bleu4,
        f"{model_name} ROUGE-1": rouge1,
        f"{model_name} ROUGE-2": rouge2,
        f"{model_name} ROUGE-L": rougeL,
        f"{model_name} Reference Length (tokens)": reference_length,
        f"{model_name} Prediction Length (tokens)": prediction_length,
        f"{model_name} Reference Length (chars)": reference_chars,
        f"{model_name} Prediction Length (chars)": prediction_chars,
        f"{model_name} Contextual Relevance": contextual_relevance,
        f"{model_name} Semantic Similarity": semantic_similarity,
        f"{model_name} Distinct-1": distinct1,
        f"{model_name} Distinct-2": distinct2,
        f"{model_name} Response Time (s)": response_time,
    }

    return result


In [11]:
# ------------------- Evaluate -------------------
test_questions = [
    "Why do I feel anxious even when nothing is wrong?",
    "How can I stop overthinking every situation?",
    "What should I do when I feel sad for no reason?",
    "I often feel lonely even around people. Why?",
    "How do I deal with feelings of guilt I can't shake off?",
    "What are healthy ways to manage anger?",
    "Why do I sometimes cry without knowing why?",
    "How can I stop feeling emotionally numb?",
    "What should I do when I feel overwhelmed with emotions?",
    "How do I handle intense mood swings?",
    "How do I set healthy boundaries with my family?",
    "Why do I feel insecure in my relationship?",
    "How can I rebuild trust after being betrayed?",
    "What if I feel disconnected from my partner?",
    "How do I handle toxic friendships?",
    "How can I improve communication with my parents?",
    "I’m scared of abandonment. How do I cope?",
    "How do I manage jealousy in a relationship?",
    "What are signs of an unhealthy friendship?",
    "How can I move on from a breakup?",
    "How do I deal with constant work pressure?",
    "I feel burnt out but can't quit. What can I do?",
    "How can I manage imposter syndrome at my job?",
    "How do I stay motivated when I feel stuck in my career?",
    "What should I do when my job causes anxiety?",
    "How do I balance work and personal life?",
    "How can I stop procrastinating important tasks?",
    "How do I handle criticism at work without breaking down?",
    "What are signs that I need a break from work?",
    "How do I stop feeling guilty about resting?",
    "Why do I feel like I'm never good enough?",
    "How can I build real self-confidence?",
    "How do I stop comparing myself to others?",
    "How do I accept parts of myself that I don't like?",
    "Why do compliments make me uncomfortable?",
    "How can I stop feeling like a failure?",
    "How do I deal with body image issues?",
    "What should I do if I don't know who I am anymore?",
    "How do I build a stronger sense of identity?",
    "How can I forgive myself for past mistakes?",
    "What are quick ways to calm down anxiety?",
    "How can breathing exercises help with stress?",
    "What are grounding techniques for panic attacks?",
    "How do I create a safe mental space when overwhelmed?",
    "What are healthy distractions when feeling down?",
    "How can journaling improve my mental health?",
    "What are some beginner mindfulness practices?",
    "How do I deal with intrusive thoughts?",
    "What are ways to self-soothe during a crisis?",
    "How do I make a coping skills plan for bad days?",
    "What do I do if I feel like life has no purpose?",
    "How do I find meaning during tough times?",
    "How can I stop feeling lost in life?",
    "Why do I fear change so much?",
    "How do I cope with existential dread?",
    "What if I feel like my life doesn't matter?",
    "How can I rediscover passion and joy?",
    "How do I handle fear of death and dying?",
    "How do I accept uncertainty in life?",
    "How can I feel more grounded when the world feels chaotic?",
    "What are common signs of depression?",
    "How do I manage daily life with PTSD?",
    "What are helpful strategies for OCD compulsions?",
    "How can I support someone with anxiety?",
    "What should I know about social anxiety?",
    "How do I cope with health anxiety?",
    "What are the differences between sadness and clinical depression?",
    "How do I manage depressive episodes without medication?",
    "How can I prepare for therapy sessions?",
    "What if therapy isn't helping me?",
    "How important is sleep for mental health?",
    "What are healthy sleep habits I can build?",
    "How does nutrition impact my mood?",
    "Can exercise really help with depression?",
    "How do I stay consistent with self-care routines?",
    "How do I practice gratitude when I feel hopeless?",
    "What is the role of hydration in mental well-being?",
    "How do I make time for self-care when life is busy?",
    "How can nature and time outside help my mental health?",
    "What are simple self-care ideas for when I'm low energy?",
    "What should I do during a mental health crisis?",
    "How can I build a crisis safety plan?",
    "Who can I reach out to if I feel unsafe?",
    "How do I manage suicidal thoughts?",
    "How do I talk to someone about my mental health struggles?",
    "What are small steps I can take when I feel like giving up?",
    "How can I break out of isolation during a crisis?",
    "What should I remember during a panic attack?",
    "How do I convince myself to seek help when it's hard?",
    "How do I ground myself when I'm dissociating?",
    "How do I build emotional resilience?",
    "How can I learn to tolerate emotional pain better?",
    "What does healthy emotional expression look like?",
    "How do I stay hopeful during dark times?",
    "How do I maintain mental health in uncertain times?",
    "How do I cope with setbacks and failures?",
    "How can I trust myself more?",
    "How can affirmations help with self-esteem?",
    "How do I develop a growth mindset?",
    "How can I celebrate small victories in healing?"
]

In [12]:

results = []
doc = DocxDocument()

for i, question in enumerate(test_questions):
    # RAG
    start = time.time()
    rag_response = rag_chain.run({"query": question})
    end = time.time()
    rag_metrics = evaluate_response("RAG", question, rag_response, start, end, embedder)

    # Vanilla
    start = time.time()
    vanilla_response = vanilla_chain.run({"question": question})
    end = time.time()
    vanilla_metrics = evaluate_response("Vanilla", question, vanilla_response, start, end, embedder)

    # Hybrid
    start = time.time()
    hybrid_response = qa_chain_hybrid.run({"query": question})
    end = time.time()
    hybrid_metrics = evaluate_response("Hybrid", question, hybrid_response, start, end, embedder)

    # Retrieval Only
    start = time.time()
    retrieved_docs = retrieval_only_chain.get_relevant_documents(question)
    retrieval_response = retrieved_docs[0].page_content if retrieved_docs else "No relevant context found."
    end = time.time()
    retrieval_metrics = evaluate_response("RetrievalOnly", question, retrieval_response, start, end, embedder)
    
    # Combine all results
    combined_result = {
        "Test #": i + 1,
        "Question": question,
        **rag_metrics,
        **vanilla_metrics,
        **hybrid_metrics,
        **retrieval_metrics,
    }
    results.append(combined_result)

    # Save responses in doc
    doc.add_heading(f"Test {i+1}: {question}", level=1)
    doc.add_paragraph(f"🔹 RAG Response:\n{rag_response}")
    doc.add_paragraph(f"🔸 Vanilla Response:\n{vanilla_response}")
    doc.add_paragraph(f"🌀 Hybrid Response:\n{hybrid_response}")
    doc.add_paragraph("\n---\n")

    # ✅ Print progress after each question
    print(f"✅ Completed {i+1}/{len(test_questions)} questions")


  rag_response = rag_chain.run({"query": question})
INFO:backoff:Backing off send_request(...) for 0.1s (requests.exceptions.ConnectionError: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /batch/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002DDC94BB390>: Failed to resolve 'us.i.posthog.com' ([Errno 11001] getaddrinfo failed)")))
INFO:backoff:Backing off send_request(...) for 0.1s (requests.exceptions.ConnectionError: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /batch/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002DDC94B8CD0>: Failed to resolve 'us.i.posthog.com' ([Errno 11001] getaddrinfo failed)")))
INFO:backoff:Backing off send_request(...) for 1.5s (requests.exceptions.ConnectionError: HTTPSConnectionPool(host='us.i.posthog.com', port=443): Max retries exceeded with url: /batch/ (Caused by NameResolutionError("<urllib3.co

✅ Completed 1/100 questions
✅ Completed 2/100 questions
✅ Completed 3/100 questions
✅ Completed 4/100 questions
✅ Completed 5/100 questions
✅ Completed 6/100 questions
✅ Completed 7/100 questions
✅ Completed 8/100 questions
✅ Completed 9/100 questions
✅ Completed 10/100 questions
✅ Completed 11/100 questions
✅ Completed 12/100 questions
✅ Completed 13/100 questions
✅ Completed 14/100 questions
✅ Completed 15/100 questions
✅ Completed 16/100 questions
✅ Completed 17/100 questions
✅ Completed 18/100 questions
✅ Completed 19/100 questions
✅ Completed 20/100 questions
✅ Completed 21/100 questions
✅ Completed 22/100 questions
✅ Completed 23/100 questions
✅ Completed 24/100 questions
✅ Completed 25/100 questions
✅ Completed 26/100 questions
✅ Completed 27/100 questions
✅ Completed 28/100 questions
✅ Completed 29/100 questions
✅ Completed 30/100 questions
✅ Completed 31/100 questions
✅ Completed 32/100 questions
✅ Completed 33/100 questions
✅ Completed 34/100 questions
✅ Completed 35/100 ques

In [13]:
results


[{'Test #': 1,
  'Question': 'Why do I feel anxious even when nothing is wrong?',
  'RAG BLEU-1': 0.018987341772151903,
  'RAG BLEU-2': 0.013447372472027633,
  'RAG BLEU-4': 0.007788569860014899,
  'RAG ROUGE-1': 0.04166666666666667,
  'RAG ROUGE-2': 0.02097902097902098,
  'RAG ROUGE-L': 0.034722222222222224,
  'RAG Reference Length (tokens)': 10,
  'RAG Prediction Length (tokens)': 275,
  'RAG Reference Length (chars)': 49,
  'RAG Prediction Length (chars)': 1809,
  'RAG Contextual Relevance': 0.7952066659927368,
  'RAG Semantic Similarity': 0.7952066659927368,
  'RAG Distinct-1': 0.6654545454543035,
  'RAG Distinct-2': 0.9418181818178394,
  'RAG Response Time (s)': 18.994036436080933,
  'Vanilla BLEU-1': 0.018018018018018025,
  'Vanilla BLEU-2': 0.0,
  'Vanilla BLEU-4': 0.0,
  'Vanilla ROUGE-1': 0.037383177570093455,
  'Vanilla ROUGE-2': 0.0,
  'Vanilla ROUGE-L': 0.028037383177570093,
  'Vanilla Reference Length (tokens)': 10,
  'Vanilla Prediction Length (tokens)': 200,
  'Vanilla R

In [14]:
final_df = pd.DataFrame(results)
final_df

Unnamed: 0,Test #,Question,RAG BLEU-1,RAG BLEU-2,RAG BLEU-4,RAG ROUGE-1,RAG ROUGE-2,RAG ROUGE-L,RAG Reference Length (tokens),RAG Prediction Length (tokens),...,RetrievalOnly ROUGE-L,RetrievalOnly Reference Length (tokens),RetrievalOnly Prediction Length (tokens),RetrievalOnly Reference Length (chars),RetrievalOnly Prediction Length (chars),RetrievalOnly Contextual Relevance,RetrievalOnly Semantic Similarity,RetrievalOnly Distinct-1,RetrievalOnly Distinct-2,RetrievalOnly Response Time (s)
0,1,Why do I feel anxious even when nothing is wrong?,0.018987,0.013447,0.007789,0.041667,0.020979,0.034722,10,275,...,0.016194,10,699,49,3730,0.624702,0.624702,0.476395,0.902718,0.013345
1,2,How can I stop overthinking every situation?,0.009231,0.000000,0.000000,0.020548,0.000000,0.020548,7,281,...,0.011928,7,1078,44,4014,0.477822,0.477822,0.267161,0.429499,0.012905
2,3,What should I do when I feel sad for no reason?,0.029240,0.022716,0.013808,0.082840,0.035928,0.071006,11,154,...,0.072993,11,118,47,643,0.617290,0.617290,0.694915,0.949153,0.012226
3,4,I often feel lonely even around people. Why?,0.032787,0.000000,0.000000,0.055249,0.011173,0.033149,8,167,...,0.046154,8,114,44,690,0.720264,0.720264,0.657895,0.964912,0.012395
4,5,How do I deal with feelings of guilt I can't s...,0.014535,0.011275,0.000000,0.038095,0.019169,0.031746,12,291,...,0.050000,12,214,55,1329,0.549433,0.549433,0.668224,0.971963,0.009307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,How do I cope with setbacks and failures?,0.010870,0.009426,0.006039,0.024242,0.018293,0.024242,8,314,...,0.014760,8,256,41,1396,0.381530,0.381530,0.625000,0.917969,0.014025
96,97,How can I trust myself more?,0.010526,0.000000,0.000000,0.023256,0.000000,0.023256,6,331,...,0.055556,6,130,28,785,0.647739,0.647739,0.807692,0.946154,0.008739
97,98,How can affirmations help with self-esteem?,0.019920,0.000000,0.000000,0.051502,0.017316,0.034335,6,212,...,0.029412,6,329,43,1815,0.554498,0.554498,0.583587,0.951368,0.009479
98,99,How do I develop a growth mindset?,0.019231,0.012186,0.000000,0.042373,0.017094,0.042373,7,223,...,0.056738,7,128,34,706,0.433122,0.433122,0.695312,0.968750,0.011111


In [15]:

# ------------------- Save Outputs -------------------
final_df = pd.DataFrame(results)
final_df.to_csv("evaluation_results_v5.csv", index=False)
doc.save("evaluation_responses_v5.docx")

print("✅ Final evaluation_results.csv and evaluation_responses.docx saved!")

✅ Final evaluation_results.csv and evaluation_responses.docx saved!


In [16]:
# ------------------- Compute Averages -------------------
# Drop non-numeric columns first (Test #, Question)
numeric_df = final_df.drop(columns=["Test #", "Question"])

# Compute mean for each column
average_metrics = numeric_df.mean().to_frame(name="Average Score")

# Save to a new CSV file
average_metrics.to_csv("evaluation_results_averagev5.csv")

print("✅ Final evaluation_results_averages.csv saved!")


✅ Final evaluation_results_averages.csv saved!


In [17]:
# ------------------- Compute and Group Averages -------------------

# Drop non-metric columns
numeric_df = final_df.drop(columns=["Test #", "Question"])

# Compute mean
mean_series = numeric_df.mean()

# Prepare grouped table
grouped_data = {}

# Go through each metric
for col in mean_series.index:
    # Split column name into model and metric parts
    model_name = col.split(' ')[0]  # First word (RAG / Vanilla / Hybrid)
    metric_name = ' '.join(col.split(' ')[1:])  # Rest is metric (BLEU-1, ROUGE-1, etc.)

    if metric_name not in grouped_data:
        grouped_data[metric_name] = {}

    grouped_data[metric_name][model_name] = mean_series[col]

# Convert into DataFrame
grouped_df = pd.DataFrame(grouped_data).T  # Transpose to get Metrics as rows
grouped_df = grouped_df[["RAG", "Vanilla", "Hybrid","RetrievalOnly"]]  # Order columns manually

# Save grouped result
grouped_df.to_csv("evaluation_results_grouped_averages_v5.csv")

print("✅ Final evaluation_results_grouped_averages.csv saved!")


✅ Final evaluation_results_grouped_averages.csv saved!


In [18]:
# ------------------- Empathy Rewriter Chain -------------------

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Define a new empathy rewriter prompt
empathy_rewriter_prompt = PromptTemplate(
    input_variables=["text"],
    template="""
You are a compassionate editor.
Rewrite the following mental health response to sound more emotionally supportive, validating, and human-like.
Preserve the meaning but enhance emotional warmth, kindness, and encouragement.

Original Response:
{text}

Empathy-Enhanced Response:
"""
)

# Build a second chain for rewriting using the same LLM (Mistral)
empathy_rewriter_chain = LLMChain(llm=llm, prompt=empathy_rewriter_prompt)


In [19]:
# ------------------- New Block: Empathy Rewriter for RAG -------------------

print("💗 Starting Empathy Rewriting for RAG Responses...")

# New Results Storage
empathy_rag_results = []
empathy_doc = DocxDocument()

for i, question in enumerate(test_questions):
    # Step 1: Normal RAG Response
    start = time.time()
    rag_response = rag_chain.run({"query": question})
    # Step 2: Empathy Rewriting
    rag_response_empathic = empathy_rewriter_chain.run({"text": rag_response})
    end = time.time()

    # Step 3: Evaluate Empathic RAG
    empathy_rag_metrics = evaluate_response("Empathy-RAG", question, rag_response_empathic, start, end, embedder)

    empathy_rag_results.append({
        "Test #": i + 1,
        "Question": question,
        **empathy_rag_metrics
    })

    # Step 4: Save in Word
    empathy_doc.add_heading(f"Empathy-RAG Test {i+1}: {question}", level=1)
    empathy_doc.add_paragraph(f"🧠 Original RAG Response:\n{rag_response}")
    empathy_doc.add_paragraph(f"💗 Empathy-Rewritten Response:\n{rag_response_empathic}")
    empathy_doc.add_paragraph("\n---\n")

    print(f"✅ Empathy Rewrite Completed {i+1}/{len(test_questions)} questions")

# Step 5: Save all Empathy RAG Outputs

# Save detailed results
empathy_rag_df = pd.DataFrame(empathy_rag_results)
empathy_rag_df.to_csv("evaluation_results_empathy_rag.csv", index=False)
empathy_doc.save("evaluation_responses_empathy_rag.docx")

print("🎉 Detailed empathy RAG evaluation saved: evaluation_results_empathy_rag.csv and evaluation_responses_empathy_rag.docx")

# Step 6: Compute Averages (for empathy RAG)

# Drop non-numeric columns first
empathy_numeric_df = empathy_rag_df.drop(columns=["Test #", "Question"])

# Compute mean for each metric
empathy_average_metrics = empathy_numeric_df.mean().to_frame(name="Average Score")

# Save to a new CSV
empathy_average_metrics.to_csv("evaluation_results_empathy_rag_averages.csv")

print("✅ Empathy-RAG average metrics saved: evaluation_results_empathy_rag_averages.csv")




💗 Starting Empathy Rewriting for RAG Responses...
✅ Empathy Rewrite Completed 1/100 questions
✅ Empathy Rewrite Completed 2/100 questions
✅ Empathy Rewrite Completed 3/100 questions
✅ Empathy Rewrite Completed 4/100 questions
✅ Empathy Rewrite Completed 5/100 questions
✅ Empathy Rewrite Completed 6/100 questions
✅ Empathy Rewrite Completed 7/100 questions
✅ Empathy Rewrite Completed 8/100 questions
✅ Empathy Rewrite Completed 9/100 questions
✅ Empathy Rewrite Completed 10/100 questions
✅ Empathy Rewrite Completed 11/100 questions
✅ Empathy Rewrite Completed 12/100 questions
✅ Empathy Rewrite Completed 13/100 questions
✅ Empathy Rewrite Completed 14/100 questions
✅ Empathy Rewrite Completed 15/100 questions
✅ Empathy Rewrite Completed 16/100 questions
✅ Empathy Rewrite Completed 17/100 questions
✅ Empathy Rewrite Completed 18/100 questions
✅ Empathy Rewrite Completed 19/100 questions
✅ Empathy Rewrite Completed 20/100 questions
✅ Empathy Rewrite Completed 21/100 questions
✅ Empathy Rewr

In [20]:
# Step 7: Group Averages by Metric Type

# Prepare grouped table
empathy_grouped_data = {}

# Go through each metric
for col in empathy_average_metrics.index:
    # Split column name into model and metric parts
    model_name = col.split(' ')[0]  # "Empathy-RAG"
    metric_name = ' '.join(col.split(' ')[1:])  # BLEU-1, ROUGE-L, etc.

    if metric_name not in empathy_grouped_data:
        empathy_grouped_data[metric_name] = {}

    empathy_grouped_data[metric_name][model_name] = empathy_average_metrics.loc[col, "Average Score"]

# Convert into DataFrame
empathy_grouped_df = pd.DataFrame(empathy_grouped_data).T  # Transpose to get metrics as rows
empathy_grouped_df = empathy_grouped_df[["Empathy-RAG"]]  # Order columns manually

# Save grouped results
empathy_grouped_df.to_csv("evaluation_results_empathy_rag_grouped_averages.csv")

print("✅ Empathy-RAG grouped averages saved: evaluation_results_empathy_rag_grouped_averages.csv")

# ------------------- End of Empathy Rewriter Block -------------------

✅ Empathy-RAG grouped averages saved: evaluation_results_empathy_rag_grouped_averages.csv
