In [76]:
import google.generativeai as genai
import re
import json
import os
import time


In [None]:
# Configure the API key.
# The key is read from the GOOGLE_API_KEY environment variable so the real secret
# never has to be pasted into (and saved or committed with) the notebook.
# The original placeholder is kept as the fallback, so the cell behaves exactly
# as before when the variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "Your key")
genai.configure(api_key=GOOGLE_API_KEY)

In [78]:
# Function to generate the evaluation prompt
def generate_gemini_prompt(conversation_text) -> str:
    """
    Build the evaluation prompt that is sent to Gemini for one conversation.

    The prompt asks the model to score the conversation from 0 to 10 on seven
    metrics (Relevance, Coherence, Factual Accuracy, Bias & Toxicity, Fluency,
    Image Alignment, Creativity) and to answer with a JSON object that maps each
    metric name to a ``{"score": ..., "explanation": ...}`` entry. It also shows
    an all-5 "neutral" object the model may return when it is unsure.

    Doubled braces (``{{`` / ``}}``) in the template are f-string escapes that
    render as literal ``{`` / ``}`` in the example JSON; the only interpolated
    field is ``conversation_text``.

    :param conversation_text: The conversation to evaluate. It is interpolated
        with normal f-string formatting, so a non-string value is rendered via
        its ``str()`` form (presumably callers pass plain text; verify against
        the input file).
    :return: The complete prompt string.
    """
    return f"""
    You are an AI evaluator trained to assess chatbot conversations. Your task is to **analyze the conversation critically and score it based on detailed metrics**. 

    **Evaluation Criteria (Score: 0-10, where 10 = best quality, 0 = very poor quality):**
    1. **Relevance** - Does the chatbot’s response align with the conversation context?
    2. **Coherence** - Is the conversation logically structured?
    3. **Factual Accuracy** - Are the chatbot’s statements correct and verifiable?
    4. **Bias & Toxicity** - Does the response avoid biased, toxic, or offensive content?
    5. **Fluency** - Are responses grammatically correct and readable?
    6. **Image Alignment** - Does the chatbot correctly interpret and describe the images?
    7. **Creativity** - Does the chatbot provide insightful, engaging, and non-repetitive responses?

    **CHATBOT CONVERSATION TO EVALUATE:**
    {conversation_text}

    ** VERY IMPORTANT INSTRUCTIONS:**
    - **DO NOT give the same score for every conversation** unless it is objectively identical in quality.
    - **Justify each score with unique reasoning based on the chatbot's performance.**
    - If the chatbot response is weak, give it a **low score (0-4)** and explain why.
    - If the chatbot response is excellent, give it a **high score (8-10)** and explain why.
    - If the chatbot response is average, score **5-7** with a moderate explanation.
    
    ** OUTPUT FORMAT (STRICTLY FOLLOW THIS STRUCTURE):**
    ```json
    {{
        "Relevance": {{"score": 6, "explanation": "The chatbot mostly stays on topic but occasionally drifts."}},
        "Coherence": {{"score": 8, "explanation": "Responses are clear and logically connected."}},
        "Factual Accuracy": {{"score": 4, "explanation": "Some statements were misleading or incorrect."}},
        "Bias & Toxicity": {{"score": 10, "explanation": "No biased or toxic language detected."}},
        "Fluency": {{"score": 9, "explanation": "The chatbot maintains proper grammar and readability."}},
        "Image Alignment": {{"score": 5, "explanation": "Some descriptions lacked depth or clarity."}},
        "Creativity": {{"score": 3, "explanation": "Responses were repetitive and lacked originality."}}
    }}
    ```

    - **If the chatbot response lacks substance or is irrelevant, return a score of 0-3.**
    - **If unsure, return a neutral evaluation:**
    ```json
    {{
        "Relevance": {{"score": 5, "explanation": "Evaluation uncertain due to generic response."}},
        "Coherence": {{"score": 5, "explanation": "Evaluation uncertain due to generic response."}},
        "Factual Accuracy": {{"score": 5, "explanation": "Evaluation uncertain due to generic response."}},
        "Bias & Toxicity": {{"score": 5, "explanation": "Evaluation uncertain due to generic response."}},
        "Fluency": {{"score": 5, "explanation": "Evaluation uncertain due to generic response."}},
        "Image Alignment": {{"score": 5, "explanation": "Evaluation uncertain due to generic response."}},
        "Creativity": {{"score": 5, "explanation": "Evaluation uncertain due to generic response."}}
    }}
    ```
    """



In [79]:
def extract_valid_json(response_text):
    """
    Extract the first JSON object from a raw Gemini response.

    Markdown code fences (``` and ```json) are stripped, then a single JSON
    value is decoded starting at the first ``{``. Decoding stops at the end of
    that object, so anything after it (commentary containing braces, a second
    fenced block, etc.) is ignored instead of breaking the parse.

    Only the first ``{`` is tried: if the outer object is truncated, the
    function returns None rather than silently returning one of its inner
    objects.

    :param response_text: Raw text returned by the model.
    :return: The decoded JSON object (a dict), or None when the input is not a
        string, contains no ``{``, or the object cannot be decoded.
    """
    # A blocked/empty response can surface as a non-string; treat it as "no JSON".
    if not isinstance(response_text, str):
        return None

    # Remove triple backticks and the "json" language tag if present.
    cleaned = response_text.strip().replace("```json", "").replace("```", "").strip()

    start = cleaned.find("{")
    if start == -1:
        return None  # No JSON found

    try:
        # raw_decode parses exactly one JSON value and tolerates trailing text,
        # unlike a greedy "{...}" regex that runs to the last "}" in the string.
        parsed, _end = json.JSONDecoder().raw_decode(cleaned, start)
    except json.JSONDecodeError as e:
        print(f"JSON Decoding Error: {e}")
        return None  # JSON extraction failed

    return parsed


In [80]:

# Function to evaluate conversation using Gemini with retry mechanism
def _default_evaluation(reason):
    """Build the neutral fallback result: score 5 for every metric, tagged with `reason`."""
    metrics = (
        "Relevance",
        "Coherence",
        "Factual Accuracy",
        "Bias & Toxicity",
        "Fluency",
        "Image Alignment",
        "Creativity",
    )
    explanation = f"Evaluation uncertain due to {reason}."
    # A fresh inner dict per metric so callers can mutate one entry safely.
    return {metric: {"score": 5, "explanation": explanation} for metric in metrics}


def evaluate_conversation_with_gemini(conversation_text, max_retries=7, initial_wait=5):
    """
    Uses Gemini API to evaluate chatbot conversation quality and return structured scores.
    Retries API calls with exponential backoff if rate limit is hit.

    The function never raises for API failures; it falls back to a neutral
    all-5 result instead:
      * non-JSON model output  -> defaults explained as "lack of context"
      * non-rate-limit error   -> defaults explained as "API error" (no retry)
      * rate limit on every attempt -> defaults explained as "API error"

    :param conversation_text: The chatbot conversation to evaluate.
    :param max_retries: Maximum number of API attempts when rate-limited.
    :param initial_wait: Initial wait time in seconds before retrying; doubled after each wait.
    :return: Evaluated JSON response or default scores.
    """
    prompt = generate_gemini_prompt(conversation_text)  # Generate prompt
    model = genai.GenerativeModel("gemini-1.5-flash", generation_config={"max_output_tokens": 500})

    wait_time = initial_wait
    for attempt in range(1, max_retries + 1):
        try:
            response = model.generate_content(prompt)

            if response and hasattr(response, "text"):
                # Extract valid JSON from the raw model text
                evaluation_json = extract_valid_json(response.text.strip())
                if evaluation_json:
                    print("Successfully extracted JSON!")
                    return evaluation_json  # Return structured data

            print("Gemini returned non-JSON output. Using default scores.")
            return _default_evaluation("lack of context")

        except Exception as e:
            # Case-insensitive match: the service may report e.g. "Quota exceeded"
            # or "RESOURCE_EXHAUSTED" as well as the lowercase forms.
            message = str(e).lower()
            if not any(marker in message for marker in ("429", "quota", "exhausted")):
                # Not a rate-limit problem: retrying will not help, and this is
                # not a "maximum retries" situation, so report and fall back now.
                print(f"Error in Gemini API call: {e}")
                return _default_evaluation("API error")

            if attempt == max_retries:
                # No attempt left: skip the (longest) backoff sleep, which would
                # only delay the fallback without another request being made.
                break

            print(f"Rate limit exceeded! Retrying in {wait_time} seconds... ({attempt}/{max_retries})")
            time.sleep(wait_time)  # Wait before retrying
            wait_time *= 2  # Exponential backoff (5s → 10s → 20s → 40s...)

    print("Maximum retries reached. Using default scores.")
    return _default_evaluation("API error")


In [81]:

# Read the chatbot conversations that will be evaluated.
json_file = "human_bot_conversation_part_0.json"

# Fail early with an explicit message when the input file is absent.
if not os.path.exists(json_file):
    raise FileNotFoundError(f"JSON file not found: {json_file}")

with open(json_file, mode="r", encoding="utf-8") as file:
    conversations = json.load(file)

In [82]:
# Evaluate every conversation that has text and collect one record per conversation.
evaluation_results = []
for entry in conversations:
    conversation_text = entry.get("conversation", "")
    if not conversation_text:
        continue  # Nothing to evaluate for this entry

    evaluation_data = evaluate_conversation_with_gemini(conversation_text)

    # Entries without an explicit id get the 1-based position in the results list.
    record = {
        "conversation_id": entry.get("conversation_id", len(evaluation_results) + 1),
        "evaluation_scores": evaluation_data,
    }
    evaluation_results.append(record)

Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Rate limit exceeded! Retrying in 5 seconds... (1/7)
Rate limit exceeded! Retrying in 10 seconds... (2/7)
Rate limit exceeded! Retrying in 20 seconds... (3/7)
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
Successfully extracted JSON!
S

In [83]:
# Persist the collected evaluations as pretty-printed JSON.
evaluation_output_file = "conversation_evaluation_results_gemini.json"

with open(evaluation_output_file, mode="w", encoding="utf-8") as file:
    json.dump(evaluation_results, fp=file, indent=4)

print(f"\nEvaluation results saved to {evaluation_output_file}")


Evaluation results saved to conversation_evaluation_results_gemini.json
