In [None]:
import json
import os
import re
from collections import Counter
from typing import List

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer  # Changed from gensim
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from tenacity import retry, stop_after_attempt, wait_exponential, RetryError

from llm_response import MentalHealthBot
# Fetch the NLTK 'punkt' resource. This is a module-level call, so it runs
# as a side effect every time this module is imported or executed.
# NOTE(review): presumably needed by word_tokenize (imported above); nothing
# in this section calls word_tokenize -- confirm the download is still required.
nltk.download('punkt')


class TooManyRequests(Exception):
    """Signal that a bot request failed and the evaluation loop should stop early."""


class MentalHealthRAGEvaluator:
    """Score k-nearest-neighbour label prediction over E5 sentence embeddings.

    For each test statement the evaluator predicts a label by majority vote
    among the ``k`` closest training statements, and separately asks the bot
    to analyse the statement, appending any JSON reply to ``output_file``.
    The reported metrics are computed from the nearest-neighbour predictions
    only; bot replies are logged but not scored.

    Attributes:
        bot: ``MentalHealthBot`` instance used to obtain analyses.
        k: Number of neighbours that vote on each prediction.
        test_size: Stored for callers; no method of this class reads it.
        random_state: Seed used when sampling the test rows.
        output_file: CSV file that bot replies are appended to.
        embedding_model: SentenceTransformer used to embed the texts.
    """

    def __init__(self, k=3, test_size=0.5, random_state=42):
        """Create the bot client and load the embedding model."""
        self.bot = MentalHealthBot()
        self.k = k
        self.test_size = test_size
        self.random_state = random_state
        self.output_file = "output-e5-large-v2.csv"
        # Load E5-large-v2 model
        self.embedding_model = SentenceTransformer('intfloat/e5-large-v2')

    @staticmethod
    def _extract_json(response):
        """Return the JSON object embedded in *response*, or ``None``.

        The span from the first ``{`` to the last ``}`` is parsed. A reply
        with no such span, or whose span is not valid JSON, yields ``None``
        so that one malformed reply cannot abort an entire evaluation run.
        """
        match = re.search(r'\{.*\}', response, re.DOTALL)
        if match is None:
            return None
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            return None

    @staticmethod
    def _majority_label(neighbor_labels):
        """Return the most frequent label in *neighbor_labels*.

        Ties go to the label that appears first in the list, which keeps the
        result reproducible (iterating a ``set`` of strings is not stable
        across interpreter runs).
        """
        return Counter(neighbor_labels).most_common(1)[0][0]

    def get_bot_response(self, prompt):
        """Send *prompt* to the bot and return its parsed JSON reply.

        Returns:
            The decoded JSON object, or ``None`` when the reply contains no
            parseable JSON.

        Raises:
            TooManyRequests: If the bot call raises anything at all. Every
                failure is reported as a quota error; the original exception
                is attached as ``__cause__`` so the real reason stays visible.
        """
        try:
            response = self.bot.chat_stream(prompt)
        except Exception as exc:
            raise TooManyRequests("API quota exceeded (429)") from exc
        return self._extract_json(response)

    def _get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed *texts* with the E5 model, returning normalized vectors."""
        # Prepend "query: " to each text as recommended for E5 models
        prefixed_texts = ["query: " + text for text in texts]
        return self.embedding_model.encode(prefixed_texts, normalize_embeddings=True)

    def _save_response(self, text: str, label: str, response: dict):
        """Append one bot reply as a row of ``self.output_file``.

        The header row is written only when the file does not exist yet.
        """
        row = pd.DataFrame({
            'text': [text],
            'true_label': [label],
            'response': [json.dumps(response)],
            'timestamp': [pd.Timestamp.now()],
            'embedding_model': ['e5-large-v2'],  # Updated model name
        })
        # Append mode creates the file on first use, so one call covers both cases.
        write_header = not os.path.exists(self.output_file)
        row.to_csv(self.output_file, mode='a', header=write_header, index=False)

    def evaluate(self, data_path: str, test_path: str = "Combined-Data.csv",
                 n_samples: int = 1000):
        """Run the nearest-neighbour evaluation and print a metrics summary.

        Args:
            data_path: CSV of reference data; column 0 is read as the text
                and column 1 as the label.
            test_path: CSV of test data; column 1 is read as the text and
                column 2 as the label.
            n_samples: Maximum number of test rows to sample. Capped at the
                number of available rows.

        Returns:
            A dict with keys ``overall_accuracy``, ``weighted_f1_score``,
            ``per_class_accuracy``, ``classification_report`` and
            ``output_file``.

        Raises:
            ValueError: If either CSV has no rows left after dropping rows
                with missing values.
        """
        train_df = pd.read_csv(data_path).dropna()
        test_df = pd.read_csv(test_path).dropna()
        if train_df.empty or test_df.empty:
            raise ValueError("Training and test data must each contain at least one complete row")

        # Cap the sample size so a test set smaller than n_samples does not raise.
        sample_size = min(n_samples, len(test_df))
        test_df = test_df.sample(n=sample_size, random_state=self.random_state).reset_index(drop=True)

        train_texts = train_df.iloc[:, 0].tolist()
        # Materialised once; per-neighbour .iloc lookups inside the loop are slow.
        train_labels = train_df.iloc[:, 1].tolist()
        train_embeddings = self._get_embeddings(train_texts)

        test_texts = test_df.iloc[:, 1].tolist()
        test_labels = test_df.iloc[:, 2].tolist()
        test_embeddings = self._get_embeddings(test_texts)

        self.nn = NearestNeighbors(n_neighbors=self.k).fit(train_embeddings)
        # kneighbors lists neighbours by increasing distance, so vote ties
        # resolve in favour of the closest neighbour.
        _, indices = self.nn.kneighbors(test_embeddings)

        pred_labels = []
        try:
            for text, label, neighbor_idx in zip(test_texts, test_labels, indices):
                votes = [train_labels[j] for j in neighbor_idx]
                pred_labels.append(self._majority_label(votes))

                response = self.get_bot_response(f"Analyze this mental health statement: {text}")
                if response:
                    self._save_response(text, label, response)

        except TooManyRequests:
            print("Received 429 error. Exiting early to preserve existing results.")

        # After an early exit, score only the rows that received a prediction.
        test_labels = test_labels[:len(pred_labels)]

        accuracy = accuracy_score(test_labels, pred_labels)
        f1 = f1_score(test_labels, pred_labels, average='weighted')
        report = classification_report(test_labels, pred_labels)

        y_true = np.array(test_labels)
        y_pred = np.array(pred_labels)
        per_class_acc = {}
        for cls in sorted(set(test_labels)):
            mask = y_true == cls
            per_class_acc[cls] = accuracy_score(y_true[mask], y_pred[mask])

        print("\nEvaluation Results:")
        print(f"Overall Accuracy: {accuracy:.4f}")
        print(f"Weighted F1 Score: {f1:.4f}")
        print("\nPer-Class Accuracy:")
        for cls, acc in per_class_acc.items():
            print(f"- {cls}: {acc:.4f}")
        print("\nClassification Report:")
        print(report)

        return {
            'overall_accuracy': accuracy,
            'weighted_f1_score': f1,
            'per_class_accuracy': per_class_acc,
            'classification_report': report,
            'output_file': self.output_file
        }


if __name__ == "__main__":
    # Script entry point: evaluate against Subset-Data.csv with 3 neighbours.
    evaluator = MentalHealthRAGEvaluator(
        k=3,
        test_size=0.005,
        random_state=42,
    )
    results = evaluator.evaluate("Subset-Data.csv")
    # Report the path the evaluator actually wrote to instead of a hard-coded
    # name that does not match it.
    print(f"Evaluation complete. Results saved to {results['output_file']}")