In [None]:
%%capture --no-stderr
!pip install --upgrade --quiet  langchain langchain-community langchainhub beautifulsoup4 datasets
!pip install -q torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl pacmap datasets langchain-community ragatouille datasets
!pip install langchain-openai
!pip install ragas
!pip install -U bitsandbytes
!pip install transformers accelerate bitsandbytes flash-attn
!pip install sentence-transformers

In [None]:
import bs4
from langchain import hub
import torch
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from openai import OpenAI
from datasets import load_dataset
from langchain.docstore.document import Document as LangchainDocument
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import AutoTokenizer
import tqdm

# Embedding model; also the default tokenizer name of `split_documents`.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# Load the RAG dataset and keep a handle on both splits.
ds = load_dataset("neural-bridge/rag-dataset-12000")
train_ds = ds["train"]
test_ds = ds["test"]

# The knowledge base holds one document per test-split row, built from its `context` field.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc["context"]) for doc in test_ds
]

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Chunk every document of `knowledge_base` and drop chunks whose text repeats.

    Args:
        chunk_size: Maximum chunk size passed to the splitter (counted with the tokenizer).
        knowledge_base: Documents to split.
        tokenizer_name: Name of the Hugging Face tokenizer handed to the splitter.

    Returns:
        The chunks in the order they were produced, keeping only the first
        chunk for any given `page_content`.
    """
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # overlap is a tenth of the chunk size
        add_start_index=True,
        strip_whitespace=True,
    )

    seen_texts = set()
    unique_chunks = []
    for source_doc in knowledge_base:
        for chunk in splitter.split_documents([source_doc]):
            if chunk.page_content in seen_texts:
                continue
            seen_texts.add(chunk.page_content)
            unique_chunks.append(chunk)

    return unique_chunks
    

#docs_processed = split_documents(
#    512,  # We choose a chunk size adapted to our model
#    RAW_KNOWLEDGE_BASE,
#    tokenizer_name=EMBEDDING_MODEL_NAME,
#)

# Chunking is switched off (the `split_documents` call above is commented out),
# so every raw document is used as a single unit.
docs_processed = list(RAW_KNOWLEDGE_BASE)


In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
# Token count of every document, measured with the embedding model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [len(tokenizer.encode(doc.page_content)) for doc in docs_processed]

# Histogram of those token counts.
fig = pd.Series(lengths).hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.show()

In [None]:
from langchain.schema import BaseRetriever
from pydantic import BaseModel

# Embedding backend used to index and query the knowledge base.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs=dict(device="cuda"),
    encode_kwargs=dict(normalize_embeddings=True),  # `True` for cosine similarity
)

# Build the FAISS index over the processed documents with the cosine distance strategy.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Retriever view of the index, with its default settings.
retriever = KNOWLEDGE_VECTOR_DATABASE.as_retriever()

# 2. Incorporate the retriever into a question-answering chain.
# System message: answering instructions followed by a `{context}` placeholder.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Chat template: the system message above, then the user's question in `{input}`.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


In [None]:
#### from langchain.llms.base import LLM
from typing import List, Optional
import requests
from datasets import Dataset
from langchain.chat_models import ChatOpenAI

import os

# Credentials are read from the environment so that no API key is stored in the notebook.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_base=os.environ.get("OPENAI_API_BASE", "https://ai-yyds.com/v1"),
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
dataset_base = load_dataset("csv", data_files = "/kaggle/input/complete-base-data/baseline_data_complete.csv")

answers = []
questions = []
contexts = []
ground_truths = []
# NOTE(review): `indexes` (the row ids to evaluate) is not defined in this cell;
# it has to exist in the kernel already.
try:
    for index in indexes:
        row = dataset_base['train'][index]
        # Query with this row's question only; the original passed the whole question column.
        response = rag_chain.invoke({"input": row["question"]})
        result = response['answer']
        # The retrieval chain returns the documents it used under "context",
        # so the retriever does not need to be queried a second time.
        context = [doc.page_content for doc in response['context']]
        answers.append(result)
        contexts.append(context)
        questions.append(row["question"])
        ground_truths.append(row["ground_truths"])
    # Quick look at the contexts retrieved for the last question.
    print(len(context))
    print("Retrieved context:", context)
except Exception as e:
    print(f"Error during chain invocation: {e}")

#data = {
#    "question":questions,
 #   "answer": answers,
 ##   "contexts": contexts,
  #  "ground_truths": ground_truths
#}
# Convert dict to dataset
#dataset = Dataset.from_dict(data) 


In [None]:
from datasets import Dataset
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Accumulators filled by `process_batch` and by the batch loop further down.
answers, contexts, ground_truths, queries = [], [], [], []

batch_size = 108
questions = test_ds["question"]

# Consecutive batches of at most `batch_size` (index, question) pairs.
batches = [
    [(idx, questions[idx]) for idx in range(lo, min(lo + batch_size, len(questions)))]
    for lo in range(0, len(questions), batch_size)
]

def process_query(query, id):
    """
    Answer one question with the module-level `rag_chain`.

    Args:
        query: Question text sent to the chain as `input`.
        id: Caller-side identifier of the question; returned unchanged.

    Returns:
        `(answer, contexts, id)` on success, where `contexts` is the list of
        `page_content` strings of the documents the chain retrieved, or
        `(None, None, id)` when the chain raised (the error is printed).
    """
    try:
        result = rag_chain.invoke({"input": query})
        # Reuse the documents the chain itself retrieved instead of running a
        # second, separate retrieval for the same query.
        context = [doc.page_content for doc in result["context"]]
        return result["answer"], context, id
    except Exception as e:
        print(f"Error processing query ID {id}: {e}")
        return None, None, id

def process_batch(batch):
    """
    Run `process_query` concurrently for every `(id, query)` pair in `batch`.

    Successful results are collected in completion order. For each of them the
    reference answer and the question text of row `id` of `test_ds` are appended
    to the module-level `ground_truths` and `queries` lists.

    Args:
        batch: Iterable of `(id, query)` pairs.

    Returns:
        `(batch_answers, batch_contexts)`, two lists aligned with each other.
    """
    batch_answers = []
    batch_contexts = []

    # A small, fixed pool keeps the number of concurrent requests bounded.
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(process_query, query, id) for (id, query) in batch]

        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                answer, context, id = future.result()
                if answer is not None and context is not None:  # keep successful results only
                    # Fetch the single row once; indexing `test_ds["answer"][id]`
                    # would load an entire column for every lookup.
                    row = test_ds[id]
                    batch_answers.append(answer)
                    batch_contexts.append(context)
                    ground_truths.append(row["answer"])
                    queries.append(row["question"])
            except Exception as e:
                print(f"Error in future processing: {e}")
    return batch_answers, batch_contexts
    
# Process the batches one after another and pool their outputs.
for batch in tqdm(batches):
    batch_answers, batch_contexts = process_batch(batch)
    answers += batch_answers
    contexts += batch_contexts

# Column-oriented view of the run.
data = dict(
    question=queries,
    answer=answers,
    contexts=contexts,
    reference=ground_truths,
)

# Wrap the columns in a Hugging Face dataset.
dataset = Dataset.from_dict(data)

In [None]:

# Judge prompt, variant 1: asks for the atomic facts of an answer and of its ground
# truth as (subject, predicate, object) triples. Placeholders: {answer}, {groundTruth}.
# NOTE: later cells assign `JUDGE_PROMPT` again, so on a top-to-bottom run this
# value is replaced before it is used.
JUDGE_PROMPT = """
You will be given the answer and ground truth couple.
Your task is to extract all atomic facts from a answers and ground truths in the form of triples: (subject, predicate, object). Each fact must be expressed as a triple where:
- `subject` is the entity performing or being described.
- `predicate` is the action, relation, or property of the subject.
- `object` is the target, recipient, or detail associated with the predicate.

Ensure that:
1. Each triple contains exactly one subject, one predicate, and one object.
2. If a sentence contains multiple facts, list them all as separate triples.
3. Be precise and concise in extracting triples from complex sentences.

YOU MUST GIVE the Ground Truth Facts: and Answer Facts: in your feedback

Now here are the question, answer, contexts and ground truths.

Answer: {answer}
GroundTruth: {groundTruth}

Provide your feedback as follows:

Feedback:::
Ground Truth Facts:
1. (subject: subject, predicate:predicate, object:object)
2. (subject: subject, predicate:predicate, object:object)
...

Answer Facts:
1. (subject: subject, predicate:predicate, object:object)
2. (subject: subject, predicate:predicate, object:object)
...
Example: (subject: I, predicate: come, object: home)
"""



In [None]:
# Judge prompt, variant 2: asks for three questions that the given context could answer.
# Placeholders: {question}, {answer}, {context}, {groundTruth}.
# NOTE: this assignment replaces the previous `JUDGE_PROMPT` and is itself replaced
# by the next cell on a top-to-bottom run.
JUDGE_PROMPT = """
You will be given a question, answer, context, and ground truth pair.
Your task is to generate the related questions based on the contexts. The idea is to see what kind of questions will need this context.

Here are the steps for evaluation:

### 1. **Generate questions**:
- Based on the context given, generated 3 different corresponding questions. The question should be able to reflect the content of the context.

### 2. **Provide Feedback**:
- **Generated Questions**: 3 AI-generated questions for the context

**Instructions for the evaluation**:
- Carefully read the question, answer, context, and ground truth.
- Generate 3 questions for the based on the contexts.
- Provide the **Generated Questions**

Now here are the details:

Question: {question}
Answer: {answer}
Context: {context}
GroundTruth: {groundTruth}

Provide your feedback with the following format:

Feedback::: 
Generated Questions: 
1. question1
2. quetions2
3. question3

PLEASE Do Not GIVE ANY OTHER INFORMATION.
Thank you for your evaluation.

 """

In [None]:
# Judge prompt, variant 3: asks for the atomic facts of the answer as
# (subject, predicate, object) triples plus, in the same order, the context passage
# that supports each fact. Placeholders: {question}, {answer}, {context}.
# NOTE: this is the last assignment to `JUDGE_PROMPT` in the notebook, so it is the
# value in effect after a top-to-bottom run.
JUDGE_PROMPT = """
You will be given a question, answer, context pair. Your task is to extract all **atomic facts** from the answer in the form of triples: (subject, predicate, object). Each fact should be represented as a triple where:

- `subject`: the entity performing or being described.
- `predicate`: the action, relation, or property of the subject.
- `object`: the target, recipient, or detail associated with the predicate.

You also need to extract the **related contexts**: the corresponding part in the context that supports the atomic facts you generated.
Your task is to ensure that:

1. Each triple contains exactly one subject, one predicate, and one object.
2. Please use the original parts in contexts for **related contexts**
3. If a sentence contains multiple facts, list them all as separate triples.
4. If one context supports multiple answers, please repeat the context for multiple times with the same order of the answer facts.
5. You must keep the order of the **Related Context** and **Answer Facts** the same. It means, if the first related context must support the first atomic facts.
6. If no supported context can be found leave an empty string there.

Now here are the details:
Question：{question}
Answer: {answer}
Context: {context}

The structure of your feedback should be as follows:

Feedback::

Related Context:
1. context1
2. context2

Answer Facts:
1. (subject: subject, predicate: predicate, object: object)
2. (subject: subject, predicate: predicate, object: object)

Make sure that each fact from the answer is supported by an appropriate fact from the context in the same order.
"""

In [None]:
def extract_judge_score(answer: str, split_str: str) -> Optional[float]:
    """
    Pull the first number out of a judge response.

    The search starts right after the first occurrence of `split_str`; when the
    marker is absent the whole response is searched.

    Args:
        answer: Raw text returned by the judge model.
        split_str: Marker that precedes the score, e.g. ``'Factual Correctness:'``.

    Returns:
        The first integer or decimal number found, as a float, or None when no
        number is present or the input cannot be processed (the reason is printed).
    """
    try:
        rating = answer.split(split_str)[1] if split_str in answer else answer
        match = re.search(r"\d+(?:\.\d+)?", rating)
        if match is None:
            print(f"No numeric score found after {split_str!r}")
            return None
        return float(match.group())
    except Exception as e:
        print(e)
        return None

def extract_judge_facts(answer: str, split_str: str) -> list:
    """
    Collect the numbered, parenthesised items of a judge response.

    Args:
        answer: Raw text returned by the judge model.
        split_str: Marker after which the items are expected; when it is absent
            the whole response is scanned.

    Returns:
        A list of `(number, text)` string pairs, one per `N. (text)` item found,
        or an empty list when the input cannot be processed.
    """
    try:
        # Keep what follows the first marker, or everything if there is no marker.
        tail = answer.split(split_str)[1] if split_str in answer else answer
        return [
            (number, body)
            for number, body in re.findall(r'(\d+)\.\s*\((.*?)\)', tail)
        ]
    except Exception as e:
        print(f"Error: {e}")
        return []

import re

def extract_judge_questions(answer: str, split_str: str = "Feedback::: Generated Questions:") -> list:
    """
    Collect the numbered lines of a judge response as a list of questions.

    Args:
        answer: Raw text returned by the judge model.
        split_str: Marker after which the questions are expected; when it is
            absent the whole response is scanned.

    Returns:
        The text following each `N.` prefix, stripped of surrounding whitespace,
        or an empty list when the input cannot be processed.
    """
    try:
        tail = answer.split(split_str)[1] if split_str in answer else answer
        return [
            text.strip()
            for _number, text in re.findall(r'(\d+)\.\s*(.+)', tail)
        ]
    except Exception as e:
        print(f"Error: {e}")
        return []

def generate_improved_judge_response(row):
    """
    Send one dataset row to the judge model and return its raw reply.

    The prompt is the module-level `JUDGE_PROMPT` filled with the row's
    `question`, `answer`, `contexts` and `ground_truths` fields.

    Args:
        row: Mapping that provides the four fields named above.

    Returns:
        The text content of the first choice of the chat completion.

    Raises:
        KeyError: If a row field or the `OPENAI_API_KEY` environment variable is missing.
    """
    import os

    prompt = JUDGE_PROMPT.format(question=row['question'], answer=row['answer'], context=row['contexts'], groundTruth = row['ground_truths'])

    # The key is read from the environment so it is never stored in the notebook.
    # The endpoint can be overridden with OPENAI_API_BASE and is set once, directly.
    client = OpenAI(
        api_key=os.environ["OPENAI_API_KEY"],
        base_url=os.environ.get("OPENAI_API_BASE", "https://ai-yyds.com/v1"),
    )

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are proficient in extract atomic facts from sentences and compare the meaning of different sentences."},
            {"role": "user", "content": prompt}
        ],
        model="gpt-4o-mini"
    )
    return response.choices[0].message.content


def extract_facts(answer: str) -> dict:
    """
    Parse the two triple lists out of a judge report.

    The report is expected to contain a `Ground Truth Facts:` section followed
    by an `Answer Facts:` section, each listing
    `(subject: ..., predicate: ..., object: ...)` items.

    Args:
        answer: Raw judge report.

    Returns:
        `{"Ground Truth Facts": [...], "Answer Facts": [...]}`, where every item is
        a dict with `subject`, `predicate` and `object` keys; a section that is not
        found yields an empty list. An empty dict is returned when parsing raises.
    """
    context_marker = "Ground Truth Facts:"
    answer_marker = "Answer Facts:"

    def parse_facts(text):
        triples = re.findall(r'\(\s*subject:\s*(.*?),\s*predicate:\s*(.*?),\s*object:\s*(.*?)\s*\)', text)
        return [
            {"subject": subj.strip(), "predicate": pred.strip(), "object": obj.strip()}
            for subj, pred, obj in triples
        ]

    try:
        # Ground-truth section: everything between the two markers.
        ground_truth_match = re.search(f"{context_marker}(.*?){answer_marker}", answer, re.DOTALL)
        # Answer section: everything after the answer marker.
        answer_match = re.search(f"{answer_marker}(.*)", answer, re.DOTALL)

        return {
            "Ground Truth Facts": (
                parse_facts(ground_truth_match.group(1).strip()) if ground_truth_match else []
            ),
            "Answer Facts": (
                parse_facts(answer_match.group(1).strip()) if answer_match else []
            ),
        }
    except Exception as e:
        print(f"Error: {e}")
        return {}

def extract_new_facts(answer, context_marker="Related Context:", answer_marker="Answer Facts"):
    """
    Parse the supporting-context lines and the answer triples out of a judge report.

    Args:
        answer: Raw judge report.
        context_marker: Text that opens the related-context section.
        answer_marker: Text that opens the answer-facts section.

    Returns:
        `{"Related Context": [...], "Answer Facts": [...]}`. The context entries are
        the non-blank lines between the two markers with any leading `N.`
        numbering removed; the facts are dicts with `subject`, `predicate` and
        `object` keys. A section that is not found yields an empty list. An empty
        dict is returned when parsing raises.
    """
    def parse_facts(text):
        triples = re.findall(r'\(\s*subject:\s*(.*?),\s*predicate:\s*(.*?),\s*object:\s*(.*?)\s*\)', text)
        return [
            {"subject": subj.strip(), "predicate": pred.strip(), "object": obj.strip()}
            for subj, pred, obj in triples
        ]

    try:
        context_match = re.search(f"{context_marker}(.*?){answer_marker}", answer, re.DOTALL)
        answer_match = re.search(f"{answer_marker}(.*)", answer, re.DOTALL)

        related_context = []
        if context_match:
            for line in context_match.group(1).strip().splitlines():
                if not line.strip():
                    continue  # blank lines carry no context
                # Drop the "1. " style numbering, then trim.
                related_context.append(re.sub(r'^\d+\.\s*', '', line).strip())

        answer_facts = parse_facts(answer_match.group(1).strip()) if answer_match else []

        return {"Related Context": related_context, "Answer Facts": answer_facts}
    except Exception as e:
        print(f"Error: {e}")
        return {}

In [None]:
from tqdm import tqdm
from langchain_openai import ChatOpenAI, OpenAIEmbeddings 
import openai
from openai import OpenAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
#dataset = load_dataset('csv', data_files="/kaggle/input/dataset/baseline_data_complete.csv")
import os

# The key is read from the environment so it is never stored in the notebook.
# The endpoint can be overridden with OPENAI_API_BASE.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url=os.environ.get("OPENAI_API_BASE", "https://ai-yyds.com/v1"),
)
    
def get_text_embedding(text):
    """Return the `text-embedding-3-large` embedding of `text`, requested through the module-level `client`."""
    response = client.embeddings.create(input=text, model="text-embedding-3-large")
    first_item = response.data[0]
    return first_item.embedding

def process_fact(fact, contexts):
    """
    Score one fact against every entry of `contexts`.

    Both the fact and each context are turned into text with `build_from_facts`,
    embedded with `get_text_embedding` and compared with
    `calculate_cosine_similarity`.

    NOTE(review): `build_from_facts` and `calculate_cosine_similarity` are not
    defined in this cell; they must already exist in the kernel.

    Returns:
        One similarity per context, in the order of `contexts`.
    """
    fact_embedding = get_text_embedding(build_from_facts(fact))
    return [
        calculate_cosine_similarity(
            fact_embedding,
            get_text_embedding(build_from_facts(context)),
        )
        for context in contexts
    ]

def process_data_multithreaded(loaded_facts_results):
    """
    Run `process_facts_for_one_data` for every record on a thread pool.

    Args:
        loaded_facts_results: Sequence of records, each providing
            `'Ground Truth Facts'` and `'Answer Facts'`.

    Returns:
        A list with one result per record, in the order of the input.
    """
    with ThreadPoolExecutor() as executor:
        pending = []
        for entry in loaded_facts_results:
            contexts = entry['Ground Truth Facts']
            facts = entry['Answer Facts']
            pending.append(executor.submit(process_facts_for_one_data, facts, contexts))

        # Collect in submission order so that results line up with the input records.
        results = [job.result() for job in tqdm(pending, total=len(pending))]
    return results

def process_facts_for_one_data(facts, contexts):
    """Return, for each fact in `facts`, the list produced by `process_fact(fact, contexts)`."""
    return [process_fact(fact, contexts) for fact in facts]

# Compute the similarity rows for every record, then show the first record's result.
# NOTE(review): `loaded_facts_results` is not defined in this cell; it must already
# exist in the kernel.
results = process_data_multithreaded(loaded_facts_results)
print(results[0])





In [None]:
from tqdm import tqdm
from langchain_openai import ChatOpenAI, OpenAIEmbeddings 
import openai
from openai import OpenAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
import os

dataset = load_dataset('csv', data_files="/kaggle/input/complete-base-data/baseline_data_complete.csv")

# The key is read from the environment so it is never stored in the notebook.
# The endpoint can be overridden with OPENAI_API_BASE.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url=os.environ.get("OPENAI_API_BASE", "https://ai-yyds.com/v1"),
)
def get_text_embedding(text):
    """Return the `text-embedding-3-large` embedding of `text`, requested through the module-level `client`."""
    response = client.embeddings.create(input=text, model="text-embedding-3-large")
    first_item = response.data[0]
    return first_item.embedding

def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors, computed with the imported `cosine_similarity`."""
    # Each vector is passed as a one-row matrix; the single entry of the resulting
    # 1x1 matrix is the similarity.
    pairwise = cosine_similarity([vector1], [vector2])
    return pairwise[0][0]
# Smoke test: embed one fact sentence and one raw context snippet, then print
# their cosine similarity.
similarities = []  # not used in the rest of this cell
emb1 = get_text_embedding("Tenth Doctor has companion Martha Jones")
emb2 = get_text_embedding("STATS: Doctor(s): Tenth Companion(s): Martha Jones Episode(s): Daleks in Manhattan ")
similarity = calculate_cosine_similarity(emb1,emb2)
print(similarity)

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import openai
import csv
import time
from datasets import load_dataset
from threading import Semaphore

MAX_RATE = 4  # Semaphore size and worker count below; a worker holds its slot for at least 1 / MAX_RATE seconds
semaphore = Semaphore(MAX_RATE)

# Rows to judge: the baseline CSV, accessed below through its "train" split.
dataset_base = load_dataset("csv", data_files = "/kaggle/input/complete-base-data/baseline_data_complete.csv")

def process_data(i, data):
    """
    Judge a single data item and parse the facts out of the judge's report.

    The work runs while holding the module-level `semaphore`; the slot is kept
    for at least `1 / MAX_RATE` seconds before it is released.

    Args:
        i: Index of the item; returned unchanged.
        data: Row passed to `generate_improved_judge_response`.

    Returns:
        `(i, facts)` with the dict produced by `extract_facts`, or `(i, <dict with
        two empty lists>)` when anything raised (the error is printed).
    """
    try:
        with semaphore:
            started = time.time()

            report = generate_improved_judge_response(data)
            facts = extract_facts(report)

            # Pad short calls so the slot is held for the minimum duration.
            elapsed = time.time() - started
            if elapsed < 1 / MAX_RATE:
                time.sleep(1 / MAX_RATE - elapsed)

        return i, facts

    except Exception as e:
        print(f"Error in process_data for index {i}: {e}")
        return i, {"Ground Truth Facts": [], "Answer Facts": []}

# One slot per dataset row; rows that are not listed in `indexes` stay None.
facts_results = [None] * len(dataset_base['train'])

# NOTE(review): `indexes` (the row ids to judge) is not defined in this cell;
# it has to exist in the kernel already.
with ThreadPoolExecutor(max_workers=MAX_RATE) as executor:
    # Keep every future next to its row index so that a failure is recorded
    # against the right row rather than against a leftover loop variable.
    futures = [
        (index, executor.submit(process_data, index, dataset_base['train'][index]))
        for index in indexes
    ]

    for index, future in tqdm(futures, total=len(futures)):
        try:
            _, facts = future.result()
            facts_results[index] = facts
        except Exception as e:
            print(f"Error processing future: {e}")
            facts_results[index] = {"Ground Truth Facts": [], "Answer Facts": []}

print("Processing complete.")

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

def combine_fact(fact):
    """Render a fact's `subject`, `predicate` and `object` fields as one space-separated sentence ending in a period."""
    subject, predicate, obj = fact['subject'], fact['predicate'], fact['object']
    return "{} {} {}.".format(subject, predicate, obj)

def calculate_similarity(context_embeddings, answer_embeddings):
    """
    For every answer embedding, return its highest cosine similarity to any context embedding.

    Args:
        context_embeddings: Embeddings of the context sentences.
        answer_embeddings: Embeddings of the answer sentences.

    Returns:
        A list with one maximum similarity per answer embedding.
    """
    # Answers are passed first, so each row of the matrix belongs to one answer
    # and each column to one context.
    pairwise = cosine_similarity(answer_embeddings, context_embeddings)
    best_per_answer = pairwise.max(axis=1)
    return list(best_per_answer)
    
    # Initialize the Sentence Transformer model
# Sentence encoder used for both context facts and answer facts.
model = SentenceTransformer('all-MiniLM-L6-v2')
results = []

# NOTE(review): `loaded_facts_results` is not defined in this cell; every record is
# expected to provide 'Context Facts' and 'Answer Facts'.
for facts in tqdm(loaded_facts_results):
    context_facts = facts['Context Facts']
    answer_facts = facts['Answer Facts']

    # Turn each fact into a sentence.
    context_sentences = list(map(combine_fact, context_facts))
    answer_sentences = list(map(combine_fact, answer_facts))

    context_embeddings = model.encode(context_sentences, convert_to_tensor=True)
    answer_embeddings = model.encode(answer_sentences, convert_to_tensor=True)

    # Best context match per answer fact, clamped to [0, 1].
    similarities = calculate_similarity(context_embeddings, answer_embeddings)
    similarities = [max(min(sim, 1), 0) for sim in similarities]

    # Spread of the per-fact scores for this record.
    std_dev = np.std(similarities)
    results.append(dict(deviation=std_dev, similarities=similarities))


In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import openai
import csv
import time
from datasets import load_dataset
from threading import Semaphore

# Rows to judge: the long-context CSV, accessed below through its "train" split.
dataset_base = load_dataset("csv", data_files = "/kaggle/input/long-context-complete/long_context_data_complete.csv")

MAX_RATE = 4  # Maximum number of requests per second
semaphore = Semaphore(MAX_RATE)

def process_data(i, data):
    """
    Ask the judge for questions about one row and return them with the row index.

    The previous body returned an undefined name `question`, so every call
    raised NameError. The questions are now taken from the judge's report with
    `extract_judge_questions`, which matches how the result is stored and saved
    by the surrounding cell.

    The work runs while holding the module-level `semaphore`; the slot is kept
    for at least `1 / MAX_RATE` seconds before it is released.

    Args:
        i: Index of the row; returned unchanged.
        data: Row passed to `generate_improved_judge_response`.

    Returns:
        `(i, questions)` where `questions` is the list of extracted questions.
    """
    with semaphore:
        start_time = time.time()
        report = generate_improved_judge_response(data)
        question = extract_judge_questions(report)
        # Pad short calls so the slot is held for the minimum duration.
        elapsed_time = time.time() - start_time
        if elapsed_time < 1 / MAX_RATE:
            time.sleep(1 / MAX_RATE - elapsed_time)
    return i, question

# One slot per dataset row.
questions = [None] * len(dataset_base['train'])
with ThreadPoolExecutor(max_workers=4) as executor:
    # Keep every future next to its row index so that a failure is recorded
    # against the right row rather than against a leftover loop variable.
    futures = [
        (row_index, executor.submit(process_data, row_index, row))
        for row_index, row in enumerate(dataset_base['train'])
    ]

    for row_index, future in tqdm(futures, total=len(futures)):
        try:
            _, question = future.result()
            questions[row_index] = question
        except Exception as e:
            print(f"Error processing future: {e}")
            print(row_index)
            questions[row_index] = "None"

output_file = '/kaggle/working/New_Questions_Long.csv'

# Write one CSV row per dataset row. The encoding is fixed to UTF-8 so the output
# does not depend on the platform default.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    writer.writerow(['Questions'])
    writer.writerows([question] for question in questions)

print(f"Questions saved to {output_file}")

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import openai
import csv
import time
from datasets import load_dataset

# Rows to judge: the baseline CSV, accessed below through its "train" split.
dataset_base = load_dataset("csv", data_files = "/kaggle/input/complete-base-data/baseline_data_complete.csv")

def process_data(i, data):
    """
    Judge one row and parse the score and both fact lists out of the report.

    Args:
        i: Index of the row; returned unchanged.
        data: Row passed to `generate_improved_judge_response`.

    Returns:
        `(i, score, answer_facts, ground_truth_facts)`, each part extracted from
        the report after its own marker.
    """
    report = generate_improved_judge_response(data)
    return (
        i,
        extract_judge_score(report, 'Factual Correctness:'),
        extract_judge_facts(report, 'Corresponding Atomic Factual For Answers:'),
        extract_judge_facts(report, 'Corresponding Atomic Factual For Ground Truths:'),
    )

# One slot per dataset row for each output column.
scores = [None] * len(dataset_base['train'])
facts_answers = [None] * len(dataset_base['train'])
facts_grounds = [None] * len(dataset_base['train'])

with ThreadPoolExecutor(max_workers=4) as executor:
    # Keep every future next to its row index so that a failure is recorded
    # against the right row rather than against a leftover loop variable.
    futures = [
        (row_index, executor.submit(process_data, row_index, row))
        for row_index, row in enumerate(dataset_base['train'])
    ]

    for row_index, future in tqdm(futures, total=len(futures)):
        try:
            _, score, fact_answer, fact_ground = future.result()
        except Exception as e:
            print(f"Error processing future: {e}")
            print(row_index)
            # Sentinel values mark the row as failed.
            score, fact_answer, fact_ground = -1, "None", "None"
        scores[row_index] = score
        facts_answers[row_index] = fact_answer
        facts_grounds[row_index] = fact_ground
        
output_file = '/kaggle/working/New_Factual_Correctness_Base.csv'

# Write one CSV row per dataset row. The encoding is fixed to UTF-8 so the output
# does not depend on the platform default.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Header row
    writer.writerow(['Score', 'Facts_answers', 'Facts_ground_Truths'])

    # One row per item: its score and both fact lists
    writer.writerows(zip(scores, facts_answers, facts_grounds))

print(f"Factual Corrrectness data saved to {output_file}")

In [None]:
from concurrent.futures import ThreadPoolExecutor
import time
import csv
from tqdm import tqdm

# Rows to judge: the long-context CSV, accessed below through its "train" split.
dataset_base = load_dataset("csv", data_files = "/kaggle/input/long-context-complete/long_context_data_complete.csv")

def process_data(i, data):
    """
    Judge one row and parse the score and both fact lists out of the report.

    Args:
        i: Index of the row; returned unchanged.
        data: Row passed to `generate_improved_judge_response`.

    Returns:
        `(i, score, answer_facts, ground_truth_facts)`, each part extracted from
        the report after its own marker.
    """
    report = generate_improved_judge_response(data)
    return (
        i,
        extract_judge_score(report, 'Factual Correctness:'),
        extract_judge_facts(report, 'Corresponding Atomic Factual For Answers:'),
        extract_judge_facts(report, 'Corresponding Atomic Factual For Ground Truths:'),
    )
    
def process_with_retries(i, data, max_retries=5, delay=1):
    """
    Call `process_data`, retrying when the error text contains "429".

    Args:
        i: Index of the row; forwarded to `process_data`.
        data: Row forwarded to `process_data`.
        max_retries: Maximum number of attempts.
        delay: Base pause in seconds; the pause after the n-th failure is `delay * n`.

    Returns:
        Whatever `process_data` returns, or `(i, -1, "None", "None")` once every
        attempt has failed with a "429" error.

    Raises:
        Exception: Any error whose text does not contain "429" is re-raised immediately.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return process_data(i, data)
        except Exception as e:
            if "429" not in str(e):
                raise
            print(f"Retry {attempt}/{max_retries} for index {i} due to {e}")
            time.sleep(delay * attempt)

    print(f"Failed after {max_retries} retries for index {i}")
    return i, -1, "None", "None"

def process_in_batches(dataset, batch_size=100, max_workers=3, output_file='/kaggle/working/New_Factual_Correctness_Long.csv'):
    """
    Judge every row of `dataset` batch by batch and save the results as CSV.

    Fixes over the previous version: result buffers are sized from `dataset`
    rather than from a global; rows are addressed by their global index (batch
    slices of a Hugging Face dataset are column dicts, and batch-local indices
    overwrote each other); failed rows are recorded with sentinel values; and
    the CSV is written from the result lists directly (they have no `.keys()`).

    Args:
        dataset: Indexable collection of rows supporting `len()` and `dataset[i]`.
        batch_size: Number of rows handed to one thread pool at a time.
        max_workers: Worker threads per batch.
        output_file: Destination CSV path.

    Returns:
        None. One CSV row is written per dataset row, in dataset order, with the
        columns `Score`, `Facts_answers` and `Facts_ground_Truths`.
    """
    total_data = len(dataset)
    scores = [None] * total_data
    facts_answers = [None] * total_data
    facts_grounds = [None] * total_data

    # Process batch by batch
    for start in range(0, total_data, batch_size):
        end = min(start + batch_size, total_data)

        print(f"Processing batch: {start} to {end-1}")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Each future is paired with the global row index it belongs to.
            futures = [
                (index, executor.submit(process_with_retries, index, dataset[index]))
                for index in range(start, end)
            ]

            for index, future in tqdm(futures, total=len(futures)):
                try:
                    _, score, fact_answer, fact_ground = future.result()
                except Exception as e:
                    print(f"Error processing future: {e}")
                    # Sentinel values mark the row as failed.
                    score, fact_answer, fact_ground = -1, "None", "None"
                scores[index] = score
                facts_answers[index] = fact_answer
                facts_grounds[index] = fact_ground

    # UTF-8 is set explicitly so the output does not depend on the platform default.
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Score', 'Facts_answers', 'Facts_ground_Truths'])
        writer.writerows(zip(scores, facts_answers, facts_grounds))
    print(f"Processing completed. Results saved to {output_file}")

# Judge the whole "train" split in batches of 100 rows with 3 worker threads.
process_in_batches(dataset_base['train'], batch_size=100, max_workers=3)


In [None]:
from datasets import Dataset
from openai import OpenAI
import os

# Credentials are read from the environment so that no API key is stored in the notebook.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_base=os.environ.get("OPENAI_API_BASE", "https://ai-yyds.com/v1"),
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

questions = []
answers = []
contexts = []
references = []
# NOTE(review): `indexes` (the row ids to evaluate) is not defined in this cell;
# it has to exist in the kernel already.
for index in indexes:
    row = test_ds[index]
    question = row['question']
    response = rag_chain.invoke({"input": question})
    answer = response['answer']
    # The retrieval chain returns the documents it used under "context",
    # so the retriever does not need to be queried a second time.
    context = [doc.page_content for doc in response['context']]
    reference = row["answer"]
    questions.append(question)
    answers.append(answer)
    contexts.append(context)
    references.append(reference)

# To dict
data = {
    "question":questions,
    "answer": answers,
    "contexts": contexts,
    "reference": references
}
print(references)
# Convert dict to dataset
dataset = Dataset.from_dict(data)
    
    

In [None]:
!pip install ragas

In [None]:
import os
from ragas import evaluate
from openai import OpenAI
from langchain_openai import ChatOpenAI
from datasets import Dataset, load_dataset
from ragas.metrics import (
    faithfulness,
    ResponseRelevancy,
    FactualCorrectness
)
from ragas.run_config import RunConfig
from ragas.metrics.base import MetricWithLLM, MetricWithEmbeddings
from ragas.llms import LangchainLLMWrapper
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas import RunConfig  
from ragas.metrics import *  
from langchain_openai import ChatOpenAI, OpenAIEmbeddings 
from ragas.llms import LangchainLLMWrapper  
from ragas.embeddings import LangchainEmbeddingsWrapper  
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import asyncio
from concurrent.futures import ThreadPoolExecutor
from ragas import SingleTurnSample 
import asyncio
from tqdm import tqdm
#dataset = load_dataset('csv', data_file = "/kaggle/input/complete-base-data/baseline_data_complete.csv")
# Credentials are read from the environment so that no API key is stored in the
# notebook; the endpoint can be overridden with OPENAI_API_BASE.
emb = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_base=os.environ.get("OPENAI_API_BASE", "https://ai-yyds.com/v1"),
)

llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_base=os.environ.get("OPENAI_API_BASE", "https://ai-yyds.com/v1"),
)

# ragas expects its own wrapper types around the LangChain objects.
llm_wrapper = LangchainLLMWrapper(llm)
emb_wrapper= LangchainEmbeddingsWrapper(emb)

# NOTE(review): this indexes `dataset['train']`, so `dataset` must be a split
# mapping here (e.g. the result of `load_dataset('csv', ...)`).
result = evaluate(dataset=dataset['train'],metrics=[Faithfulness()],llm=llm_wrapper, embeddings=emb_wrapper)

In [None]:
from ragas import SingleTurnSample 
from ragas.metrics import ResponseRelevancy

sample = SingleTurnSample(
        user_input="When was the first super bowl?",
        response="The first superbowl was held on Jan 15, 1967",
        retrieved_contexts=[
            "The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles."
        ]
    )

scorer = ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings)
await scorer.single_turn_ascore(sample)

In [None]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SemanticSimilarity
from ragas.embeddings import LangchainEmbeddingsWrapper

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from langchain.embeddings.openai import OpenAIEmbeddings
import os

os.environ["OPENAI_API_KEY"] = "sk-NMcE1XM1rCIO3XVa758b5045408f4c1eBa076e7aFc624f6d"
os.environ["OPENAI_API_BASE"] = "https://ai-yyds.com/v1"
sk-zjhl7UX5N1fEXC1P83C84b5a154146EaA75e4f4aE933E9Ef
# 初始化 OpenAI Embeddings（可以替换为您实际的 embedding 模型）
evaluator_embedding = OpenAIEmbeddings()

# 包装 embeddings
embeddings_wrapper = LangchainEmbeddingsWrapper(evaluator_embedding)

sample = SingleTurnSample(
    response="The Eiffel Tower is located in Paris.",
    reference="The Eiffel Tower is located in Paris. It has a height of 1000ft."
)

scorer = SemanticSimilarity(embeddings=LangchainEmbeddingsWrapper(embeddings_wrapper))
import asyncio
score = asyncio.run(scorer.single_turn_ascore(sample))

# 打印结果
print("Semantic Similarity Score:", score)