In [2]:
%%capture --no-stderr
!pip install --upgrade --quiet  langchain langchain-community langchainhub beautifulsoup4 datasets

In [None]:
from datasets import load_dataset
# Load the baseline CSV as a Hugging Face dataset; later cells read its rows via dataset['train'].
dataset = load_dataset('csv', data_files = "/kaggle/input/dataset/baseline_data_complete.csv")

In [1]:
# Prompt template for the faithfulness judge: asks the model to extract
# (subject, predicate, object) triples from the answer and from the context.
# Brace placeholders: {question}, {answer}, {context}.
# The "Context Facts:" / "Answer Facts:" headings requested here are the markers
# that extract_facts() searches for in the model's reply.
# NOTE(review): the opening sentence also mentions a "ground truth pair", but the
# template has no ground-truth placeholder - confirm whether that is intended.
# NOTE(review): this template asks for "Feedback::" (two colons) while the other
# prompts in this notebook use "Feedback:::" - confirm.
JUDGE_PROMPT_Faithfulness = """
You will be given a question, an answer, a context, and a ground truth pair. Your task is to extract all atomic facts from the answer and the corresponding part in the context that related the questions in the form of triples: (subject, predicate, object). Each fact should be represented as a triple where:

- `subject`: the entity performing or being described.
- `predicate`: the action, relation, or property of the subject.
- `object`: the target, recipient, or detail associated with the predicate.

Your task is to ensure that:

1. Each triple contains exactly one subject, one predicate, and one object.
2. Please only use the word in the context or answer
3. If a sentence contains multiple facts, list them all as separate triples.
4. You must use all provided context.

Now here are the details:
Question：{question}
Answer: {answer}
Context: {context}

The structure of your feedback should be as follows:

Feedback::

Context Facts:
1. (subject: subject, predicate: predicate, object: object)
2. (subject: subject, predicate: predicate, object: object)

Answer Facts:
1. (subject: subject, predicate: predicate, object: object)
2. (subject: subject, predicate: predicate, object: object)

Make sure that each fact from the answer is supported by an appropriate fact from the context in the same order.
"""



In [31]:
# Prompt template for the response-relevancy step: the model is asked to write
# three questions that the supplied context could answer.
# Brace placeholder: {context}.
# NOTE(review): the instruction list tells the model to read "the question, answer,
# context, and ground truth", but only the context is supplied - confirm intent.
# NOTE(review): the requested output puts "Feedback:::" and "Generated Questions:"
# on separate lines, whereas extract_judge_questions() defaults to the single-line
# marker "Feedback::: Generated Questions:".
JUDGE_PROMPT_Response_Relevancy = """
You will be given a context.
Your task is to generate the related questions based on the contexts. The idea is to see what kind of questions will need this context.

Here are the steps for evaluation:

### 1. **Generate questions**:
- Based on the context given, generated 3 different corresponding questions. The question should be able to reflect the content of the context.
- Please do not contain any specific information pointing to context, like "first context" or "second context"
### 2. **Provide Feedback**:
- **Generated Questions**: 3 AI-generated questions for the context

**Instructions for the evaluation**:
- Carefully read the question, answer, context, and ground truth.
- Generate 3 questions for the based on the contexts.
- Provide the **Generated Questions**

Now here are the details:

Context: {context}

Provide your feedback with the following format:

Feedback::: 
Generated Questions: 
1. question1
2. quetions2
3. question3

PLEASE Do Not GIVE ANY OTHER INFORMATION.
Thank you for your evaluation.
"""

In [126]:
# Prompt template for the factual-correctness judge: the model compares answer
# triples with ground-truth triples and reports a score between 0 and 1.
# Brace placeholders: {answer}, {ground_truth}.
# The original assignment target contained a space ("..._Factual Correctness"),
# which is a SyntaxError; the name below is a valid identifier.
# NOTE(review): the text mentions FP, but only TP and FN are defined and the
# formula is TP / (TP + FN); the steps are numbered from 2 - confirm intent.
JUDGE_PROMPT_Factual_Correctness = """
You will be given an answer and ground truth pair, both in (subject, predicate, object) tuple.
Your task is to provide a 'Factual Correctness' score, evaluating how factually consistent the generated answer is with the ground truths. The score ranges from 0 to 1, where 1 indicates perfect consistency.

Here are the steps for evaluation:

### 2. **Evaluate Factual Correctness**:
- After extracting the atomic facts, evaluate how consistent each fact is with the **retrieved context**. Check if the facts are supported by information from the context.
- **True Positive, False Negative and False Positive Formular**:  
  True Positive (TP) = Number of statements in generated answers that are present in ground truths
  False Negative (FN) = Number of statements in ground truth that are not present in generated answers
 
- **Factual Correctness Formula**
  |Factual Correctness| = TP / (TP + FN)
### 3. **Provide Feedback**:
- **Factual Correctness**: A score between 0 and 1 that represents how factually consistent the response is with the reference. 

**Instructions for the evaluation**:
- Carefully read the answer and ground truth.
- Check each answer fact against the ground truths facts to determine if it’s supported.
- Check ground truths facts against the each answer fact to determine if it’s supported.
- Calculate the TP, FP, FN value 
- calculate the  Factual Correctness with the Formula
- Provide a **Factual Correctness** score.

Now here are the details:

Answer: {answer}
Ground Truth: {ground_truth}

Provide your feedback with the following format:

Feedback:::
Factual Correctness: [Value between 0 and 1]  
 """

# generate_improved_judge_response() formats a template named JUDGE_PROMPT with
# `answer` and `ground_truth`, which are exactly this template's placeholders.
JUDGE_PROMPT = JUDGE_PROMPT_Factual_Correctness

In [127]:
import re
from typing import Optional


def extract_judge_score(answer: str, split_str: str) -> Optional[float]:
    """Pull the first numeric score out of a judge response.

    Args:
        answer: Raw text returned by the judge model.
        split_str: Marker that precedes the score (e.g. "Factual Correctness:").
            When the marker occurs in ``answer`` only the text after its first
            occurrence is searched; otherwise the whole answer is searched.

    Returns:
        The first integer or decimal number found, as a float, or ``None`` when
        ``answer`` is not a string or contains no number.
    """
    if not isinstance(answer, str):
        print(f"extract_judge_score: expected a string response, got {type(answer).__name__}")
        return None

    rating = answer
    if split_str and split_str in answer:
        # maxsplit=1 keeps everything after the first marker, even if the
        # marker is repeated later in the response.
        rating = answer.split(split_str, 1)[1]

    number = re.search(r"\d+(?:\.\d+)?", rating)
    if number is None:
        print("extract_judge_score: no numeric score found in response")
        return None
    return float(number.group())

def extract_judge_facts(answer: str, split_str: str) -> list:
    """Collect the numbered, parenthesised facts from a judge response.

    Args:
        answer: Raw text returned by the judge model.
        split_str: Marker preceding the fact list. If it occurs in ``answer``,
            the second piece of ``answer.split(split_str)`` is parsed;
            otherwise the whole answer is parsed.

    Returns:
        A list of ``(number, fact_text)`` string tuples, one per item of the
        form ``N. (...)``. Any exception is printed and yields ``[]``.
    """
    numbered_fact = r'(\d+)\.\s*\((.*?)\)'
    try:
        section = answer.split(split_str)[1] if split_str in answer else answer
        return [(number, body) for number, body in re.findall(numbered_fact, section)]
    except Exception as e:
        print(f"Error: {e}")
        return []

import re

def extract_judge_questions(answer: str, split_str: str = "Feedback::: Generated Questions:") -> list:
    """Extract the numbered questions from a judge response.

    Args:
        answer: Raw text returned by the judge model.
        split_str: Marker that precedes the question list. Whitespace inside the
            marker matches any run of whitespace in ``answer`` (including a line
            break), so a reply that puts "Feedback:::" and "Generated Questions:"
            on separate lines is still recognised. If the marker is absent the
            whole answer is scanned.

    Returns:
        The question texts in order of appearance, without their numbering.
        Returns ``[]`` when ``answer`` is not a string or holds no numbered lines.
    """
    if not isinstance(answer, str):
        print(f"Error: expected a string response, got {type(answer).__name__}")
        return []

    section = answer
    marker_parts = split_str.split() if split_str else []
    if marker_parts:
        marker_pattern = r"\s+".join(re.escape(part) for part in marker_parts)
        marker = re.search(marker_pattern, answer)
        if marker is not None:
            section = answer[marker.end():]

    # Anchor on line starts so a "2.0"-style token in the middle of a sentence
    # is not mistaken for the start of a numbered item.
    numbered_lines = re.findall(r"^[ \t]*\d+\.[ \t]*(.+)$", section, flags=re.MULTILINE)
    return [line.strip() for line in numbered_lines if line.strip()]

def generate_improved_judge_response(row):
    """Ask the chat model to judge one answer against its ground truth.

    Args:
        row: Mapping with the keys 'Answer Facts String' and
            'Related Grounds String'; both values are inserted into the
            module-level JUDGE_PROMPT template.

    Returns:
        The text content of the first choice returned by the chat completion.

    Raises:
        RuntimeError: If the OPENAI_API_KEY environment variable is not set.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Never embed credentials in the notebook: read them from the environment.
    # NOTE(review): the key that used to be hardcoded here should be treated as
    # compromised and rotated.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before calling the judge.")
    # The original code ended up talking to this endpoint; it can be overridden
    # through OPENAI_BASE_URL.
    base_url = os.environ.get("OPENAI_BASE_URL", "https://ai-yyds.com/v1")

    # JUDGE_PROMPT must be a module-level template with {answer} and
    # {ground_truth} placeholders.
    prompt = JUDGE_PROMPT.format(answer=row['Answer Facts String'], ground_truth=row['Related Grounds String'])
    client = OpenAI(api_key=api_key, base_url=base_url)

    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are proficient in extract atomic facts from sentences and compare the meaning of different sentences."},
            {"role": "user", "content": prompt}
        ],
        model="gpt-4o-mini"
    )
    return response.choices[0].message.content


def extract_facts(answer: str) -> dict:
    """Parse the "Context Facts:" and "Answer Facts:" sections of a judge reply.

    Args:
        answer: Raw text returned by the judge model.

    Returns:
        A dict with the keys "Context Facts" and "Answer Facts"; each value is a
        list of ``{"subject", "predicate", "object"}`` dicts built from triples
        written as ``(subject: ..., predicate: ..., object: ...)``. A section
        that is not found stays an empty list. If any exception occurs it is
        printed and an empty dict is returned.
    """
    # Section headings that delimit the two fact lists.
    context_heading = "Context Facts:"
    answer_heading = "Answer Facts:"
    triple_pattern = r'\(\s*subject:\s*(.*?),\s*predicate:\s*(.*?),\s*object:\s*(.*?)\s*\)'

    def to_triples(section):
        triples = []
        for subject, predicate, obj in re.findall(triple_pattern, section):
            triples.append({
                "subject": subject.strip(),
                "predicate": predicate.strip(),
                "object": obj.strip(),
            })
        return triples

    try:
        # Result container, pre-filled so both keys are always present on success.
        extracted = {"Context Facts": [], "Answer Facts": []}

        context_match = re.search(f"{context_heading}(.*?){answer_heading}", answer, re.DOTALL)
        answer_match = re.search(f"{answer_heading}(.*)", answer, re.DOTALL)

        if context_match is not None:
            extracted["Context Facts"] = to_triples(context_match.group(1).strip())
        if answer_match is not None:
            extracted["Answer Facts"] = to_triples(answer_match.group(1).strip())

        return extracted
    except Exception as e:
        print(f"Error: {e}")
        return {}
        


In [136]:
def build_fact_sentences(answer_facts):
    """Render fact dicts as a numbered, newline-separated block of text.

    Args:
        answer_facts: Iterable of mappings with the keys 'subject',
            'predicate' and 'object'.

    Returns:
        One line per fact in the form
        ``N. (Subject:..., Predicate:..., Object:...)``, numbered from 1 and
        joined with newlines (an empty string for no facts).
    """
    lines = []
    for position, fact in enumerate(answer_facts, start=1):
        lines.append(
            f"{position}. (Subject:{fact['subject']}, Predicate:{fact['predicate']}, Object:{fact['object']})"
        )
    return "\n".join(lines)

def build_related_context_string(related_contexts):
    """Join the given context strings into one string separated by single spaces."""
    separator = " "
    return separator.join(related_contexts)

def process_data_list(data_list):
    """Turn fact records into the string pairs consumed by the judge prompt.

    Args:
        data_list: Iterable of mappings with the keys 'Answer Facts' and
            'Ground Truth Facts', each holding a list of fact dicts.

    Returns:
        A list with one dict per input record, holding the rendered facts under
        "Answer Facts String" and "Related Grounds String".
    """
    return [
        {
            "Answer Facts String": build_fact_sentences(entry['Answer Facts']),
            "Related Grounds String": build_fact_sentences(entry['Ground Truth Facts']),
        }
        for entry in data_list
    ]

# Build the judge inputs and show the first one as a sanity check.
# NOTE(review): loaded_facts_results is not assigned above this cell; the only
# assignment visible in this notebook is in a later cell - confirm run order.
result_list = process_data_list(loaded_facts_results)
print(result_list[0])

{'Answer Facts String': '1. (Subject:The music director of the Quebec Symphony Orchestra, Predicate:is, Object:Fabien Gabel)', 'Related Grounds String': '1. (Subject:The music director of the Quebec Symphony Orchestra, Predicate:is, Object:Fabien Gabel)'}


In [134]:
from datasets import load_dataset
# Reload the baseline CSV so the clean-up below starts from the file contents.
dataset = load_dataset("csv", data_files="/kaggle/input/dataset/baseline_data_complete.csv")
def clean_context(context):
    """Strip list/quote wrapper characters from both ends of a context string.

    Args:
        context: String such as ``"['some text']"``.

    Returns:
        ``context`` without any leading run of ``'``, ``[`` or ``"`` characters
        and without any trailing run of ``]``, ``"`` or ``'`` characters.
    """
    # lstrip/rstrip treat their argument as a set of characters, so any mix of
    # these wrapper characters at either end is removed.
    return context.lstrip("'[\"").rstrip("]\"'")

def update_ground_truths(example):
    """Replace ``example["contexts"]`` with its cleaned form.

    Despite the name, only the "contexts" field is touched. Any exception
    raised while cleaning is printed and the example is returned as it was.

    Args:
        example: Mutable mapping for one dataset row.

    Returns:
        The same ``example`` object.
    """
    try:
        example["contexts"] = clean_context(example['contexts'])
    except Exception as e:
        print(f"Error: {e}")
    return example

# Apply the context clean-up to every example of the dataset.
dataset = dataset.map(update_ground_truths)
# NOTE(review): this prints result_list, not the freshly mapped dataset - confirm which was intended.
print(result_list[0])

    

{'Answer Facts String': '1. (Subject:The music director of the Quebec Symphony Orchestra, Predicate:is, Object:Fabien Gabel)', 'Related Grounds String': '1. (Subject:The music director of the Quebec Symphony Orchestra, Predicate:is, Object:Fabien Gabel)'}


In [137]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import openai
import csv
import time
from datasets import load_dataset
from threading import Semaphore
from openai import OpenAI

# Shared limit for the judge calls below: it sizes the semaphore and the thread
# pool, and each call is padded to last at least 1 / MAX_RATE seconds.
MAX_RATE = 5  # maximum number of judge requests in flight at once
semaphore = Semaphore(MAX_RATE)
def process_data(i, data):
    """Run the judge for one item and return its parsed score.

    Args:
        i: Identifier passed back to the caller unchanged.
        data: Row handed to generate_improved_judge_response().

    Returns:
        ``(i, score)`` where ``score`` is whatever extract_judge_score() returns
        for the "Factual Correctness:" marker, or ``(i, [])`` if anything fails.
    """
    try:
        with semaphore:
            started = time.time()

            report = generate_improved_judge_response(data)
            facts = extract_judge_score(report, "Factual Correctness:")

            # Pad fast calls so each one occupies its slot for at least 1 / MAX_RATE seconds.
            remaining = 1 / MAX_RATE - (time.time() - started)
            if remaining > 0:
                time.sleep(remaining)

        return i, facts

    except Exception as e:
        print(f"Error in process_data for index {i}: {e}")
        return i, []  # fall back to an empty default result

# One slot per row of result_list; rows that are never processed stay None.
facts_results = [None] * len(result_list)

# NOTE(review): `indexes` is not defined in this cell; it is used here as an
# iterable of integer positions into result_list - confirm against its source.
with ThreadPoolExecutor(max_workers=MAX_RATE) as executor:
    # Keep every future paired with the index it was submitted for, so the
    # error branch below always knows which slot to mark.
    submitted = [
        (row_index, executor.submit(process_data, row_index, result_list[row_index]))
        for row_index in indexes
    ]

    for row_index, future in tqdm(submitted, total=len(submitted)):
        try:
            _, facts = future.result()
            facts_results[row_index] = facts
        except Exception as e:
            print(f"Error processing future for index {row_index}: {e}")
            facts_results[row_index] = -1  # sentinel for a failed item

print("Processing complete.")

100%|██████████| 207/207 [17:03<00:00,  4.95s/it]

Processing complete.





In [74]:
import json
# Load previously saved results from the Kaggle working directory.
with open("/kaggle/working/new_questions.json", "r", encoding="utf-8") as file:
    loaded_facts_results = json.load(file)

# Show one entry as a sanity check.
print(loaded_facts_results[8])

['What makes the TIFF Bell Lightbox a unique venue for screening classic films?', 'How does the audience\'s diversity enhance the experience of watching "Dial M for Murder"?', 'What factors contribute to the acclaim of "Dial M for Murder" as a classic film?']


In [47]:
# Overlay freshly produced results onto the loaded ones; slots that are still
# None are left as loaded.
for i, question in enumerate(facts_results):
    if question is not None:  # identity check is the idiomatic None test
        loaded_facts_results[i] = question
        

In [60]:
from tqdm import tqdm
from langchain_openai import ChatOpenAI, OpenAIEmbeddings 
import openai
from openai import OpenAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
import os

dataset = load_dataset('csv', data_files="/kaggle/input/dataset/baseline_data_complete.csv")

# Never embed credentials in the notebook: read them from the environment.
# NOTE(review): the key that used to be hardcoded here should be treated as
# compromised and rotated.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# The original code ended up talking to this endpoint; it can be overridden
# through OPENAI_BASE_URL.
client = OpenAI(
    api_key=_api_key,
    base_url=os.environ.get("OPENAI_BASE_URL", "https://ai-yyds.com/v1"),
)
    
def get_text_embedding(text):
    """Return the embedding of ``text`` from the "text-embedding-3-large" model.

    Uses the module-level ``client`` and returns the ``embedding`` attribute of
    the first item in the response data.
    """
    response = client.embeddings.create(input=text, model="text-embedding-3-large")
    first_item = response.data[0]
    return first_item.embedding

def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors as a single value.

    Each vector is wrapped in a one-row list for ``cosine_similarity`` and the
    only cell of the resulting matrix is returned.
    """
    similarity_matrix = cosine_similarity([vector1], [vector2])
    return similarity_matrix[0][0]
    
# Read the question column once instead of on every iteration.
questions = dataset['train']['question']

# For every selected row, compare the original question with each generated
# question through their embeddings.
results = []
for entry in tqdm(indexes):
    row_index = entry["index"]
    question_embedding = get_text_embedding(questions[row_index])
    similarities = [
        calculate_cosine_similarity(question_embedding, get_text_embedding(generated_question))
        for generated_question in loaded_facts_results[row_index]
    ]
    results.append({"index": row_index, "similarities": similarities})



100%|██████████| 424/424 [11:28<00:00,  1.62s/it]


In [75]:
from tqdm import tqdm
from langchain_openai import ChatOpenAI, OpenAIEmbeddings 
import openai
from openai import OpenAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
import os

dataset = load_dataset('csv', data_files="/kaggle/input/dataset/baseline_data_complete.csv")

# Never embed credentials in the notebook: read them from the environment.
# NOTE(review): the key that used to be hardcoded here should be treated as
# compromised and rotated.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# The original code ended up talking to this endpoint; it can be overridden
# through OPENAI_BASE_URL.
client = OpenAI(
    api_key=_api_key,
    base_url=os.environ.get("OPENAI_BASE_URL", "https://ai-yyds.com/v1"),
)
def get_text_embedding(text):
    """Return the embedding of ``text`` from the "text-embedding-3-large" model.

    Uses the module-level ``client`` and returns the ``embedding`` attribute of
    the first item in the response data.
    """
    response = client.embeddings.create(input=text, model="text-embedding-3-large")
    first_item = response.data[0]
    return first_item.embedding

def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors as a single value.

    Each vector is wrapped in a one-row list for ``cosine_similarity`` and the
    only cell of the resulting matrix is returned.
    """
    similarity_matrix = cosine_similarity([vector1], [vector2])
    return similarity_matrix[0][0]
# Spot-check one row: compare its question with each generated question.
# The row index is named once so the dataset lookup and the generated-question
# lookup cannot drift apart.
SAMPLE_INDEX = 1790

similarities = []
emb1 = get_text_embedding(dataset['train']['question'][SAMPLE_INDEX])
for question in loaded_facts_results[SAMPLE_INDEX]:
    emb2 = get_text_embedding(question)
    similarity = calculate_cosine_similarity(emb1, emb2)
    similarities.append(similarity)
print(similarities)

[]
