In [6]:
import sys
# Make the openai library importable: put its install location in the string below.
sys.path.append('')

In [2]:
import openai
from openai import OpenAI

In [3]:
import os
import dotenv
from dotenv import load_dotenv
import time
import openai

# Load environment variables from the .env file.
# Set dotenv_path to the 'env' file containing the relevant OPENAI API key (preferably one that can use GPT-4o).
load_dotenv(dotenv_path='')
# The key is read from the environment; a missing OPENAI_API_KEY raises KeyError here.
api_key = os.environ["OPENAI_API_KEY"]
# Shared client used for the chat completion calls below.
client = OpenAI(api_key=api_key)

# All Error Evaluators

### Loading the 'Civ. Pro.' Dataset
The dataset should contain the following columns: 1. the legal context ('Context' in the dataframe 'df'), 2. the question ('Question'), 3. the options ('Options'), 4. the expert answer ('Analysis'), and 5. the LLM-generated responses that are to be evaluated for errors.

In [None]:
import pandas as pd
# Path to the CSV file holding the 'Civ. Pro.' dataset (left blank here; fill it in before running).
input_file = ''
df = pd.read_csv(input_file)
df

In [6]:
import requests
# OpenAI API call handling for timeout issues
def get_text_completion_from_GPT(system_prompt, prompt, model="gpt-4o", max_retries=3, retry_delay=2):
    """Send a system + user message pair to the chat completions API, retrying on transient failures.

    Args:
        system_prompt: Content of the "system" message.
        prompt: Content of the "user" message.
        model: Model name passed to the API.
        max_retries: Maximum number of attempts; must be at least 1.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The text content of the first choice in the response.

    Raises:
        ValueError: If max_retries is smaller than 1 (no attempt can be made).
        Exception: The last timeout/connection error, once every attempt has failed.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    # The `OpenAI` client reports timeouts/connection failures through its own
    # exception classes, so those are retried alongside the `requests` ones.
    retryable_errors = (
        requests.exceptions.RequestException,
        openai.APITimeoutError,
        openai.APIConnectionError,
    )
    # Python unbinds the `except ... as` name when the clause ends, so the
    # failure is kept in a separate variable to re-raise it after the loop.
    last_error = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,  # Degree of randomness of the model's output
                frequency_penalty=0,
                presence_penalty=0,
                seed=42,
            )
            return response.choices[0].message.content
        except retryable_errors as error:
            last_error = error
            # No point in waiting after the final attempt.
            if attempt < max_retries - 1:
                time.sleep(retry_delay)
    if last_error is None:
        raise ValueError("max_retries must be at least 1")
    # If all attempts fail, raise the last exception
    raise last_error

## Premise-Level

In [7]:
def extract_relevant_info(legal_context_question_options):
    """Ask the model to list the facts, concepts and legal rules present in the input.

    Args:
        legal_context_question_options: Text to extract from; it is formatted
            into the user message as-is.

    Returns:
        The extraction text returned by get_text_completion_from_GPT.
    """
    # The system message limits the model to extraction (it must not answer the question).
    system_message = """You are an expert information-extraction tool. You are provided with a legal context, a multiple-choice question, options and gold standard answer. You need to:
Extract and list ALL of the important facts, important concepts, legal rules, precedents, and analogous/similar examples strictly from the provided data. 

### Instructions:
1. Do NOT solve the question.
2. Extract all the aspects and sub-points of the legal rules verbatim.
"""
    user_message = f"""{legal_context_question_options}"""
    return get_text_completion_from_GPT(system_message, user_message)

def extract_relevant_info_with_expert_answer(gold_standard_answer, extracted_text):
    """Merge an earlier extraction with the points found in the expert answer.

    Args:
        gold_standard_answer: Expert (gold standard) answer for the question.
        extracted_text: Information extracted previously from the legal context,
            question and options.

    Returns:
        The combined extraction text returned by get_text_completion_from_GPT.
    """
    system_message = """You are an expert information-extraction tool. You are provided with extracted information from the legal context, a multiple-choice question, options. You are also provided with the expert answer (gold standard answer) to the the provided legal scenario. Extract the relevant points/considerations from it. Supplement and augment it with the previously extracted information. Present the final extraction after processing both of the provided extractions.

### Instructions:
1. Do NOT solve the question.
2. Extract all the aspects and sub-points of the legal rules verbatim.
"""
    # The user message carries the earlier extraction first, then the expert answer.
    user_message = f"""### Previously Extracted Information:
{extracted_text}

### Expert Answer for the given question:
{gold_standard_answer}

Do NOT try to answer the question. 
"""
    return get_text_completion_from_GPT(system_message, user_message)

### Error Detector 1: Simple Explanation-Based Error Detector

In [8]:
# Knowledge base inserted into the simple error checker's system prompt: describes the
# three error types and gives one annotated example. It has no placeholders, so a plain
# (non-f) string yields the same text.
simple_err_knowledge_base = """
This Knowledge Base contains the types of errors predominantly committed by Large Language Model(LLM) based Reasoners while generating reasoning chains. 

### 1. Misinterpretation: 

Misinterpretation is the error where the LLM misinterprets some part or entirety of the legal context or the question provided to it.
This can occur due to ambiguities, limitations in the model’s training, or the complex nature of language. Misinterpretation can affect the reliability of the model’s output, leading to incorrect information and reasoning.

Misinterpretations can occur in the following ways:
1. Failing to consider or ommitting  some important aspects of the given context, leading to a misunderstanding.
For Example: Misunderstanding a sarcastic comment as a serious statement because the context of sarcasm was missed.
2. Misinterpreting the logical relationship between ideas, such as cause and effect, or failing to follow an argument’s reasoning correctly.
For Example: Assuming that correlation implies causation.
3. Misunderstanding the meaning of a word or phrase.
For Example: Confusing homophones like "bare" and "bear."
4. Misunderstanding, specifically, numerical data, statistical information or mathematical concepts.
For Example: Misinterpreting percentages or probability, like thinking that a 30% chance of rain means it will rain 30% of the time.
5. Misunderstanding the timing or sequence of events.
For Example: Misinterpreting past events as future intentions or vice versa.
6. Misunderstanding specialized terminology used within a particular field.
For Example: Misinterpreting medical jargon without the proper context or knowledge.

### 2. Hallucination: 

Hallucinations refer to premises that appear to be coherent, plausible, or factual but are not grounded in reality or accurate information.
Factuality Hallucination: This occurs when the premises have factually incorrect content. For instance, a premise might claim that Charles Lindbergh was the first to walk on the moon, which is a factual error.
The examples of Factuality Hallucinations are:
1. Factual Inconsistency: The premise incorrectly states Yuri Gagarin as the first person to land on the Moon (the correct answer is Neil Armstrong).
2. Factual Fabrication: The premise creates a fictitious narrative about unicorns in Atlantis, claiming they were documented to have existed around 10,000 BC and were associated with royalty despite no real-world evidence to support this claim.

### 3. Irrelevant premise: 

Irrelevant premise refer to the LLM generating a premise which is not relevant in solving the question/may divert the reasoning chain from solving the question.

Example 1 of Annotative Evaluation:
Legal Context: 
Chief Justice Marshall’s Strawbridge  Rule: The requirement of complete diversity Very early on, the Supreme Court held that diversity jurisdiction is only proper if all plaintiffs are citizens of different states from all defendants. Strawbridge v. Curtiss , 7 U.S. 267 (1806). You can have multiple defendants from the same state, and multiple plaintiffs from the same state, but no plaintiff can be from the same state as any defendant or her presence will ‘‘destroy diversity.’’ (In the real world, of course, a plaintiff can cure the defect by dropping the non diverse plaintiff, or a defendant from the plaintiff’s state, to ‘‘perfect diversity.’’) A twist in applying the  Strawbridge  rule arises in cases involving corporations. The Supreme Court has held that corporations are state ‘‘citizens,’’ but courts cannot determine their citizenship by the domicile test, since a corporation can’t have ‘‘intent’’ the way a person can. Consequently, Congress has defined the ‘‘state citizenship’’ of a corporation in the diversity statute itself. 28 U.S.C. §1332(c)(1) provides that a corporation is a citizen of the state in which it is incorporated, and also the state in which it has its principal place of business.

Question: 
In which of the following cases would the federal court  lack diversity jurisdiction? (Assume in each case that the suit is for more than $75,000.)
 
Options:
A.  Madison, from Virginia, Jefferson, from Virginia, and Gerry, from Massachusetts, sue Hamilton, from New York and Franklin, from Pennsylvania. 
B.  Madison, from Virginia, sues Lafayette, from Maryland, and Washington Corporation, incorporated in Delaware with its principal place of business in Maryland, and a large office in Virginia. 
C.  Madison, from Virginia, sues Adams Corporation, incorporated in Delaware with its principal place of business in Virginia. 
D.  The court lacks jurisdiction in choices B and C.

LLM's reasoning chain:
1. The Strawbridge Rule requires that all plaintiffs be citizens of different states from all defendants for diversity jurisdiction to be proper. 
Error Category: NO ERROR 
Explanation: Correct restatement of the rule.
2. In option A, all plaintiffs (Madison, Jefferson, and Gerry) are from the same state (Virginia), while all defendants (Hamilton and Franklin) are from different states (New York and Pennsylvania), respectively. Therefore, diversity jurisdiction is proper in this case. 
Error Category: ERROR 
Explanation: Gerry is actually from Massachusetts. (Factual Inconsistency Hallucination)
3. In option B, Madison is from Virginia, Lafayette is from Maryland, and Washington Corporation is incorporated in Delaware with its principal place of business in Maryland. Since Madison is from the same state as one defendant (Lafayette), diversity jurisdiction is not proper in this case. 
Error Category:  ERROR  
Explanation: In the second sentence, it is factually incorrect that Madison (Virginia) and Lafayette (Maryland) are from the same state.
Error Category: ERROR 
Explanation: The LLM has misinterpreted that Madison is from the same state as Lafayette disrupting the diversity of the case. The LLM has generates facts correctly at first (Madison is from Virginia, Lafayette is from Maryland) but later misinterpreted Madison and Lafayette being from the same state.
4. In option C, Madison is from Virginia, and Adams Corporation is incorporated in Delaware with its principal place of business in Virginia. Since all plaintiffs and defendants are from different states, diversity jurisdiction is proper in this case.
Error Category: ERROR  
Explanation: It is stated in the legal context that a corporation is a citizen of both the state it is incorporated in and its principal place of business, but LLM misunderstood/ommitted this aspect. Hence the corporation is not diverse from Madison as both of them are from Virginia.

[Final Answer]
The court lacks jurisdiction in choice B.
"""
def simple_err_checker(row, legal_context_question_options, llm_rc):
    """Label every premise of an LLM reasoning chain as ERROR / NO ERROR.

    Args:
        row: Record whose 'Analysis' entry is sent as Reasoning Chain 1
            (the gold-standard answer, per the prompt).
        legal_context_question_options: Legal context, question and options text.
        llm_rc: The LLM-generated reasoning chain, sent as Reasoning Chain 2.

    Returns:
        A tuple (raw model verdicts, summary produced by error_summarizer).
    """
    system_message = f"""You are an expert in identifying errors in provided reasoning chains. Specifically, these reasoning chains are obtained from a legal MCQA task. From the reasoning chain, you have to classify each reasoning step with the presence/absence of an error in the step. If you do not find any error in reasoning step, assign "NO ERROR" label to it. To understand some errors which can occur, please refer to provided knowledge base. While finding errors in provided reasoning chains, you can refer to gold-standard reasoning chain for better understanding.

### Knowledge Base of the Errors:
{simple_err_knowledge_base}

### Trigger: 
The user submits a legal context, question and options and reasoning chains 1 and 2. Reasoning Chain 1 is the gold-standard answer. Reasoning Chain 2 is the reasoning chain provided by a LLM to solve the question.

### Instructions: 
Take your time to reiterate the clues given in the legal problem. Take a deep breath and follow these IMPORTANT POINTS given below:
1. Keep in mind that in the reasoning chain, only final statement with the final answer is the conclusion. All preceding statements are premises.
2. You have to find the errors ONLY in the premises, not the conclusion.
3. Take your time to go through ALL the errors given in the knowledge base above. Assign the explanation which matches the most with one of them.
4. Keep in mind that each premise can contain multiple errors of the same or different types.
5. Carefully go through the legal context and gold standard reasoning chain before deciding whether some premise is irrelevant or not.

### Structure of Response:
Suppose if there are 2 premises and 1 conclusion in provided reasoning chain, structure your response in the following way:
Premise 1: <Content of premise 1>
Error Category: [ERROR/NO ERROR]
Explanation: <Justify your error category>

Premise 2: <Content of premise 2>
Error Category: [ERROR/NO ERROR]
Explanation: <Justify your error category>

Conclusion: <Content of Conclusion>
[No error detection to be done for the conclusion!]
"""
    # User message: the legal context first, then the gold chain and the chain under review.
    context_part = "Legal Context:\n" + legal_context_question_options
    chains_part = f"\nReasoning Chain 1:\n{row['Analysis']}\nReasoning Chain 2:\n{llm_rc}"
    verdicts = get_text_completion_from_GPT(system_message, context_part + chains_part)
    # error_summarizer is defined outside this section of the notebook.
    return verdicts, error_summarizer(verdicts)

### Error Detector 2: Misinterpretation Detection

In [9]:
mis_knowledge_base = f"""
This Knowledge Base contains the types of misinterpretations predominantly committed by Large Language Model(LLM) based Reasoners while generating reasoning chains.

Misinterpretation is the error where the LLM misinterprets some part or entirety of the legal context or the question provided to it.
This can occur due to ambiguities, limitations in the model’s training, or the complex nature of language. Misinterpretation can affect the reliability of the model’s output, leading to incorrect information and reasoning.

Misinterpretation errors can occur in the following ways:
1. Failing to consider some important aspects of the given context, leading to a misunderstanding.
Example: Misunderstanding a sarcastic comment as a serious statement because the context of sarcasm was missed.

2. Misinterpreting the logical relationship between ideas, such as cause and effect, or failing to follow an argument’s reasoning correctly.
Example: Assuming that correlation implies causation.

3. Misunderstanding the meaning of a word or phrase.
Example: Confusing homophones like "bare" and "bear."

4. Drawing incorrect inferences from the prior-generated text by LLMs.
Example: Inferring that someone is angry based on their terse email, when they were simply in a hurry.

5. Misunderstanding, specifically, numerical data, statistical information or mathematical concepts.
Example: Misinterpreting percentages or probability, like thinking that a 30% chance of rain means it will rain 30% of the time.

6. Misunderstanding the timing or sequence of events.
Example: Misinterpreting past events as future intentions or vice versa.

7. Misunderstanding specialized terminology used within a particular field.
Example: Misinterpreting medical jargon without the proper context or knowledge.
"""

def misinterpretation_checker(row, legal_context_question_options, llm_rc):
    """Label every premise of an LLM reasoning chain for misinterpretation errors.

    Args:
        row: Record whose 'Analysis' entry is sent as Reasoning Chain 1
            (the gold-standard answer, per the prompt).
        legal_context_question_options: Legal context, question and options text.
        llm_rc: The LLM-generated reasoning chain, sent as Reasoning Chain 2.

    Returns:
        A tuple (raw model verdicts, summary produced by error_summarizer).
    """
    # NOTE(review): the first paragraph asks for an "OTHER" label while the rest of the
    # prompt uses "NO MISINTERPRETATION"; the prompt text is kept as it was - confirm intent.
    misinterpretation_system_prompt =f"""You are an expert in identifying errors in provided reasoning chains. Specifically, these reasoning chains are obtained from a legal MCQA task. From the reasoning chain, you have to classify each reasoning step with the presence/absence of a misinterpretation error in the step. You also have to explain why a step is classified as such. If you do not find any misinterpretation error in reasoning step, assign "OTHER" label to it. To understand some errors which can occur, please refer to provided knowledge base. While finding errors in provided reasoning chains, you can refer to gold-standard reasoning chain for better understanding.

### Trigger:
The user submits a legal context, question and options and reasoning chains 1 and 2. Reasoning Chain 1 is the gold-standard answer. Reasoning Chain 2 is the reasoning chain provided by a LLM to solve the question.

### Instructions:
Take your time to reiterate the clues given in the legal problem. Take a deep breath and follow these IMPORTANT POINTS given below:
1. Keep in mind that in the reasoning chain, only final statement with the final answer is the conclusion. All preceding statements are premises.
2. You have to find the errors of misinterpretations in the premises, not the conclusion.
3. Assign the most appropriate type of misinterpretation in reasoning chain 2. If there is no fitting type, assign the NO MISINTERPRETATION category.

### Knowledge Base of the Errors:
{mis_knowledge_base}

### Structure of Response:
Suppose if there are 2 premises and 1 conclusion in reasoning chain 2, structure your response in the following way:
Premise 1: <Content of premise 1>
Error Category: [MISINTERPRETATION/NO MISINTERPRETATION]
Explanation: <Justify your error category>

Premise 2: <Content of premise 2>
Error Category: [MISINTERPRETATION/NO MISINTERPRETATION]
Explanation: <Justify your error category>

Conclusion: <Content of Conclusion>
Error Category: [Conclusion is not Checked for Misinterpretations]
Explanation: <Conclusion is not Checked for Misinterpretations>

Example 1 of Annotative Evaluation:
Legal Context: 
Chief Justice Marshall’s Strawbridge  Rule: The requirement of complete diversity Very early on, the Supreme Court held that diversity jurisdiction is only proper if all plaintiffs are citizens of different states from all defendants. Strawbridge v. Curtiss , 7 U.S. 267 (1806). You can have multiple defendants from the same state, and multiple plaintiffs from the same state, but no plaintiff can be from the same state as any defendant or her presence will ‘‘destroy diversity.’’ (In the real world, of course, a plaintiff can cure the defect by dropping the non diverse plaintiff, or a defendant from the plaintiff’s state, to ‘‘perfect diversity.’’) A twist in applying the  Strawbridge  rule arises in cases involving corporations. The Supreme Court has held that corporations are state ‘‘citizens,’’ but courts cannot determine their citizenship by the domicile test, since a corporation can’t have ‘‘intent’’ the way a person can. Consequently, Congress has defined the ‘‘state citizenship’’ of a corporation in the diversity statute itself. 28 U.S.C. §1332(c)(1) provides that a corporation is a citizen of the state in which it is incorporated, and also the state in which it has its principal place of business.

Question: 
In which of the following cases would the federal court  lack diversity jurisdiction? (Assume in each case that the suit is for more than $75,000.)
 
Options:
A.  Madison, from Virginia, Jefferson, from Virginia, and Gerry, from Massachusetts, sue Hamilton, from New York and Franklin, from Pennsylvania. 
B.  Madison, from Virginia, sues Lafayette, from Maryland, and Washington Corporation, incorporated in Delaware with its principal place of business in Maryland, and a large office in Virginia. 
C.  Madison, from Virginia, sues Adams Corporation, incorporated in Delaware with its principal place of business in Virginia. 
D.  The court lacks jurisdiction in choices B and C.

LLM's reasoning chain:
1. The Strawbridge Rule requires that all plaintiffs be citizens of different states from all defendants for diversity jurisdiction to be proper. 
Error Category: NO MISINTERPRETATION 
Explanation: Correct restatement of the rule.
2. In option A, all plaintiffs (Madison, Jefferson, and Gerry) are from the same state (Virginia), while all defendants (Hamilton and Franklin) are from different states (New York and Pennsylvania), respectively. Therefore, diversity jurisdiction is proper in this case. 
Error Category: NO MISINTERPRETATION  
Explanation: No Misinterpretation. Gerry is actually from Massachusetts which is Factual Hallucination
3. In option B, Madison is from Virginia, Lafayette is from Maryland, and Washington Corporation is incorporated in Delaware with its principal place of business in Maryland. Since Madison is from the same state as one defendant (Lafayette), diversity jurisdiction is not proper in this case. 
Error Category:  MISINTERPRETATION 
Explanation: Madison is not from the same state as Lafayette, but the LLM has falsed assumed that she is, which has led to misinterpretation that there is no diversity in the case 
4. In option C, Madison is from Virginia, and Adams Corporation is incorporated in Delaware with its principal place of business in Virginia. Since all plaintiffs and defendants are from different states, diversity jurisdiction is proper in this case.
Error Category: MISINTERPRETATION  
Explanation: It is stated in the legal context that a corporation is a citizen of both the state it is incorporated in and its principal place of business, but this has been misinterpreted. Hence the corporation is not diverse from Madison as both of them are from Virginia.

[Final Answer]
The court lacks jurisdiction in choice B.
Error Category: [Conclusion is not checked for misinterpretations]
"""
    # Start the user message with the legal context (as simple_err_checker does);
    # previously input_prompt was read here before ever being assigned.
    input_prompt = "Legal Context:\n" + legal_context_question_options
    input_prompt = input_prompt + f"\nReasoning Chain 1:\n{row['Analysis']}\nReasoning Chain 2:\n{llm_rc}"
    misinterpretation_check = get_text_completion_from_GPT(misinterpretation_system_prompt, input_prompt)
    # error_summarizer is defined outside this section of the notebook.
    misinterpretation_check_summary = error_summarizer(misinterpretation_check)
    return misinterpretation_check, misinterpretation_check_summary

### Error Detector 3: Factual Hallucinations

In [10]:
def validation_question_generation(llm_rc):
    """Have the model write validation questions for each premise of a reasoning chain.

    Args:
        llm_rc: Reasoning chain (premises and conclusion) to generate questions for.

    Returns:
        The model output returned by get_text_completion_from_GPT (the prompt asks for
        each premise followed by its validation questions).
    """
    system_message = """You are an expert in analysing the premises and the conclusion comprising a reasoning chain. Your task is to create validation questions to check the factuality of sentences present in the premises and the conclusion. 
### Instructions: 
Take your time to reiterate the clues given in the extracted information. Take a deep breath and follow these IMPORTANT POINTS given below:
1. Keep in mind that in the reasoning chain, only final statement with the final answer is the conclusion. All preceding statements are premises.
2. Please STRIVE to be cover all kinds of validation questions you can pose for a premise/conclusion.
3. Pose validation questions which cover the factuality about laws and their definitions comprehensively.

### Example 1 of creating validation questions:
Provided Premise: 
The first person to land on the Moon was Neil Armstrong. He was born in 1930.

Your validation questions: 
1. Is Neil Armstrong the first person to land on the Moon?
2. Was Neil Armstrong born in 1930?
3. Is Neil Armstrong a person?

### Structure of Your Response:
Suppose if there are 2 premises and 1 conclusion in provided reasoning chain, structure your response in the following way:
Premise 1: <Content of premise 1>
[Validation Questions for Premise 1]

Premise 2: <Content of premise 2>
[Validation Questions for Premise 2]

Conclusion: [No Validation done for the Conclusion]

"""
    # The reasoning chain is the only variable part of the user message.
    user_message = f"""Provided premises and conclusion:
{llm_rc}
"""
    return get_text_completion_from_GPT(system_message, user_message)

def fact_checker(rc_with_validation_questions, legal_context_question_options):
    """Have the model answer the validation questions against the background information.

    Args:
        rc_with_validation_questions: Premises with their validation questions; sent
            as the user message.
        legal_context_question_options: Text placed in the system prompt as the
            background information the answers must rely on.

    Returns:
        A tuple (raw fact-check output, summary produced by error_summarizer).
    """
    system_message = f"""You are an expert in fact-checking the premises and the conclusion comprising a reasoning chain. Your task is to create validation questions to check the factuality of sentences present in the premises and the conclusion.
Your subsequent task is to answer the created validation questions following the premises and conclusion by STRICTLY referring to the provided background information. A premise/conclusion is FALSE if any of the answers of its validation questions is different from what is mentioned in it.   

### Background Information:
{legal_context_question_options}

### IMPORTANT Instructions:
1. Strictly adhere to the facts in the Background Information. Do NOT assume any fact outside of it.
2. Answer the validation questions by strictly verifying the information from the Background Information, NOT the premises.
3. DO NOT automatically assume the content of the premises or the conclusion to be TRUE. You are fact-checking them with the help of the validation questions.

### Structure of Your Response:
Suppose if there are 2 premises and 1 conclusion in provided reasoning chain, structure your response in the following sequential way:
DO NOT WRITE DOWN THE PREMISES!!!
[Validation Questions for Premise 1 along their answers sequentially]
[Citing evidence from Background Information to support the answer]
[Verdict on the factuality of Premise 1]

[Validation Questions for Premise 2 along their answers sequentially, citing evidence from Background Information]
[Citing evidence from Background Information to support the answer]
[Verdict on the factuality of Premise 2]

[No Validation performed on Conclusion]
"""
    user_message = f"{rc_with_validation_questions}"
    verdicts = get_text_completion_from_GPT(system_message, user_message)
    # error_summarizer is defined outside this section of the notebook.
    return verdicts, error_summarizer(verdicts)

### Error Detector 4: Find Irrelevant Premises

In [11]:
# Knowledge base inserted into the irrelevant-premise checker's system prompt: defines an
# irrelevant premise and gives examples. It has no placeholders, so a plain (non-f) string
# yields the same text.
irr_knowledge_base = """
This Knowledge Base contains the the defintion of 'Irrelevant premise(s)' generated by Large Language Model(LLM) based Reasoners while generating reasoning chains.

Irrelevant Premise:
The concept of an "Irrelevant Premise" refers to a situation that introduces information or assumptions that are not pertinent to the task at hand. This can disrupt the logical flow of reasoning, leading to answers that may be less accurate, off-topic, or misleading. It's crucial in both natural language processing and logical reasoning to maintain relevance to ensure that the conclusions or answers are directly tied to the question or problem being addressed.

An irrelevant premise can occur In the following ways:

1. Distraction from Core Issue: The information introduced may divert attention from the core issue or question, potentially leading to a solution path that does not address the original query effectively.
2. Redundant Information: The premise may be factually correct but redundant or unnecessary for solving the problem, cluttering the reasoning process without adding value.
3. False Leads: In some cases, an irrelevant premise can lead to the development of arguments based on assumptions or data that do not presently apply to the situation, resulting in incorrect conclusions.

Examples
Question: What are the effects of global warming on polar bear populations?
Irrelevant Premise: While discussing the impact on polar bears, the LLM begins detailing the history of solar panel technology. This information, although related to the broader subject of environmental issues, does not directly contribute to answering the specific question about polar bears and global warming.

Question: How can we improve the efficiency of solar panels?
Irrelevant Premise: The LLM starts explaining the basics of how fossil fuels are detrimental to the environment. Although this introduces a context for why solar panels are beneficial, it does not directly address the question of improving solar panel efficiency.

Question: Who won the 2020 U.S. presidential election?
Irrelevant Premise: Instead of directly naming the winner, the LLM provides a lengthy explanation of the U.S. Electoral College system. This information, while related to U.S. elections, does not answer the specific question about the 2020 election winner.
"""
def irrelevant_premise_checker(row, legal_context_question_options, llm_rc):
    """Label each premise of an LLM reasoning chain as relevant or irrelevant.

    Builds a system prompt (task description, the module-level
    ``irr_knowledge_base`` text, response format and one annotated example),
    sends it together with the scenario and both reasoning chains to
    ``get_text_completion_from_GPT`` and then summarises the reply with
    ``error_summarizer``.

    Args:
        row: Mapping-like record; ``row['Analysis']`` is used as the
            gold-standard reasoning chain ("Reasoning Chain 1").
        legal_context_question_options: Scenario text (context, question and
            options). It may or may not already start with "Legal Context:\n".
        llm_rc: The reasoning chain to be checked ("Reasoning Chain 2").

    Returns:
        A ``(relevance_check, relevance_check_summary)`` tuple: the raw model
        reply and its summary produced by ``error_summarizer``.
    """
    # NOTE: the labels and field names used in the annotated example must stay
    # identical to the ones declared under "Structure of Response"; the model
    # tends to copy the example verbatim.
    irrelevant_premise_system_prompt = f"""You are an expert in identifying errors in provided reasoning chains. Specifically, these reasoning chains are obtained from a legal MCQA task. From the reasoning chain, you have to classify each reasoning step with the presence/absence of an Irrelevant Premise in the step. If you do not find any Irrelevant Premise in reasoning step, assign "RELEVANT PREMISE" label to it. To understand some type irrelevance errors which can occur, please refer to provided knowledge base. While finding errors in provided reasoning chains, you can refer to gold-standard reasoning chain for better understanding.

### Knowledge Base of the Errors:
{irr_knowledge_base}

### Trigger:
The user submits a legal context, question and options and reasoning chains 1 and 2. Reasoning Chain 1 is the gold-standard answer. Reasoning Chain 2 is the reasoning chain provided by a LLM to solve the question.

### Instructions:
Take your time to reiterate the clues given in the legal scenario. Take a deep breath and follow these IMPORTANT POINTS given below:
1. Keep in mind that in the reasoning chain, only final statement with the final answer is the conclusion. All preceding statements are premises.
2. You have to find the errors ONLY in the premises, NOT in the conclusion.
3. Take your time to go through ALL the type of irrelevant premises given in the knowledge base above. Assign the explanation which matches the most with one of them. If none match, categorize it as OTHER.
4. Carefully go through the legal context and gold standard reasoning chain before deciding whether some premise is an irrelevant premise or not.

### Structure of Response:
Suppose if there are 2 premises and 1 conclusion in provided reasoning chain, structure your response in the following way:
Premise 1: <Content of premise 1>
Relevance Check: [IRRELEVANT PREMISE ERROR/RELEVANT PREMISE]
Explanation: <Justify the relevance/irrelevance of the premise pertaining to the question>

Premise 2: <Content of premise 2>
Relevance Check: [IRRELEVANT PREMISE ERROR/RELEVANT PREMISE]
Explanation: <Justify the relevance/irrelevance of the premise pertaining to the question>

Conclusion: [Not checked for Irrelevant Premises]
Since Conclusion is not a premise, it is not checked for relevance.

### Annotative Examples:

Example 1:

Legal Context:
Congress has provided that a corporation is a citizen of the state in which it is incorporated and the state of its principal place of business. 28 U.S.C. §1332(c)(1). So, to apply the  Strawbridge  complete diversity rule to corporate parties to a diversity case, courts have to determine where a corporation’s principal place of business is. Corporations conduct their business in many different ways. Most corporations are small, and are incorporated in one state and do all their business in that state. These corporations are citizens of only one state. But large corporations might have extensive activities in many states—think Home Depot, or McDonalds. And some corporations will incorporate in one state, concentrate their manufacturing activities in another, and maintain their headquarters, where the corporation’s high-level decisions are made, in a third. Until 2010, federal courts took several approaches to determining a corporation’s principal place of business under 28 U.S.C. §1332(c)(1). Some held that the state of a corporation’s headquarters is the state of its principal place of business, even if the corporation’s manufacturing or service activities took place elsewhere. Other courts, however, applied a ‘‘daily activities’’ test if a corporation’s productive activities were primarily concentrated in a single state, reasoning that the corporation will be perceived as ‘‘local’’ in that state, since it will have the most contact with the public there. If the productive activities were widely spread among states, these courts would look to the state where the corporation has its headquarters or ‘‘nerve center,’’ since the diffused activities would all be coordinated from that state. All of this is now interesting history, because in 2010 the Supreme Court held, in  Hertz Corp. v. Friend,  559 U.S. 77 (2010), that the corporation’s principal place of business for diversity purposes is the state of its headquarters in all cases. 
After  Hertz , a corporation is citizen of the state in which its headquarters are located, even if all of its actual manufacturing or service activities take place elsewhere. The Court emphasized the importance of having a relatively clear rule that will avoid uncertainty and litigation about whether diversity jurisdiction is met in cases involving corporate parties.

Question:
Angus and Phillips, from Texas, bring a breach of contract action against Apex Corporation in the federal district court for the Western District of Texas. Apex is a corporation that manufactures lawnmowers. It is incorporated in Delaware. It assembles the mowers at its plant in El Paso, Texas, which employs five hundred employees. It has another factory in Tennessee, which manufactures handles for the mowers and employs twenty-five employees. Its corporate headquarters occupy a small suite of offices on the twelfth floor of an office building in Tulsa, Oklahoma. Fifteen officers and employees work out of the headquarters office. The court will find that there

Options:
Options: A. is complete diversity and proceed with the case.
B.  is not complete diversity because the case is brought in a Texas court, and the plaintiffs are both Texas citizens.
C.  is not complete diversity because both plaintiffs are from Texas.
D.  is not complete diversity between the plaintiffs and Apex. Under 28 U.S.C. §1332(c)(1) Texas is Apex’s principal place of business, because Texas citizens in El Paso will think of Apex as ‘‘local,’’ while hardly anyone in Oklahoma will know anything about Apex Corporation, which has very low visibility in Oklahoma.
E.  is diversity between the plaintiffs and Apex, because Apex is a citizen of Delaware based on its incorporation there.

Reasoning Chain Analysis:
Premise 1: The legal context explains that to determine a corporation's principal place of business for diversity jurisdiction purposes, courts used to apply different tests, such as the state of headquarters or the state of "daily activities."
Relevance Check: IRRELEVANT PREMISE ERROR
Explanation: This information is correct but not relevant for answering the quesiton as these are methods that were used in the past and are not useful for a modern case.

Premise 2: However, in Hertz Corp. v. Friend (2010), the Supreme Court held that a corporation's principal place of business is its state of headquarters in all cases.
Relevance Check: RELEVANT PREMISE
Explanation: This correctly restates information from the legal context that is relevant to answering the question.

Premise 3: Based on the provided information, Apex Corporation's headquarters are located in Tulsa, Oklahoma.
Relevance Check: RELEVANT PREMISE
Explanation: This premise is correct and relevant to answering the question.

Premise 4: Therefore, under the current legal rule established by Hertz, Apex Corporation is a citizen of Oklahoma.
Relevance Check: RELEVANT PREMISE
Explanation: This premise correctly states that Apex Corporation is a citizen of Oklahoma according to the legal context. Apex Corporation is also a citizen of Delaware due to being incorporated there. This is mentioned in the gold standard reasoning chain: “A corporation is a citizen of  both  its state of incorporation and the state of its principal place of business.” Despite being incomplete, this premise is still relevant to answering the question.

Conclusion: [Not checked for Irrelevant Premises]
D. is not diversity between the plaintiffs and Apex.
"""
    # Callers may already supply the scenario with its "Legal Context:" header
    # (the driver loop does); only add the header when it is missing so the
    # user prompt never carries it twice.
    context_header = "Legal Context:\n"
    if legal_context_question_options.startswith(context_header):
        input_prompt = legal_context_question_options
    else:
        input_prompt = context_header + legal_context_question_options
    input_prompt = input_prompt + f"\nReasoning Chain 1:\n{row['Analysis']}\nReasoning Chain 2:\n{llm_rc}"
    relevance_check = get_text_completion_from_GPT(irrelevant_premise_system_prompt, input_prompt)
    relevance_check_summary = error_summarizer(relevance_check)
    return relevance_check, relevance_check_summary

### Error Summarization and Aggregation

In [12]:
def error_summarizer(list_of_errors):
    """Condense one detector's analysis into a point-wise summary of errors.

    Args:
        list_of_errors: Text holding the analysed premises/conclusion; it is
            interpolated verbatim into the system prompt.

    Returns:
        The reply of ``get_text_completion_from_GPT`` for that prompt.
    """
    summarizer_instructions = f"""You are an expert in creating a summary of errors present in a list of analyzed premises/conclusion. The summary should contain the description of the errors present in premises/conclusions.

### IMPORTANT INSTRUCTIONS:
1. List all errors in the premises/conclusion in a summarized manner.
2. Do not summarize a premise/conclusion if it does not have an error.

### List of Errors:
{list_of_errors}

### Your summarization:
[Summarization of errors for each premise or conclusion in point-wise format]
"""
    # The material to summarise is already part of the system prompt, so the
    # user message is sent empty.
    return get_text_completion_from_GPT(summarizer_instructions, "")

def error_aggegrator(simple_error_check_summary, factuality_check_summary, misinterpretation_check_summary, relevance_check_summary):
    """Merge the four detector summaries into one unified summary of errors.

    Args:
        simple_error_check_summary: Summary shown to the model as "Error Detector 1".
        factuality_check_summary: Summary shown as "Error Detector 3".
        misinterpretation_check_summary: Summary shown as "Error Detector 2".
        relevance_check_summary: Summary shown as "Error Detector 4".

    Returns:
        The reply of ``get_text_completion_from_GPT`` for the combined prompt.
    """
    # Note the section order inside the prompt: misinterpretation is listed
    # before factuality, although the parameters arrive the other way round.
    aggregation_instructions = f"""You are an expert in combining summaries into a single unified summary. Each summary contains description of errors present in premises/conclusions. Your task is to combine all them into one unified summary of errors. If there are premises/conclusions which overlap in multiple summaries, please write them in a single statement point by combining them.

### IMPORTANT INSTRUCTIONS:
1. List all errors in the premises/conclusion in a summarized manner.
2. DO NOT summarize a premise/conclusion if it does not have an error.

### Summary of Error Detector 1:
{simple_error_check_summary}

### Summary of Error Detector 2:
{misinterpretation_check_summary}

### Summary of Error Detector 3:
{factuality_check_summary}

### Summary of Error Detector 4:
{relevance_check_summary}

### Your combined summarization:
"""
    # All inputs live in the system prompt; the user message stays empty.
    return get_text_completion_from_GPT(aggregation_instructions, "")

## Conclusion-Level

### Correct Option Extraction

In [13]:
def correct_option_extractor(legal_context_question_options, gold_standard_answer):
    """Ask the model which option the expert's answer selects.

    Args:
        legal_context_question_options: Scenario text (context, question and
            options), interpolated verbatim into the system prompt.
        gold_standard_answer: The expert answer from which the chosen option
            is to be extracted.

    Returns:
        The reply of ``get_text_completion_from_GPT``; the prompt requests the
        chosen option verbatim.
    """
    option_extraction_instructions = f"""You are an expert in extracting the final option which is the correct answer to the given legal scenario below. You do not have to reason and answer the question by yourself. You are provided with expert answer to the given legal scenario and you have to extract which option the expert has chosen. You have to write down the exact option verbatim from the given legal scenario.

### The Legal Scenario with Context, Question and Options:
{legal_context_question_options}

### The Expert Answer to the Legal Scenario:
{gold_standard_answer}

### Structure of Response:
[The option chosen by the expert verbatim as exactly enumerated in the given options]
"""
    # Scenario and expert answer are embedded above; the user message is empty.
    return get_text_completion_from_GPT(option_extraction_instructions, "")

### Reasoning Chain Conclusion Extraction and Conclusion Level Error Classification

In [14]:
def conclusion_extractor(llm_rc):
    """Ask the model to pull the conclusion and final option out of a reasoning chain.

    Args:
        llm_rc: The reasoning chain text; it is interpolated verbatim into the
            system prompt.

    Returns:
        The reply of ``get_text_completion_from_GPT`` for that prompt.
    """
    extraction_instructions = f"""You are an expert in extracting the conclusion of the reasoning chain provided to you. The reasoning chain consists of a set of statements (premises) and a conclusion which chooses a final option as the answer chosen for a legal scenario. The conclusion may contain the final option within itself or consequently just after it. You have to diligently extract the conclusion and the final chosen option from the reasoning-chain.  

### The provided Reasoning-Chain:
{llm_rc}

### IMPORTANT INSTRUCTIONS:
1. Remember, the conclusion is usually the last step/statement in the reasoning chain. It can be the penutimate step as well.
2. The extracted Conclusion MUST contain the final option chosen as the answer in the reasoning chain.  

### Structure of Response:
[The Conclusion along with the final option chosen as the answer]
"""
    # The chain is embedded in the system prompt; the user message is empty.
    return get_text_completion_from_GPT(extraction_instructions, "")

def conclusion_error_checking(error_aggegrator_results, conclusion_extracted, expert_option_chosen):
    """Ask the model to assign a conclusion-level error category.

    The prompt lists five categories (HALLUCINATION, CORRECT CONCLUSION,
    WRONG CONCLUSION FROM INCOMPLETE PREMISES, WRONG CONCLUSION FROM FALSE
    PREMISES, CORRECT CONCLUSION FROM FALSE PREMISES) and the conditions under
    which each applies.

    Args:
        error_aggegrator_results: Summary of the premise-level errors.
        conclusion_extracted: Conclusion (with final option) taken from the
            reasoning chain under evaluation.
        expert_option_chosen: The ground-truth option.

    Returns:
        The reply of ``get_text_completion_from_GPT`` for that prompt.
    """
    classification_instructions = f"""You are an expert error classifier. You are provided with the summary of errors in a reasoning chain at the premise level, the extracted conclusion (with the final-option as the answer to a legal scenario) and the actual ground truth option chosen by an expert. You task is to assign an error (if any) to the extracted conclusion based on the following conditions:
1. If the option chosen in the extracted conclusion and ground-truth option match, but their content does not match: HALLUCINATION
2. If there are no errors in the premises and the options in the extracted conclusion and ground-truth option match: CORRECT CONCLUSION
3. If there are no errors in the premises but the options in the extracted conclusion and ground-truth option do not match: WRONG CONCLUSION FROM INCOMPLETE PREMISES
4. If there are errors in the premises and the options in the extracted conclusion and ground-truth option do not match: WRONG CONCLUSION FROM FALSE PREMISES
5. If there are errors in the premises but the options in the extracted conclusion and ground-truth option do match: CORRECT CONCLUSION FROM FALSE PREMISES

# VERY IMPORTANT: If the options numbers match (for example both predicted and ground-truth conclusions are both 'A') AND If the content of the option chosen in the extracted conclusion and ground-truth option DO NOT match: on the highest priority assign it as HALLUCINATION. 

The Summary of Errors in the Premises:
{error_aggegrator_results}

The Extracted Conclusion (to be checked against the ground truth option):
{conclusion_extracted}

The Ground Truth Option:
{expert_option_chosen}
"""
    # All three inputs are embedded above; the user message is empty.
    return get_text_completion_from_GPT(classification_instructions, "")

#### Generating Output Results for any LLM 'X' (its reasoning chains are read from the `Response(X)` column)

In [15]:
# Runs the whole error-analysis pipeline for every row of `df` and stores the
# textual reports in two columns of the results CSV.
output_file = "" # Output Path where to store the results
llm_response_column = 'Response(X)'  # Column of `df` holding LLM X's reasoning chains

# Fail before any API call is made if the output path was left unset.
if not output_file:
    raise ValueError("Set `output_file` to the CSV path where the results should be stored.")

# Continue from an existing results file when there is one; otherwise start
# from a copy of the input dataframe so that row labels line up with `df`.
output_df = pd.read_csv(output_file) if os.path.exists(output_file) else df.copy()


def _print_section_header(title):
    """Print a blank line followed by `title` framed by two rule lines."""
    print("")
    print("===================")
    print(title)
    print("===================")


### Data-Processing
for idx, row in df.iterrows():
    # Extract legal_context
    legal_context_question_options = f"Legal Context:\n{row['Context']}\n\nQuestion:\n{row['Question']}\nOptions:\n{row['Options']}"
    # Get LLM X's Reasoning Chain
    llm_rc = row[llm_response_column]
    print(f"The LLM Reasoning Chain is:\n {llm_rc}\n")
    # Get the expert author's answer
    gold_standard_answer = row['Analysis']
    print(f"FOR SAMPLE NUMBER: {idx+1}")
    print("VALIDATION QUESTIONS GENERATION FOR PREMISES AND CONCLUSION:")
    # Validation Questions Generation
    rc_with_validation_questions = validation_question_generation(llm_rc)
    print(rc_with_validation_questions)

    # Each header is printed before its (slow) model call so progress is visible.
    _print_section_header("ERROR-DETECTOR 1 (ERROR CHECKER):")
    simple_error_check_results, simple_error_check_results_summary = simple_err_checker(row, legal_context_question_options, llm_rc)
    print(simple_error_check_results_summary)

    _print_section_header("ERROR-DETECTOR 2 (MISINTERPRETATION ERRORS):")
    # Misinterpretation-Checking
    misinterpretation_check_results, misinterpretation_check_results_summary = misinterpretation_checker(row, legal_context_question_options, llm_rc)
    print(misinterpretation_check_results_summary)

    _print_section_header("ERROR-DETECTOR 3 (FACT-CHECKING):")
    # Fact-Checking
    fact_check_results, fact_check_results_summary = fact_checker(rc_with_validation_questions, legal_context_question_options)
    print(fact_check_results_summary)

    _print_section_header("ERROR-DETECTOR 4 (RELEVANCE-CHECKING):")
    # Relevance-Checking
    relevance_check_results, relevance_check_results_summary = irrelevant_premise_checker(row, legal_context_question_options, llm_rc)
    print(relevance_check_results_summary)

    _print_section_header("ERROR AGGREGATION AT PREMISE LEVEL:")
    # Error-Aggregation at Premise level
    error_aggegrator_results = error_aggegrator(simple_error_check_results_summary, fact_check_results_summary, misinterpretation_check_results_summary, relevance_check_results_summary)
    print(error_aggegrator_results)

    _print_section_header("FINAL OPTION CHOSEN BY THE EXPERT:")
    # Final Correct Option as per the expert
    expert_option_chosen = correct_option_extractor(legal_context_question_options, gold_standard_answer)
    print(expert_option_chosen)

    _print_section_header("CONCLUSION EXTRACTED FROM THE REASONING CHAIN:")
    # Conclusion Extracted from the reasoning chain. The extractor expects the
    # reasoning-chain text itself, not the whole dataframe row.
    conclusion_extracted = conclusion_extractor(llm_rc)
    print(conclusion_extracted)

    _print_section_header("CONCLUSION ERROR CLASSIFICATION:")
    # Error assigned at the conclusion level
    conclusion_error_check = conclusion_error_checking(error_aggegrator_results, conclusion_extracted, expert_option_chosen)
    print(conclusion_error_check)
    print("="*124)

    consolidated_text = f"""FOR SAMPLE NUMBER: {idx+1}:
VALIDATION QUESTIONS GENERATION FOR PREMISES AND CONCLUSION:
{rc_with_validation_questions}

ERROR-DETECTOR 1 (ERROR CHECKER):
{simple_error_check_results_summary}

ERROR-DETECTOR 2 (MISINTERPRETATION ERRORS):
{misinterpretation_check_results_summary}

ERROR-DETECTOR 3 (FACT-CHECKING):
{fact_check_results_summary}

ERROR-DETECTOR 4 (RELEVANCE-CHECKING):
{relevance_check_results_summary}

ERROR AGGREGATION AT PREMISE LEVEL:
{error_aggegrator_results}

FINAL OPTION CHOSEN BY THE EXPERT:
{expert_option_chosen}

CONCLUSION EXTRACTED FROM THE REASONING CHAIN:
{conclusion_extracted}

ERROR AT CONCLUSION LEVEL:
{conclusion_error_check}
"""
    errors_prem_concl = f"""ERROR AGGREGATION AT PREMISE LEVEL:
{error_aggegrator_results}

ERROR AT CONCLUSION LEVEL:
{conclusion_error_check}
"""
    output_df.at[idx,'Consolidated Error Analysis'] = consolidated_text
    output_df.at[idx,'Premise and Conclusion Errors'] = errors_prem_concl
    # Persist after every sample so an interrupted run keeps the finished rows;
    # one write per iteration is enough once both columns are filled.
    output_df.to_csv(output_file, index=False)