In [4]:
import sys
# Extend the module search path so the `openai` import in the next cell can be resolved.
# The empty string below is a placeholder: replace it with the directory that contains the library.
sys.path.append('') # Append openai library path to sys.path
# print(sys.path)

In [5]:
import openai
from openai import OpenAI

In [6]:
import os
import dotenv
from dotenv import load_dotenv
import time
import openai

# Load environment variables from the .env file (the empty dotenv_path is a placeholder to fill in)
load_dotenv(dotenv_path='') # Add the path to the 'env' containing the relevant OPENAI API key here, preferably to use GPT-4o
# Read the key from the environment; this raises KeyError if OPENAI_API_KEY is not set
api_key = os.environ["OPENAI_API_KEY"]
# Shared client object; the chat-completion helper defined later in this notebook calls it
client = OpenAI(api_key=api_key)

### Loading the 'Civ. Pro.' Dataset
The dataset should contain the following columns: (1) the legal context ('Context' in the dataframe 'df'), (2) the question ('Question'), (3) the options ('Options'), (4) the expert answers ('Analysis'), and (5) the LLM-generated responses to be evaluated for errors (in our case named 'Response(LLM X)' for each LLM X).

In [7]:
import pandas as pd
# Placeholder: set this to the CSV file that holds the 'Civ. Pro' dataset before running
input_file = r"" # Add the csv containing the 'Civ. Pro' Dataset
# `df` is iterated later in the notebook to read each row's 'Response(LLM X)' column
df = pd.read_csv(input_file)
# df

In [8]:
import requests
# OpenAI API call handling for timeout issues
def get_text_completion_from_GPT(system_prompt, prompt, model="gpt-4o", max_retries=3, retry_delay=2):
    """Send a system + user message pair to the chat-completions API and return the reply text.

    The request goes through the module-level ``client``. Transient transport
    failures are retried up to ``max_retries`` times in total.

    Args:
        system_prompt: Content of the ``system`` message.
        prompt: Content of the ``user`` message.
        model: Name of the chat model to query.
        max_retries: Total number of attempts before giving up (must be >= 1).
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The ``content`` of the first choice in the API response.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: The last transient error, once every attempt has failed.
            Any non-transient error propagates immediately.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # The v1 `OpenAI` client reports transport problems through its own exception
    # classes rather than `requests` ones; the `requests` base class is kept so that
    # everything the previous handler caught is still caught.
    retryable_errors = (
        openai.APITimeoutError,
        openai.APIConnectionError,
        openai.RateLimitError,
        requests.exceptions.RequestException,
    )

    # The payload is identical for every attempt, so build it once.
    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]

    # Python unbinds the `except ... as` name when the clause ends, so the error
    # must be stored explicitly to be re-raised after the loop.
    last_error = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,  # Degree of randomness of the model's output
                frequency_penalty=0,
                presence_penalty=0,
                seed=42
            )
            return response.choices[0].message.content
        except retryable_errors as err:
            last_error = err
            # No point in waiting once the final attempt has failed.
            if attempt < max_retries - 1:
                time.sleep(retry_delay)
    # If all attempts fail, raise the last exception
    raise last_error

# Metric Calculations

## Step-Counter

#### Counting the steps in reasoning chains for LLM 'X'

In [9]:
def step_counter(llm_rc):
    """Ask the chat model to count the steps of one reasoning chain.

    The reasoning chain is sent as the user message together with a fixed
    few-shot system prompt that requests a reply in the key-value form
    ``num_of_premises: ...`` / ``num_of_steps: ...``.

    Args:
        llm_rc: The reasoning chain to analyse; it is formatted into a string
            before being sent.

    Returns:
        Whatever ``get_text_completion_from_GPT`` returns for this request
        (the raw reply text; it is not parsed here).
    """
    # NOTE(review): instruction 2 below says "single options" although the second
    # worked example counts each option analysis as a step - presumably a typo.
    # The prompt text is left untouched so the model input stays identical.
    system_prompt = """You are an expert in counting the number of steps in a reasoning chain generated as an explanation to legal MCQ. You are provided with a reasoning chain which consists of premises and a conclusion.
Your task is to count the number of steps by counting all the premises and the conclusion.     

### Instructions:
1. Number of Steps = Number of Premises + Conclusion.
2. If a premise contains analysis of individual options, then count each of those anlayses as single options.
3. The final conclusion along with the subsequent choosing of an option is considered as a single step.

Provide the output in the following key-value format:
num_of_premises:
num_of_steps:

Example 1 of counting number of steps in a reasoning chain:
Reasoning Chain:
[1] The question asks about Marla's domicile based on her current residence in Denver, Colorado, and her intentions regarding her stay there.
[2] According to the legal context, a person's domicile is determined by the state where they reside with the intent to remain indefinitely.
[3] Marla moved to Denver, Colorado, to attend a two-year hair stylist program. She was unsure about her career choice and had plans to leave the program if she didn't like it. She also mentioned that she might look for work in Denver or other western states, including Montana.
[4] Marla's lease in Denver is for six months, which indicates that she does not have an open-ended intention to remain in Colorado indefinitely.
[5] The fact that Marla was domiciled in Montana before moving to Denver does not automatically make her domiciled in Montana again. Domicile is determined by the present intent to remain indefinitely in a state, not by past domicile.
[6] Based on the information provided, Marla does not meet the requirement of residing in Colorado with the intent to remain indefinitely.
[Final Answer] A. remains domiciled in Montana.

YOUR RESPONSE:
num_of_premises: 5 
num_of_steps: 6

Example 2 of counting number of steps in a reasoning chain:
1. **Understanding Rule 26(a)(1) Requirements**: Rule 26(a)(1) mandates the disclosure of information that a party may use to support its claims or defenses. This includes documents and other evidentiary material that are relevant to the case.
2. **Relevance of the Letter**: The letter from an individual claiming copyright over the story is directly relevant to Fremont Publishing Company's claim of copyright in the lawsuit against New Era Press. The letter challenges Fremont's assertion of copyright ownership, which is central to the case.
3. **Obligation to Disclose**: Under Rule 26(a)(1), Fremont should have disclosed the letter as it pertains to the computation of damages and the establishment of a claim (copyright ownership in this case). The letter is a piece of evidence that Fremont might use to defend against the claim of copyright if it impacts the validity of their ownership claim.
4. **Rule 26(e)(1) - Duty to Supplement**: This rule requires a party to supplement or correct its disclosure upon learning that the initial disclosure was either incomplete or incorrect. Fremont's failure to initially disclose the letter necessitates a need for supplemental disclosure.
5. **Analysis of Options**:
   - **Option A (Sanctions under Rule 37(c)(1))**: This option is typically considered when a party fails to disclose information that should have been disclosed under Rule 26(a). However, sanctions are generally applied for more egregious or harmful nondisclosures.
   - **Option B (Supplemental Disclosure)**: This is the most fitting response as it directly addresses the need for Fremont to correct its oversight or intentional nondisclosure by providing the letter.
   - **Option C (Rule 34 Request for Production)**: While this could theoretically obtain the letter, it is not the most direct or appropriate method given that the issue here is Fremont's failure to comply with mandatory disclosure rules.
   - **Option D (Cannot obtain the letter)**: This is incorrect because the letter is clearly relevant and should have been disclosed under Rule 26(a).

[Final Answer: Option B should move to require supplemental disclosure of the letter, under Rules 26(a)(1) and 26(e)(1). This option directly addresses the failure to disclose relevant information and seeks to rectify this by enforcing the rules designed to ensure fairness and transparency in the discovery process.]

YOUR RESPONSE:
num_of_premises: 8 <Since each option analysis is counted as one step>
num_of_steps: 9
"""
    # The f-string formats the input exactly as before, so non-string values are stringified.
    user_message = f"{llm_rc}"
    return get_text_completion_from_GPT(system_prompt, user_message)

### Loading and saving data in an output file 'LLM X'
Provide the path to the output file for 'LLM X' (substitute the actual name of the LLM for 'LLM X'); the results computed in this notebook are stored in that file.

In [13]:
import re
output_path = r'' # Output Path where to store the Metrics computation results done in this notebook for LLM X
llm_x_output_file = output_path
if not os.path.exists(llm_x_output_file):
    # Create an empty file if it doesn't exist
    pd.DataFrame().to_csv(llm_x_output_file)

llm_x_output_df = pd.read_csv(llm_x_output_file)
for idx, row in df.iterrows():
    # Get LLM X's Reasoning Chain
    llm_rc = row['Response(LLM X)'] # Please substitute 'LLM X' inside 'Response(LLM X)' with the appropriate LLM name
    print(f"FOR SAMPLE NUMBER: {idx+1}")
    print(f"The LLM Reasoning Chain is:\n {llm_rc}\n")
    # Calculate the number of steps in each reasoning chain
    counter_output = step_counter(llm_rc)
    print(counter_output)
    print("="*124)
    # Using regular expressions to extract the numbers.
    # The reply is free text from the model, so it may lack the expected
    # 'num_of_steps: <int>' line; in that case record NaN for this sample
    # instead of aborting the whole (paid) loop with an AttributeError.
    steps_match = re.search(r'num_of_steps:\s*(\d+)', str(counter_output))
    if steps_match is None:
        print(f"WARNING: could not parse 'num_of_steps' for sample number {idx+1}; storing NaN.")
        num_of_steps = float('nan')
    else:
        num_of_steps = int(steps_match.group(1))
    llm_x_output_df.at[idx,'Counter Output'] = counter_output
    llm_x_output_df.at[idx,'Number of Steps'] = num_of_steps
    # Written after every sample so that completed API calls survive an interruption
    llm_x_output_df.to_csv(llm_x_output_file, index=False)

# llm_x_output_df and llm_x_output_file will be used in the subsequent cells as well.

In [14]:
# Function to split 'Premise and Conclusion Errors' column into 'Premise Errors' and 'Conclusion Errors'
def extract_errors_split(text):
    """Split an auto-evaluator report into its premise-level and conclusion-level parts.

    The report is expected to contain the headings
    'ERROR AGGREGATION AT PREMISE LEVEL:' and 'ERROR AT CONCLUSION LEVEL:'.

    Args:
        text: The report text. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for an empty CSV cell) are treated as a missing report.

    Returns:
        A ``(premise_level_errors, conclusion_level_errors)`` tuple of stripped
        strings. An element is ``None`` when its heading does not occur in
        ``text`` or when ``text`` is not a string.
    """
    premise_marker = 'ERROR AGGREGATION AT PREMISE LEVEL:'
    conclusion_marker = 'ERROR AT CONCLUSION LEVEL:'

    # Missing cells have no `.split`; report both parts as absent instead of crashing.
    if not isinstance(text, str):
        return None, None

    premise_level_errors = None
    premise_split = text.split(premise_marker)
    if len(premise_split) > 1:
        # Keep only what lies between the premise heading and the conclusion heading.
        premise_level_errors = premise_split[1].split(conclusion_marker)[0].strip()

    conclusion_level_errors = None
    conclusion_split = text.split(conclusion_marker)
    if len(conclusion_split) > 1:
        conclusion_level_errors = conclusion_split[1].strip()

    return premise_level_errors, conclusion_level_errors

### Loading data from auto-eval output file for 'LLM X'
Load the file for LLM X that contains the column 'Premise and Conclusion Errors', as produced by the 'Legal-Reasoning-Auto-Evaluator-Pipeline' notebook.

In [31]:
# Load the file for LLM X which contains the column: 'Premise and Conclusion Errors' as processed in 'Legal-Reasoning-Auto-Evaluator-Pipeline' notebook
import pandas as pd
llm_x_input_file = r"" # Add the file which contains the above column for any LLM X
llm_x_input_df = pd.read_csv(llm_x_input_file)
# extract_errors_split returns a (premise, conclusion) pair for each row; wrapping it in pd.Series
# makes apply() produce two columns, which are stored as the two new output columns.
# NOTE(review): this presumably relies on both dataframes having matching row indices - confirm.
llm_x_output_df[['Premise Errors', 'Conclusion Errors']] = llm_x_input_df['Premise and Conclusion Errors'].apply(lambda x: pd.Series(extract_errors_split(x)))
# Persist the two new columns next to the results already stored in the output file
llm_x_output_df.to_csv(llm_x_output_file, index=False)

## Premise-Error Label-Extractor

In [11]:
def label_extractor(prem_errors):
    """Ask the chat model to label each premise-level error of one sample.

    The premise-error summary is sent as the user message together with a fixed
    few-shot system prompt that asks for one line per premise, e.g.
    ``Premise 3: MISINTERPRETATION``.

    Args:
        prem_errors: The premise-level error summary. ``None``, NaN and blank
            strings are treated as "no summary available".

    Returns:
        The raw reply text from ``get_text_completion_from_GPT``, or an empty
        string (without calling the API) when no summary is available.
    """
    # A missing summary would otherwise be stringified ("None"/"nan") and sent to
    # the model, spending an API call on meaningless input.
    is_missing = (
        prem_errors is None
        or (isinstance(prem_errors, float) and prem_errors != prem_errors)  # NaN never equals itself
        or (isinstance(prem_errors, str) and not prem_errors.strip())
    )
    if is_missing:
        return ""

    # Changes to the prompt text are limited to two fixes:
    #   * the few-shot answers spelled the label 'MISNTERPRETATION', contradicting the
    #     label 'MISINTERPRETATION' that the prompt itself asks for;
    #   * a mis-encoded apostrophe in Example 2.
    # NOTE(review): instruction 1 ("A premise should .") is truncated in the prompt;
    # its intended wording cannot be recovered from this file, so it is left as is.
    label_extraction_system_prompt = """You are an expert in labelling the errors in the summary of premises and analyses presented to you. You have label them as either 'MISINTERPRETATION', 'FACTUAL HALLUCINATION' OR 'IRRELEVANT PREMISE'. If a premise (and its error explanation) contain mentions of multiple errors then all the appropriate labels to the premise.     

### Instructions:
1. A premise should .
2. If a premise contains analysis of individual options, then count each of those analyses as single steps.
3. For a permise, mention a label only ONCE, i.e. for example, do NOT label 'Premise 1: MISINTERPRETATION, MISINTERPRETATION'
4. If a point starts with the heading of 'Conclusion', do NOT label it.


Example 1 of labelling errors:
Provided Summary of Premise-Level Errors:

1. **Premise 4: Marla's Intent**
   - **Error**: Misinterprets Marla's intent.
   - **Explanation**: The premise incorrectly suggests ambiguity in Marla's intent. According to the legal context, Marla's plans are open-ended, meaning she does not have definite plans to leave Denver. The ambiguity mentioned does not align with the legal context's explanation of "indefinite intent."

YOUR RESPONSE:
Premise 4: MISINTERPRETATION

Example 2 of labelling errors:
Provided Summary of Premise-Level Errors:

1. **Premise 3:**
   - **Error:** Misinterpretation of the legal context.
   - **Description:** Incorrectly states that a case can be brought in federal court if the total amount in controversy exceeds the jurisdictional limit when suing multiple defendants. The legal context specifies that the plaintiff must seek more than $75,000 from each defendant individually, not in total.

2. **Premise 6:**
   - **Error:** Misinterpretation of the amount-in-controversy requirement.
   - **Description:** Incorrectly asserts that the case can be brought in federal court if the total amount in controversy exceeds the jurisdictional limit by aggregating claims from multiple plaintiffs. Each plaintiff must individually meet the $75,000 threshold.

3. **Premise 7:**
   - **Error:** Misinterpretation based on the previous error.
   - **Description:** The case cannot be brought in federal court because neither Larry nor Moe individually meets the $75,000 requirement.

4. **Premise 9:**
   - **Error:** Misinterpretation of the amount-in-controversy requirement.
   - **Description:** Incorrectly includes the counterclaim in the amount-in-controversy calculation. The amount-in-controversy requirement must be assessed based only on the plaintiff's claim, without regard to the value of any counterclaim.

5. **Premise 10:**
   - **Error:** Misinterpretation based on the previous error.
   - **Description:** The case cannot be brought in federal court because Larry's claim alone does not meet the $75,000 requirement.

6. **Premise 15:**
   - **Error:** Misinterpretation of the amount-in-controversy requirement.
   - **Description:** Incorrectly aggregates the claims against Curly and Dr. Moe. Each defendant must individually meet the $75,000 threshold.

7. **Premise 16:**
   - **Error:** Misinterpretation based on the previous error.
   - **Description:** The case cannot be brought in federal court because Dr. Moe's individual liability does not meet the $75,000 requirement.

8. **Option A Analysis:**
   - **Error:** Misinterpretation of the amount-in-controversy requirement.
   - **Description:** Incorrectly aggregates the claims of Larry and Moe. Each plaintiff's claim must individually meet the jurisdictional amount.

9. **Option B Analysis:**
   - **Error:** Misinterpretation of the amount-in-controversy requirement.
   - **Description:** Incorrectly includes the counterclaim in the amount-in-controversy calculation. The jurisdictional amount must be met by the plaintiff's claim alone.

10. **Option D Analysis:**
    - **Error:** Misinterpretation of the amount-in-controversy requirement.
    - **Description:** Incorrectly aggregates the claims against Curly and Dr. Moe. Each claim must individually meet the jurisdictional amount.

YOUR RESPONSE:
Premise 3: MISINTERPRETATION
Premise 6: MISINTERPRETATION
Premise 7: MISINTERPRETATION
Premise 9: MISINTERPRETATION
Premise 10: MISINTERPRETATION
Premise 15: MISINTERPRETATION
Premise 16: MISINTERPRETATION
Option A Analysis: MISINTERPRETATION
Option B Analysis: MISINTERPRETATION
Option D Analysis: MISINTERPRETATION

Example 3 of labelling errors:
Provided Summary of Premise-Level Errors:
1. **Premise 2:**
   - **Error Category:** Factual Inconsistency Hallucination and Misinterpretation
   - **Explanation:** Incorrectly states that all plaintiffs are from Virginia, whereas Gerry is actually from Massachusetts. This misinterpretation leads to an incorrect conclusion about diversity jurisdiction and misinterprets the Strawbridge Rule by suggesting diversity jurisdiction is proper despite the presence of a plaintiff (Gerry) from a different state (Massachusetts).

2. **Premise 3:**
   - **Error Category:** Misinterpretation and Irrelevant Premise
   - **Explanation:** Incorrectly states that Madison and Lafayette are from the same state, which is not true. Madison is from Virginia and Lafayette is from Maryland. Additionally, it misinterprets the legal context regarding corporate citizenship by incorrectly concluding that diversity jurisdiction is not proper. Washington Corporation is a citizen of Maryland, and Madison is from Virginia, making the premise irrelevant and incorrect.

3. **Premise 4:**
   - **Error Category:** Misinterpretation and Irrelevant Premise
   - **Explanation:** Incorrectly concludes that diversity jurisdiction is proper by failing to recognize that Adams Corporation is a citizen of both Delaware and Virginia. Since Madison is from Virginia and Adams Corporation has its principal place of business in Virginia, there is no diversity jurisdiction. This misinterpretation makes the premise irrelevant and incorrect.

YOUR RESPONSE:
Premise 2: FACTUAL HALLUCINATION, MISINTERPRETATION
Premise 3: MISINTERPRETATION, IRRELEVANT PREMISE
Premise 4: MISINTERPRETATION, IRRELEVANT PREMISE

Example 4 of labelling errors:
Provided Summary of Premise-Level Errors:
1. **Premise 1:**
   - **Error Category:** Irrelevant/Misinterpretation
   - **Explanation:** The premise incorrectly discusses Pennoyer v. Neff and the "presence theory," which is outdated and irrelevant to the modern context of minimum contacts jurisdiction. The focus should be on the concepts of foreseeability and purposeful availment.

2. **Premise 2:**
   - **Error Category:** Irrelevant
   - **Explanation:** The background information does not mention International Shoe v. Washington or the "minimum contacts" theory, nor does it explain the expansion of jurisdictional basis through this theory.

3. **Premise 9:**
   - **Error Category:** Misinterpretation
   - **Explanation:** This premise misinterprets the legal context by incorrectly concluding that physical presence in the state is necessary for establishing personal jurisdiction. Boyarin's deliberate contact with Mercy Hospital in Virginia to interfere with a contract is sufficient to establish purposeful availment, making Option B the correct answer.

4. **Conclusion:**
   - **Error Category:** Misinterpretation
   - **Explanation:** The conclusion is based on the misinterpretation in Premise 9, which incorrectly applies the legal principles of personal jurisdiction.

YOUR RESPONSE:
Premise 1: IRRELEVANT PREMISE, MISINTERPRETATION
Premise 2: IRRELEVANT PREMISE
Premise 9: MISINTERPRETATION
"""
    input_prompt = f"""{prem_errors}"""
    extracted_text = get_text_completion_from_GPT(label_extraction_system_prompt, input_prompt)
    return extracted_text

In [None]:
# Label the premise-level errors of every sample and checkpoint the result after each one.
for sample_idx, premise_errors in llm_x_output_df['Premise Errors'].items():
    print(f"FOR SAMPLE NUMBER: {sample_idx+1}")
    print(f"The Premise Errors Summary is:\n {premise_errors}\n")

    # Ask the model for the error labels of this sample's premises
    premise_labels = label_extractor(premise_errors)
    print(premise_labels)
    print("=" * 124)

    # Store the raw labelling reply, then write the whole frame back to disk
    llm_x_output_df.at[sample_idx, 'Premise Error Labels'] = premise_labels
    llm_x_output_df.to_csv(llm_x_output_file, index=False)

In [20]:
# Compute number of premise-level errors and conclusion-level errors for Soundness and Correctness Metric Calculation

# One labelled line per faulty premise, so count the 'Premise ' prefixes in the label text.
# NOTE(review): lines such as 'Option A Analysis: ...' are not counted here although the step
# counter prompt counts option analyses as premises - confirm this is intended.
llm_x_output_df['Number of Premise Errors'] = llm_x_output_df['Premise Error Labels'].apply(lambda x: str(x).count('Premise '))

# Soundness = 1 - (faulty premises / premises), where premises = steps - 1 (the conclusion is one step).
# NOTE(review): a sample with 'Number of Steps' == 1 divides by zero and yields NaN/inf here.
llm_x_output_df['Soundness Score'] = (1 - llm_x_output_df['Number of Premise Errors'] / (llm_x_output_df['Number of Steps'] - 1))

# `na=False` makes rows without a conclusion report score 0; without it `str.contains`
# returns NaN for those rows and `.astype(int)` raises.
# The lookahead excludes 'CORRECT CONCLUSION FROM ...' from the Correctness Score.
llm_x_output_df['Correctness Score'] = llm_x_output_df['Conclusion Errors'].str.contains(r'\bCORRECT CONCLUSION\b(?!\s+FROM)', regex=True, na=False).astype(int)
llm_x_output_df['Correct Conclusion Score'] = llm_x_output_df['Conclusion Errors'].str.contains(r'CORRECT CONCLUSION', na=False).astype(int) # Represents the Accuracy Metric score
llm_x_output_df.to_csv(llm_x_output_file, index=False)

## Conclusion-Error Label-Extractor

In [None]:
# Checking for conclusion error frequencies
# Maps each output column to the pattern searched for in 'Conclusion Errors'.
# NOTE(review): the second column name lacks its closing parenthesis; it is kept
# unchanged because files written by earlier runs already use this exact name.
conclusion_count_patterns = {
    # The lookahead keeps 'CORRECT CONCLUSION FROM FALSE PREMISES' out of the plain count
    'Count(CORRECT CONCLUSION)': r'\bCORRECT CONCLUSION\b(?!\s+FROM)',
    'Count(CORRECT CONCLUSION FROM FALSE PREMISES': r'CORRECT CONCLUSION FROM FALSE PREMISES',
    # Represents the 'Correct Conclusion with Hallucinated Content' error category
    'Count(HALLUCINATION)': r'HALLUCINATION',
    'Count(WRONG CONCLUSION FROM FALSE PREMISES)': r'WRONG CONCLUSION FROM FALSE PREMISES',
    'Count(WRONG CONCLUSION FROM INCOMPLETE PREMISES)': r'WRONG CONCLUSION FROM INCOMPLETE PREMISES',
}
for count_column, pattern in conclusion_count_patterns.items():
    # `na=False` counts rows without a conclusion report as 0; otherwise `str.contains`
    # yields NaN for them and the conversion to int raises.
    llm_x_output_df[count_column] = llm_x_output_df['Conclusion Errors'].str.contains(pattern, regex=True, na=False).astype(int)
llm_x_output_df.to_csv(llm_x_output_file, index=False)
# View the DataFrame to ensure the counts are correct
# (`llm_x_output_file` is only the path string, so `.head()` must be called on the dataframe)
llm_x_output_df.head()