# Import LangChain to use Ollama API

In [1]:
# install package
!pip install langchain
!pip install langchain-ollama




# Load data from the JSON file

In [1]:
import json
import pandas as pd

# Path to the sample JSON file (relative to the notebook's working directory)
file_path = 'legal_reasoning_30_sample.json'

# Read the file as UTF-8 explicitly: the text contains typographic quotes, and
# relying on the platform default encoding can mis-decode them on some systems.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# The records are stored under the top-level "legal_scenarios" key
raw_data = data["legal_scenarios"]

df = pd.DataFrame(raw_data)

# Display the frame to verify the load
df

Unnamed: 0,id,context,question,options,ground_truth
0,1,Under federal diversity jurisdiction a citizen...,"Marla, who grew up in Montana, moved to Colora...","A. remains domiciled in Montana, because the p...",Don’t be fooled by A. Although Marla’s program...
1,2,The most fundamental point about arising-under...,"Consolidated Corporation sues Garces, a former...","A. granted, because the court lacks subject ma...",While it might make sense to allow federal jur...
2,3,The basic standard for removal jurisdiction is...,"Castor Chemical Company, a California company,...","A. The action is properly removed, because at ...","The third choice, C, is particularly illogical..."
3,4,"The early law of personal jurisdiction, repres...","Neff hired Mitchell, a lawyer, in Oregon to su...",A. has personal jurisdiction over Neff. B. doe...,This question makes the basic point about Penn...
4,5,"The previous chapter iterates, and reiterates,...",In which of the following cases could the cour...,A. Johnston brings an action in New Mexico aga...,"In A, jurisdiction is not based on general in ..."
5,6,Let’s start with the role of the United States...,"Perrone, an Acadia resident, sues Margules, a ...",A. The Acadia statute authorizes the court to ...,It shouldn’t be too hard to narrow down the ch...
6,7,The natural inclination of the defendant who t...,"Ito, a stockbroker, lives and works in Massach...",A. hear and decide the jurisdictional objectio...,"By the way, here’s a quick point on exam-takin..."
7,8,The Due Process Clause of the Fourteenth Amend...,In Mullane v. Central Hanover Bank & Trust Co....,A. every person whose interests may be affecte...,A takes the position that everyone who might b...
8,9,Venue in most federal actions is governed by 2...,"Dziezek, who resides in the Southern District ...",A. the Western District of Kentucky. B. the So...,"Let’s see. Under §1391(b)(1), venue is proper ..."
9,10,The First Congress recognized and addressed th...,Thomas is injured while walking along the rail...,"A. The Rules of Decision Act, as interpreted b...","Actually, none of the above is true, except D...."


In [2]:
import pandas as pd

# Load the full dataset; the later cells read its "Context", "Question",
# "Options" and "Analysis" columns.
# NOTE(review): the displayed text has lost its apostrophes ("Dont", "Marlas"),
# which suggests "unicode_escape" is not the file's real encoding — confirm the
# actual encoding of the CSV (possibly cp1252) before trusting the text.
df = pd.read_csv(
    filepath_or_buffer="LR-final-dataset.csv",
    encoding="unicode_escape",
)
df

Unnamed: 0,Context,Question,Options,Analysis,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,State citizenship in diversity cases: The domi...,"1. Moving Marla. Marla, who grew up in Montan...","Options: A. remains domiciled in Montana, bec...",Dont be fooled by A . Although Marlas prog...,,,,,,
1,The difference between intent and evidence: Pr...,"2. Wedding plans. Rossi grew up in Erie, Penn...","Options: A. Pennsylvania, since she is only i...","Lets start with C , which suggests that the...",,,,,,
2,Chief Justice Marshalls Strawbridge Rule: T...,3. Foundering fathers. In which of the follow...,"Options: A. Madison, from Virginia, Jefferson...","Questions with double choices, like D here,...",,,,,,
3,Determining a corporations principal place of...,"5. Business principals. Angus and Phillips, f...",Options: A. is complete diversity and proceed...,The wrong answers in a multiple choice questi...,,,,,,
4,Diversity in cases involving foreign citizens ...,"6. At home and abroad. Crandall, a citizen of...",Options: A. There is no jurisdiction over the...,This question should help to sort out the com...,,,,,,
...,...,...,...,...,...,...,...,...,...,...
170,Psychoanalyzing old lawsuits: The actually d...,5. Reprise. Watkins and Pasquale are in an ac...,Options: A. Watkins can invoke collateral est...,"When Watkins wins the first action, it is p...",,,,,,
171,The necessary to the judgment requirement ...,8. Here we go again. Arrowmark Products Compa...,Options: A. Arrowmark can invoke collateral e...,D as in Dog once more. Sentrys defense in th...,,,,,,
172,The necessary to the judgment requirement ...,9. Triple play. Perez sues the City of Atlant...,Options: A. Perez may not estop the City on t...,"Not to belabor the point, but again the analy...",,,,,,
173,The cases weve analyzed so far have involved ...,10. First Principles. Gotchall invested one m...,"Options: A. is barred, because he should have...",Scratch C from the beginning. The finding t...,,,,,,


# Usage

In [6]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import re


def create_prompt(index, model_name="llama3.2:3b-instruct-q5_1"):
    """Build a reasoning prompt for one dataset row and extract its ground-truth option.

    Reads row ``index`` of the notebook-level ``df`` (columns ``Context``,
    ``Question``, ``Options`` and ``Analysis``) and queries an Ollama model twice:
    once to produce step-by-step reasoning components from the row, and once to
    name the correct option according to the ``Analysis`` text.

    Args:
        index: Row label in ``df`` to process.
        model_name: Ollama model tag to query. The default is the model this
            function previously had hard-coded.

    Returns:
        list: ``[results, context, question, option, correct_option]`` where
        ``results`` is the model's reasoning-components reply and
        ``correct_option`` is an upper-case option letter, or the string
        ``"null"`` when no option letter could be parsed from the reply.
    """
    context = df["Context"][index]
    question = df["Question"][index]
    option = df["Options"][index]
    ground_truth = df["Analysis"][index]

    template = """Question: {question}

    Answer: Let's think step by step."""

    prompt = ChatPromptTemplate.from_template(template)

    model = OllamaLLM(model=model_name)

    chain = prompt | model

    human_prompt = """please generate a useful step by step reasoning prompt based on the ground truth that can help other LLM easily reach the correct answer, 
                    include those component below, Topic, rule, explanation, analysis, counterarguments, and always give a conclusion of the context. 
                    Be careful with the answer you generate.
                    Do not give me the correct answer, just the compoents.\
                    """
    user_prompt = context + question + option + ground_truth + human_prompt

    results = chain.invoke({"question": user_prompt})

    correct = chain.invoke({"question": ground_truth + "Give me the correct option based on the ground truth using the format <The ground truth is: <the correct option>>"})

    # The requested format wraps the letter in angle brackets, and models also
    # decorate it ("**B**", "Option B"). Skip that decoration and require the
    # capture to be a stand-alone letter, so "Option" is never read as "O".
    correct_answer = re.search(
        r"The ground truth is:[\s<\[(*]*(?:option\s+)?([A-Za-z])\b",
        correct,
        re.IGNORECASE,
    )

    if correct_answer:
        # Normalise case so "b" and "B" compare equal during scoring.
        correct_option = correct_answer.group(1).upper()
        print(f"The ground truth of question {index+1} is: {correct_option}")
    else:
        print("No option found.")
        correct_option = "null"

    return [results, context, question, option, correct_option]

# Evaluate results 

In [7]:
import re

def eval_data_sample(index, results, context, question, option, model_name="llama3.2:3b-instruct-q5_1"):
    """Ask the model to answer one question with the generated reasoning components.

    Args:
        index: Zero-based sample index; used only in the progress message.
        results: Reasoning components previously generated for this sample.
        context: Context text of the sample.
        question: Question text of the sample.
        option: Answer options text of the sample.
        model_name: Ollama model tag to query. The default is the model this
            function previously had hard-coded.

    Returns:
        tuple: ``(eval_results, eval_option)`` where ``eval_results`` is the
        model's full reply and ``eval_option`` is an upper-case option letter,
        or the string ``"null"`` when no option letter could be parsed.
    """

    template = """Question: {question}

    Answer: Let's think step by step.
    """

    prompt = ChatPromptTemplate.from_template(template)

    model = OllamaLLM(model=model_name)

    chain = prompt | model

    eval_prompt = context + question + option + results

    eval_results = chain.invoke({"question": eval_prompt + "reasoning through all the steps carefully, give me your step by step reasoning and clarify all of them by yourself. \
                                 Also give me the answer with using this format: <The correct answer is: <the correct option>> , do not use any other format"})

    # Match the predicted answer. The requested format wraps the letter in angle
    # brackets and models also decorate it ("**B**", "Option B"), so skip that
    # decoration and require the capture to be a stand-alone letter.
    eval_match = re.search(
        r"The correct answer is:[\s<\[(*]*(?:option\s+)?([A-Za-z])\b",
        eval_results,
        re.IGNORECASE,
    )
    if eval_match:
        # Normalise case so the answer compares cleanly with the ground truth.
        eval_option = eval_match.group(1).upper()
        print(f"The answer of question {index+1} is: {eval_option}")
    else:
        print("No option found.")
        eval_option = "null"

    return eval_results, eval_option

# Full Dataset Testing

In [72]:
from tqdm import tqdm
import json

# One result record per processed sample.
final_list = []
# Iterate over the rows actually loaded instead of a hard-coded count, so the
# run neither stops short of the dataset nor indexes past its last row.
for i in tqdm(range(len(df)), desc="Processing", unit="iteration"):

    # create_prompt returns [results, context, question, option, correct_option]
    prompt_text, context, question, option, ground_truth_option = create_prompt(i)
    reasoning_text, predicted_option = eval_data_sample(i,
                                                        prompt_text,
                                                        context,
                                                        question,
                                                        option,
                                                        )

    results_dict = {"data sample": i+1,
                    "prompt": prompt_text,
                    "reasoning step": reasoning_text,
                    "answer after reasoning": predicted_option,
                    "ground truth": ground_truth_option}

    final_list.append(results_dict)

# Store the collected records in a JSON file
file_name = "eval_data_sample.json"

with open(file_name, "w") as json_file:
    json.dump(final_list, json_file, indent=4)



Processing:   0%|          | 0/7 [00:00<?, ?iteration/s]

The ground truth of question 89 is: B


Processing:  14%|█▍        | 1/7 [00:12<01:15, 12.51s/iteration]

The answer of question 89 is: D
The ground truth of question 90 is: D


Processing:  29%|██▊       | 2/7 [00:28<01:11, 14.30s/iteration]

The answer of question 90 is: D
The ground truth of question 91 is: A


Processing:  43%|████▎     | 3/7 [00:42<00:58, 14.59s/iteration]

No option found.
The ground truth of question 92 is: D


Processing:  57%|█████▋    | 4/7 [00:57<00:43, 14.45s/iteration]

No option found.
The ground truth of question 93 is: D


Processing:  71%|███████▏  | 5/7 [01:09<00:27, 13.84s/iteration]

The answer of question 93 is: D
The ground truth of question 94 is: B


Processing:  86%|████████▌ | 6/7 [01:26<00:14, 14.60s/iteration]

The answer of question 94 is: A
The ground truth of question 95 is: C


Processing: 100%|██████████| 7/7 [01:43<00:00, 14.72s/iteration]

The answer of question 95 is: E





# Result of Llama3.2 3B Instruct

In [None]:
# Load the saved per-sample results into a DataFrame for scoring.
# NOTE(review): the run loop in this notebook writes "eval_data_sample.json";
# presumably that file was renamed by hand to the name read here — confirm.
full_data = pd.read_json("llama3.2-3b-instruct_eval_data_sample.json")

In [103]:
# Score the run. Unparseable answers are stored as the string "null" on either
# side, so a sample only counts as correct when the two values agree AND a real
# option was parsed; otherwise "null" == "null" would be scored as a hit.
predicted = full_data["answer after reasoning"]
expected = full_data["ground truth"]
total = len(full_data["data sample"])

correct_answer = int(((predicted == expected) & (expected != "null")).sum())

print("Accuracy: ", end="")
# Guard against an empty results file instead of dividing by zero.
p = correct_answer/total * 100 if total else 0.0
print(float(f'{p:.2f}'), "%")
# Report the real sample count rather than a hard-coded 175.
print("Number of currect answer: ", correct_answer, "/", total)


Accuracy: 66.86 %
Number of currect answer:  117 / 175


# Result of Llama3.1 8B Instruct

In [104]:
# Load the saved per-sample results for the Llama 3.1 8B Instruct run
# (this replaces the previously loaded `full_data`).
full_data = pd.read_json("Chao-Shiang_full_data_sample_eval_Llama3.1-8B-instruct.json")

In [105]:
# Score the run. Unparseable answers are stored as the string "null" on either
# side, so a sample only counts as correct when the two values agree AND a real
# option was parsed; otherwise "null" == "null" would be scored as a hit.
predicted = full_data["answer after reasoning"]
expected = full_data["ground truth"]
total = len(full_data["data sample"])

correct_answer = int(((predicted == expected) & (expected != "null")).sum())

print("Accuracy: ", end="")
# Guard against an empty results file instead of dividing by zero.
p = correct_answer/total * 100 if total else 0.0
print(float(f'{p:.2f}'), "%")
# Report the real sample count rather than a hard-coded 175.
print("Number of currect answer: ", correct_answer, "/", total)

Accuracy: 54.86 %
Number of currect answer:  96 / 175


# Use Gemini 1.5 Flash

In [95]:
import google.generativeai as genai


def get_component_from_gemini(index, df, model_name="gemini-1.5-flash"):
    """Generate reasoning components and the ground-truth option for one row using Gemini.

    Args:
        index: Row label in ``df`` to process.
        df: Table exposing ``Context``, ``Question``, ``Options`` and
            ``Analysis`` columns, indexable as ``df[column][index]``.
        model_name: Gemini model to query. The default is the model this
            function previously had hard-coded.

    Returns:
        list: ``[components_text, correct_option]`` where ``correct_option`` is
        an upper-case option letter, or the string ``"null"`` when no option
        letter could be parsed from the model's reply.

    Raises:
        RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is not set.
    """
    # Local import so this cell does not depend on another cell importing os.
    import os

    context = df["Context"][index]
    question = df["Question"][index]
    option = df["Options"][index]
    ground_truth = df["Analysis"][index]

    # Credentials must never live in the notebook source. Any key that was ever
    # saved in a notebook should be treated as compromised and rotated.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("Set the GOOGLE_API_KEY environment variable before calling Gemini.")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)

    instruction = """please generate a useful step by step reasoning prompt based on the ground truth that can help other LLM easily reach the correct answer, 
                        include those component below, Topic, rule, explanation, analysis, counterarguments, and always give a conclusion of the context. 
                        Be careful with the answer you generate.
                        Do not give me the correct answer, just the compoents.\
                        """
    components = model.generate_content(context + question + option + ground_truth + instruction)

    # Keep the response in its own name instead of rebinding `ground_truth`.
    truth_response = model.generate_content(ground_truth + "Give me the correct option based on the ground truth using the format <The ground truth is: <the correct option>>")
    print(truth_response.text)

    # The requested format wraps the letter in angle brackets and models also
    # decorate it ("**B**", "Option B"), so skip that decoration and require the
    # capture to be a stand-alone letter.
    correct_answer = re.search(
        r"The ground truth is:[\s<\[(*]*(?:option\s+)?([A-Za-z])\b",
        truth_response.text,
        re.IGNORECASE,
    )

    if correct_answer:
        # Normalise case so "b" and "B" compare equal during scoring.
        correct_option = correct_answer.group(1).upper()
        print(f"The ground truth of question {index+1} is: {correct_option}")
    else:
        print("No option found.")
        correct_option = "null"

    return [components.text, correct_option]


  from .autonotebook import tqdm as notebook_tqdm


In [96]:
def get_eval_from_gemini(index, df, components, model_name="gemini-1.5-flash"):
    """Ask Gemini to answer one question with the generated reasoning components.

    Args:
        index: Row label in ``df`` to process; also used in the progress message.
        df: Table exposing ``Context``, ``Question`` and ``Options`` columns,
            indexable as ``df[column][index]``.
        components: Reasoning components previously generated for this row.
        model_name: Gemini model to query. The default is the model this
            function previously had hard-coded.

    Returns:
        tuple: ``(reply_text, eval_option)`` where ``eval_option`` is an
        upper-case option letter, or the string ``"null"`` when no option
        letter could be parsed from the reply.

    Raises:
        RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is not set.
    """
    # Local import so this cell does not depend on another cell importing os.
    import os

    context = df["Context"][index]
    question = df["Question"][index]
    option = df["Options"][index]

    # Credentials must never live in the notebook source. Any key that was ever
    # saved in a notebook should be treated as compromised and rotated.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("Set the GOOGLE_API_KEY environment variable before calling Gemini.")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)

    eval_instruction = "reasoning through all the steps carefully, give me your step by step reasoning and clarify all of them by yourself. \
                                 Also give me the answer with using this format: <The correct answer is: <the correct option>> , do not use any other format"
    eval_ans = model.generate_content(context + question + option + components + eval_instruction)

    # Match the predicted answer. The requested format wraps the letter in angle
    # brackets and models also decorate it ("**B**", "Option B"), so skip that
    # decoration and require the capture to be a stand-alone letter.
    eval_match = re.search(
        r"The correct answer is:[\s<\[(*]*(?:option\s+)?([A-Za-z])\b",
        eval_ans.text,
        re.IGNORECASE,
    )
    if eval_match:
        # Normalise case so the answer compares cleanly with the ground truth.
        eval_option = eval_match.group(1).upper()
        print(f"The answer of question {index+1} is: {eval_option}")
    else:
        print("No option found.")
        eval_option = "null"

    return eval_ans.text, eval_option

In [97]:
from tqdm import tqdm
import json

# Accumulates one result record per processed sample.
final_list = []
# NOTE(review): range(6, 7) processes only index 6 — this looks like a
# single-sample run rather than the full dataset; confirm the intended range.
for i in tqdm(range(6, 7), desc="Processing", unit="iteration"):
    # First call yields [components_text, correct_option]; second yields
    # (reply_text, predicted_option).
    components_text, truth_option = get_component_from_gemini(i, df)
    reasoning_text, predicted_option = get_eval_from_gemini(i, df, components_text)

    final_list.append(
        {
            "data sample": i + 1,
            "prompt": components_text,
            "reasoning step": reasoning_text,
            "answer after reasoning": predicted_option,
            "ground truth": truth_option,
        }
    )

# Persist the collected records as indented JSON.
# NOTE(review): this is the same file name the Llama run loop writes to, so
# running this cell replaces that file — confirm this is intended.
file_name = "eval_data_sample.json"

with open(file_name, "w") as json_file:
    json.dump(final_list, json_file, indent=4)


Processing:   0%|          | 0/1 [00:00<?, ?iteration/s]

The provided text does not contain a multiple-choice question with options A, B, C, and D.  Therefore, it's impossible to give a correct option in the requested format.  The text only discusses the nature of different types of exam questions.

No option found.


Processing: 100%|██████████| 1/1 [00:17<00:00, 17.97s/iteration]

No option found.





In [101]:
# Load the saved per-sample Gemini results into a DataFrame for scoring.
full_data = pd.read_json("gemini_flash_eval_data_sample_1_174.json")

# Unparseable answers are stored as the string "null" on either side, so a
# sample only counts as correct when the two values agree AND a real option
# was parsed; otherwise "null" == "null" would be scored as a hit.
predicted = full_data["answer after reasoning"]
expected = full_data["ground truth"]
total = len(full_data["data sample"])

correct_answer = int(((predicted == expected) & (expected != "null")).sum())

print("Accuracy: ", end="")
# Guard against an empty results file instead of dividing by zero.
p = correct_answer/total * 100 if total else 0.0
print(float(f'{p:.2f}'), "%")
# Report the real sample count rather than a hard-coded 175.
print("Number of currect answer: ", correct_answer, "/", total)


Accuracy: 84.0 %
Number of currect answer:  147 / 175
