# Imports

In [1]:
import os
import re
import string

import datasets
import google.generativeai as genai
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# import torch

from evaluation import evaluate

  from .autonotebook import tqdm as notebook_tqdm


# Constants

In [2]:
# Location of the ontology file parsed below
PATH_TO_OWL = './LMSS.owl'
# LLM_PATH = '../Llama-2-7b-chat-hf'
# Secrets must not live in the notebook: the key is read from the GOOGLE_API_KEY
# environment variable (None when it is not set).
# NOTE(review): a literal key was previously committed on this line; treat it as
# leaked and revoke it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Set up the API

In [3]:
# Hand the API key from the constants cell to the google.generativeai client
genai.configure(api_key=GOOGLE_API_KEY)

In [4]:
# Module-level model handle; get_api_response() sends every prompt through it
model = genai.GenerativeModel('gemini-pro')

In [5]:
# Single throwaway request to check that the key and the model respond
response = model.generate_content("What is the meaning of life?")

In [6]:
# Show the text part of the reply from the request above
print(response.text)

The meaning of life is a deeply personal and philosophical question that has been pondered by humans for centuries. There is no single, definitive answer that is universally accepted, as the meaning of life can vary depending on individual beliefs, values, and experiences.

Some common perspectives on the meaning of life include:

* **Finding purpose and fulfillment:** Many people find meaning in their lives by pursuing their passions, hobbies, and goals. This could involve engaging in creative activities, contributing to society, or simply living in accordance with their values.
* **Experiencing joy and happiness:** Others believe that the meaning of life is to simply enjoy the experiences and moments that it offers. This could involve spending time with loved ones, pursuing pleasurable activities, or appreciating the beauty of nature.
* **Making a difference:** Some people find meaning in their lives by working to improve the world or help others. This could involve volunteering, adv

In [7]:
def get_api_response(prompt):
    """
    Send a prompt to the module-level `model` and return the text of its reply.

    Args:
        prompt (str): Prompt passed unchanged to `model.generate_content`.

    Returns:
        The `.text` attribute of the generated response.
    """
    return model.generate_content(prompt).text

# Reading OWL

In [8]:
# Read the contents of the .owl file.
# The encoding is given explicitly so decoding does not depend on the platform
# default (the ontology contains non-ASCII characters).
# NOTE(review): assumes the file is UTF-8 — confirm against its XML declaration.
with open(PATH_TO_OWL, "r", encoding="utf-8") as owl_file:
    owl_data = owl_file.read()

# Parse the OWL data using BeautifulSoup's XML parser
soup = BeautifulSoup(owl_data, 'xml')

In [9]:
# For every <owl:Class>, look up its label and definition nodes
label_definition_nodes = [
    (owl_class.find('rdfs:label'), owl_class.find('skos:definition'))
    for owl_class in soup.find_all('owl:Class')
]

# Keep only the classes where both nodes were found
complete_nodes = [
    (label_node, definition_node)
    for label_node, definition_node in label_definition_nodes
    if label_node and definition_node
]

# Whitespace-trimmed text of each node, kept as two parallel lists
labels = [label_node.text.strip() for label_node, _ in complete_nodes]
definitions = [definition_node.text.strip() for _, definition_node in complete_nodes]

data = {'Label': labels, 'Definition': definitions}
owl_df = pd.DataFrame(data)

owl_df

Unnamed: 0,Label,Definition
0,Other Personal and Household Goods Repair and ...,See industry description for 811490.
1,Other Converted Paper Product Manufacturing,This industry comprises establishments primari...
2,General Medical and Surgical Hospitals,
3,Confectionery Merchant Wholesalers,This industry comprises establishments primari...
4,Other Specialized Design Services,See industry description for 541490.
...,...,...
14248,Vocational Rehabilitation Services,
14249,Books Printing,This U.S. industry comprises establishments pr...
14250,Petrochemical Manufacturing,See industry description for 325110.
14251,Pesticide and Other Agricultural Chemical Manu...,This industry comprises establishments primari...


### Function to get classes

In [10]:
import random


def filter_label_by_substring(df, substring, max_results=3):
    """
    Return "{Label} : {Definition}" strings for rows whose 'Label' contains a substring.

    The match is case-insensitive and literal: the substring is NOT interpreted
    as a regular expression, so words containing characters such as "(" or ")"
    are searched for as-is instead of raising a regex error. Rows whose label
    is missing (NaN/None) never match.

    When more than `max_results` rows match, `max_results` of them are picked
    at random with `random.sample` (call `random.seed` beforehand for a
    reproducible selection).

    Args:
        df (pandas.DataFrame): Input DataFrame with 'Label' and 'Definition' columns.
        substring (str): Substring to search for.
        max_results (int): Maximum number of rows to return. Defaults to 3.

    Returns:
        list: List of strings in the format "{Label} : {Definition}" for matching rows.

    Raises:
        ValueError: If `max_results` is negative and at least one row matches
            (raised by `random.sample`).
    """
    # regex=False -> literal search; na=False -> missing labels count as "no match"
    # so the mask is purely boolean and safe to index with.
    mask = df['Label'].str.contains(substring, case=False, regex=False, na=False)
    filtered_df = df[mask]

    # Too many hits: keep a random subset, selected by position
    if len(filtered_df) > max_results:
        selected_positions = random.sample(range(len(filtered_df)), max_results)
        filtered_df = filtered_df.iloc[selected_positions]

    return [
        f"{label} : {definition}"
        for label, definition in zip(filtered_df['Label'], filtered_df['Definition'])
    ]

In [40]:
# Example: ontology entries whose label mentions "plaintiff", then how many came back
search_substring = 'plaintiff'
result = filter_label_by_substring(owl_df, search_substring)
print(result, len(result), sep='\n')

["Plaintiff Would Have Consented : Defendant is not liable where defendant proves plaintiff would have consented, even though a reasonable person in plaintiff's position might not have consented to the procedure had he been given enough information about its risks.", 'Executor Plaintiff : An executor of an estate who is also the plaintiff in a legal matter.', 'Comparative Fault of Plaintiff : The concept of "Comparative Fault of Plaintiff" refers to the legal principle that allows a plaintiff\'s recovery to be reduced in proportion to their own degree of fault or negligence in causing their own injuries, as compared to the fault of the defendant.']
3


In [11]:
def filter_words_by_substring(words, df):
    """
    Look up every word with `filter_label_by_substring` and concatenate the results.

    Words are processed in the given order and the entries found for each word
    are kept in the order the lookup returned them. Nothing is de-duplicated.

    Args:
        words (list): Words to search for in the 'Label' column.
        df (pandas.DataFrame): DataFrame passed through to `filter_label_by_substring`.

    Returns:
        list: All "{Label} : {Definition}" strings found, flattened into one list.
    """
    return [
        entry
        for word in words
        for entry in filter_label_by_substring(df, word)
    ]

In [12]:
def remove_stopwords(text, language='english'):
    """
    Split text on whitespace and return the words that are not stopwords.

    Leading and trailing punctuation is stripped from every token before the
    stopword check, so "the," is recognised as the stopword "the" and
    "article." is returned as "article". Punctuation inside a word
    (e.g. "Linda's", "cross-check") is left alone. Tokens made up only of
    punctuation are dropped.

    Args:
        text (str): Text to split into words.
        language (str): Name of the NLTK stopword list to use. Defaults to 'english'.

    Returns:
        list: Remaining words in their original order and casing.
    """
    # Get the stopwords for the specified language
    # NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
    # downloaded beforehand; no download call is visible in this notebook.
    stopwords_list = set(stopwords.words(language))

    filtered_words = []
    for raw_word in text.split():
        word = raw_word.strip(string.punctuation)
        # An empty token would match every label in a substring search, so skip it
        if word and word.lower() not in stopwords_list:
            filtered_words.append(word)

    return filtered_words

In [13]:
# Example: build the ontology entries for one statement
search_substring = 'On the issue of whether Jennifer suffered reputational harm from Lindas article, the fact that Linda worked with several different editors to proof read and cross-check her article.'
filtered_words = remove_stopwords(search_substring)
result = filter_words_by_substring(filtered_words,owl_df)
# extend, not append: the hearsay lookup returns a list, and appending it would
# nest that list as one element inside the list of strings (and skew the count)
result.extend(filter_label_by_substring(owl_df,'hearsay'))
print(result)
print(len(result))

['B240 Tax Issues : Analyses and advice regarding tax-related issues, including the preservation of net operating loss carry forwards', 'Motion for New Trial - Evidentiary Issues : A Motion for New Trial - Evidentiary Issues is a request made to a court to reconsider a previous decision based on issues related to the admissibility or sufficiency of evidence presented during the trial.', 'JE00 Issue / Statements of Case (UK J-CODE) : Covers issue and acknowledgment of proceedings, Statements of Case and Further Information requests/responses.  Includes taking instructions, making inquiries and searches, researching, drafting, editing, filing and all meetings and communications for the purpose of such documents.', "Reputational Risk : The threats and opportunities associated with an organization's reputation and credibility with its partners, stakeholders, and the public — that might damage an organization's public image, credibility, or brand value. These risks can stem from various sou

# Dataset

In [14]:
# Load the "hearsay" configuration of nguha/legalbench.
# NOTE: the recorded output warns that `trust_remote_code=True` will become
# mandatory for this dataset in the next major release of `datasets`.
dataset_hearsay = datasets.load_dataset("nguha/legalbench", "hearsay")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [15]:
# Test split as a pandas DataFrame (columns shown below: answer, index, slice, text)
test_df = dataset_hearsay['test'].to_pandas()
test_df

Unnamed: 0,answer,index,slice,text
0,No,0,Non-assertive conduct,On the issue of whether James is an smart indi...
1,No,1,Non-assertive conduct,On the issue of whether Robert negligently dro...
2,No,2,Non-assertive conduct,On the issue of whether John knew about the co...
3,No,3,Non-assertive conduct,On the issue of whether Michael was guilty of ...
4,No,4,Non-assertive conduct,On the issue of whether William was loved by h...
...,...,...,...,...
89,No,89,Not introduced to prove truth,To prove that Arthur believed that Amy and Dan...
90,No,90,Not introduced to prove truth,To prove that the trademarks of restaurant A a...
91,No,91,Not introduced to prove truth,To prove that Michael knew of the existing pat...
92,No,92,Not introduced to prove truth,To prove that Arthur and Mary had a conversati...


In [16]:
# One statement per test example, taken from the `text` column as a plain list
prompts = test_df["text"].tolist()
prompts

['On the issue of whether James is an smart individual, the fact that James came first in his class in law school.',
 'On the issue of whether Robert negligently drove, the fact that Robert fell asleep while driving.',
 'On the issue of whether John knew about the conspiracy, the fact that John likes sweatpants.',
 'On the issue of whether Michael was guilty of murder, the fact that Michael left the crime scene immediately.',
 'On the issue of whether William was loved by his community, the fact that he was selected to speak at his graduation.',
 'On the issue of whether Mary robbed the bank, the fact that Mary went to the bank in disguise.',
 'On the issue of whether Patricia was a fan of Coldplay, the fact that she had a poster with the lyrics of "Viva la Vida" on her bedroom wall.',
 "On the issue of whether Jennifer suffered reputational harm from Linda's article, the fact that Linda worked with several different editors to proof read and cross-check her article.",
 "On the issue o

In [19]:
# Rebuild the list from test_df before dropping the entry, so re-running this
# cell always removes the same example instead of popping one more each time
# (same pattern as the `actual_answers` cell further down).
# NOTE(review): the removed statement contains the tokens "(in" and "English)";
# presumably it is excluded because it failed somewhere in the pipeline —
# confirm whether the exclusion is still needed.
prompts = test_df["text"].tolist()
prompts.pop(83)

'To prove that Arthur knew English, the fact that Arthur told Bill (in English) that he thought Mary robbed the bank.'

In [20]:
# After the pop, position 83 holds the statement that followed the removed one
prompts[83]

'To prove that Arthur had notice that a bank robbery was being planned, the fact that Arthur told Mary that he heard a bank robbery was going to happen.'

## Making the Prompt

In [25]:
def add_labels_and_definitions_to_prompt(prompt_text):
    """
    Build the full hearsay prompt for one statement.

    The prompt consists of the statement, a list of ontology entries in the
    format "{Label} : {Definition}" (one per line), and closing instructions
    asking for a one-word Yes/No answer. The entries are looked up in the
    module-level `owl_df`: first those found for the statement's non-stopwords,
    then those whose label contains 'hearsay'.

    Args:
        prompt_text (str): The statement to classify.

    Returns:
        str: The full prompt text with the ontology entries inserted.
    """
    header = f"""
    Statement : {prompt_text}
    Question: Consider utilizing the following legal ontology classes to frame your argument:\n\n
    """

    footer = """\n
    Use these ontology classes to structure your argument and analyze whether the information provided falls under the category of hearsay.
    
    Output Format: One word (Yes/No) indicating whether the statement is hearsay.

    Answer: 
    """

    # Entries matched by the statement's own words, followed by the hearsay entries
    ontology_entries = filter_words_by_substring(remove_stopwords(prompt_text), owl_df)
    ontology_entries.extend(filter_label_by_substring(owl_df, 'hearsay'))

    # Each entry starts on a new line
    entry_lines = "".join(f"\n{entry}" for entry in ontology_entries)

    return header + entry_lines + footer


# Testing - Hearsay

In [26]:
# Same 'hearsay' lookup that add_labels_and_definitions_to_prompt performs for every prompt
filtered_labels = filter_label_by_substring(owl_df, "hearsay")
filtered_labels

['Motion in Limine to Exclude Hearsay Witness : A Motion in Limine to Exclude Hearsay Witness is a legal request made by one party to prevent the other party from presenting testimony from a witness who will testify about statements made by someone else out of court, which are being offered to prove the truth of the matter asserted.',
 'Motion to Exclude Hearsay Witness : A Motion to Exclude Hearsay Witness is a legal request to prevent a witness from testifying in court based on the fact that their testimony is based on hearsay evidence.']

In [27]:
# Inspect the complete prompt built for the first statement
example = add_labels_and_definitions_to_prompt(prompts[0])
print(example)


    Statement : On the issue of whether James is an smart individual, the fact that James came first in his class in law school.
    Question: Consider utilizing the following legal ontology classes to frame your argument:


    
Credit Issuer Dishonoring or Repudiating : The concept of "Credit Issuer Dishonoring or Repudiating" refers to the act of a bank or other entity failing to honor or refusing to acknowledge a debt or obligation owed to a creditor, typically involving commercial paper or other financial instruments.
B240 Tax Issues : Analyses and advice regarding tax-related issues, including the preservation of net operating loss carry forwards
Motion for New Trial - Evidentiary Issues : A Motion for New Trial - Evidentiary Issues is a request made to a court to reconsider a previous decision based on issues related to the admissibility or sufficiency of evidence presented during the trial.
Saint James : Saint James is a geographic location in Jamaica.
Saint James : The SEC Di

In [28]:
# End-to-end check on one statement: build its prompt, then query the model
prompt = add_labels_and_definitions_to_prompt(prompts[0])

get_api_response(prompt)

'No'

In [29]:
# Model answers, collected in the same order as `prompts`
responses = []

for i, prompt_text in enumerate(prompts):

    # Statement plus ontology entries and answer instructions
    full_prompt = add_labels_and_definitions_to_prompt(prompt_text)

    # One API request per statement; there is no error handling here, so an
    # exception stops the loop (answers already appended to `responses` remain)
    response = get_api_response(full_prompt)
    
    responses.append(response)
    
    # Progress marker, 1-based
    print(f"Done for prompt {i+1}")
    # print(response)


Done for prompt 1
Done for prompt 2
Done for prompt 3
Done for prompt 4
Done for prompt 5
Done for prompt 6
Done for prompt 7
Done for prompt 8
Done for prompt 9
Done for prompt 10
Done for prompt 11
Done for prompt 12
Done for prompt 13
Done for prompt 14
Done for prompt 15
Done for prompt 16
Done for prompt 17
Done for prompt 18
Done for prompt 19
Done for prompt 20
Done for prompt 21
Done for prompt 22
Done for prompt 23
Done for prompt 24
Done for prompt 25
Done for prompt 26
Done for prompt 27
Done for prompt 28
Done for prompt 29
Done for prompt 30
Done for prompt 31
Done for prompt 32
Done for prompt 33
Done for prompt 34
Done for prompt 35
Done for prompt 36
Done for prompt 37
Done for prompt 38
Done for prompt 39
Done for prompt 40
Done for prompt 41
Done for prompt 42
Done for prompt 43
Done for prompt 44
Done for prompt 45
Done for prompt 46
Done for prompt 47
Done for prompt 48
Done for prompt 49
Done for prompt 50
Done for prompt 51
Done for prompt 52
Done for prompt 53
Do

In [30]:
# Spot-check a single answer
print(responses[20])

No


In [31]:
# All collected answers, in prompt order
print(responses)

['No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'No']


In [38]:
# Gold answers from the test split; position 83 is removed here as well so the
# list stays aligned with `prompts`, where the same position was popped
actual_answers = test_df["answer"].tolist()
actual_answers.pop(83)

'No'

In [39]:
# Score the answers with the imported `evaluate` helper; the gold answers are
# truncated to the number of responses actually collected
evaluate("hearsay", responses, actual_answers[:len(responses)])

0.6768292682926829

In [40]:
# Create a DataFrame with each statement next to the model's answer and the gold answer.
# The three lists must have the same length for the constructor to accept them.
df = pd.DataFrame({
    'Prompt': prompts,
    'Response': responses,
    'Actual Answer': actual_answers
})

In [41]:
# Display the comparison table
df

Unnamed: 0,Prompt,Response,Actual Answer
0,On the issue of whether James is an smart indi...,No,No
1,On the issue of whether Robert negligently dro...,Yes,No
2,On the issue of whether John knew about the co...,No,No
3,On the issue of whether Michael was guilty of ...,No,No
4,On the issue of whether William was loved by h...,No,No
...,...,...,...
88,To prove that Arthur believed that Amy and Dan...,No,No
89,To prove that the trademarks of restaurant A a...,Yes,No
90,To prove that Michael knew of the existing pat...,No,No
91,To prove that Arthur and Mary had a conversati...,Yes,No


In [42]:
# Save the comparison table to CSV, leaving out the DataFrame index column
df.to_csv('hearsay_results.csv', index=False)