# Imports

In [1]:
import datasets
from bs4 import BeautifulSoup
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import re
from evaluation import evaluate
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm
2024-05-01 13:22:31.617713: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ojasva20318/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Constants

In [2]:
PATH_TO_OWL = './LMSS.owl'
LLM_PATH = '../Llama-2-7b-chat-hf'

# Set up Llama

In [3]:
model = AutoModelForCausalLM.from_pretrained(LLM_PATH)
tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

Loading checkpoint shards: 100%|██████████| 6/6 [00:03<00:00,  1.96it/s]


In [4]:
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer,torch_dtype=torch.float16,device=0)

In [5]:
def get_llama_response(prompt):
    sequences = pipe(
        prompt,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=16384
        # truncation=True
    )
    return sequences[0]['generated_text']

In [6]:
#example
prompt = '''Write me a poem about Machine Learning.'''
answer = get_llama_response(prompt)
print(answer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Write me a poem about Machine Learning.

Machine learning, a wondrous sight
A world of data, a world of might
With algorithms and models so bright
It learns and grows, day and night

It starts with a problem, a challenge to solve
A task to complete, a goal to unfold
It gathers data, a treasure so grand
And learns from it, hand in hand

With each new lesson, it grows in might
A force to be reckoned with, a sight
It adapts and evolves, a true delight
A marvel of science, a work of art in sight

It's a world of wonder, a world of dreams
A future of possibilities, it seems
Machine learning, a true machine
A world of magic, a world of gleam.


# Reading OWL

In [7]:
# Read the contents of the .owl file
with open(PATH_TO_OWL, "r") as owl_file:
    owl_data = owl_file.read()

# Parse the OWL data using BeautifulSoup
soup = BeautifulSoup(owl_data, 'xml')

In [8]:
# Initialize lists to store data
labels = []
definitions = []

# Find all instances of <owl:Class> elements and extract label and definition
for owl_class in soup.find_all('owl:Class'):
    label_element = owl_class.find('rdfs:label')
    definition_element = owl_class.find('skos:definition')
    
    # Check if label and definition elements exist
    if label_element and definition_element:
        label = label_element.text.strip()
        definition = definition_element.text.strip()
        
        # Append data to lists
        labels.append(label)
        definitions.append(definition)

data = {'Label': labels, 'Definition': definitions}
owl_df = pd.DataFrame(data)

owl_df

Unnamed: 0,Label,Definition
0,Other Personal and Household Goods Repair and ...,See industry description for 811490.
1,Other Converted Paper Product Manufacturing,This industry comprises establishments primari...
2,General Medical and Surgical Hospitals,
3,Confectionery Merchant Wholesalers,This industry comprises establishments primari...
4,Other Specialized Design Services,See industry description for 541490.
...,...,...
14248,Vocational Rehabilitation Services,
14249,Books Printing,This U.S. industry comprises establishments pr...
14250,Petrochemical Manufacturing,See industry description for 325110.
14251,Pesticide and Other Agricultural Chemical Manu...,This industry comprises establishments primari...


### Function to get classes

In [9]:
import random


def filter_label_by_substring(df, substring):
    """
    Filter DataFrame rows containing the specified substring in the 'Label' column
    and return a list of strings in the format "{Label} : {Definition}".
    
    Args:
        df (pandas.DataFrame): Input DataFrame.
        substring (str): Substring to search for.
        
    Returns:
        list: List of strings in the format "{Label} : {Definition}" for matching rows.
    """
    safe_substring = re.escape(substring)
    filtered_df = df[df['Label'].str.contains(safe_substring,case=False)]
    output_list = []
    
    if len(filtered_df) <= 3:
        for index, row in filtered_df.iterrows():
            output_list.append(f"{row['Label']} : {row['Definition']}")
    else:
        selected_indices = random.sample(range(len(filtered_df)), 3)
        for idx in selected_indices:
            row = filtered_df.iloc[idx]
            output_list.append(f"{row['Label']} : {row['Definition']}")
    
    return output_list


In [10]:
def filter_words_by_substring(words, df):
    """
    Filter DataFrame rows for each word in the list of words and append the results in a final list.
    
    Args:
        words (list): List of words.
        df (pandas.DataFrame): Input DataFrame.
        
    Returns:
        list: List of strings in the format "{Label} : {Definition}" for matching rows for all words.
    """
    final_output = []
    for word in words:
        output_list = filter_label_by_substring(df, word)
        final_output.extend(output_list)
    return final_output

In [11]:
def remove_stopwords(text, language='english'):
    # Get the stopwords for the specified language
    stopwords_list = set(stopwords.words(language))
    
    # Split the text into words
    words = text.split()
    
    # Remove stopwords
    filtered_words = [word for word in words if word.lower() not in stopwords_list]
    
    return filtered_words

# Dataset

In [12]:
dataset_causality = datasets.load_dataset("nguha/legalbench", "legal_reasoning_causality")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [13]:
test_df = dataset_causality['test'].to_pandas()
test_df

Unnamed: 0,answer,index,text
0,Yes,0,This review indicates that the statements in S...
1,Yes,1,A statistical study is not inadmissible merely...
2,Yes,2,Equally without evidentiary significance is th...
3,Yes,3,The expert's failure to make any adjustment fo...
4,Yes,4,Dr. Vekker analyzes only correlation and is up...
5,Yes,5,Mariner's [**21] explanation is not so convin...
6,Yes,6,"Here, the scope and specificity of the plainti..."
7,Yes,7,To explore a possible cause and effect relatio...
8,Yes,8,"In the Sixth Circuit, it is required that in o..."
9,Yes,9,One nondiscriminatory reason advanced by defen...


In [14]:
prompts = test_df["text"].tolist()
prompts

['This review indicates that the statements in Sheehan and People Who Care that an expert\'s failure to consider variables other than the salient characteristic renders his opinions inadmissible are outliers. HN26 There are some common themes in the cases: statistics play an important role in a plaintiffs\' ability to make a prima facie case of pattern-or-practice discrimination; experts should carefully formulate their initial hypotheses; experts should consider variables other than the salient characteristic and their failure to do so weakens and renders less probative their conclusions. But particularly when plaintiffs have other evidence (such as individual testimony about personal experiences of discrimination), the weight of decision seems to treat failure to consider other variables as a factor going to the weight of the statistical evidence and not as a factor that renders the evidence inadmissible. The court concludes that Neumark\'s failure to follow what the Seventh Circuit 

In [15]:
prompts[0]

'This review indicates that the statements in Sheehan and People Who Care that an expert\'s failure to consider variables other than the salient characteristic renders his opinions inadmissible are outliers. HN26 There are some common themes in the cases: statistics play an important role in a plaintiffs\' ability to make a prima facie case of pattern-or-practice discrimination; experts should carefully formulate their initial hypotheses; experts should consider variables other than the salient characteristic and their failure to do so weakens and renders less probative their conclusions. But particularly when plaintiffs have other evidence (such as individual testimony about personal experiences of discrimination), the weight of decision seems to treat failure to consider other variables as a factor going to the weight of the statistical evidence and not as a factor that renders the evidence inadmissible. The court concludes that Neumark\'s failure to follow what the Seventh Circuit a

In [16]:
import re

def remove_brackets(strings):
    # Define the pattern to remove square and round brackets
    pattern = r'[\[\]()]'
    # Use list comprehension to apply the removal to each string in the list
    return [re.sub(pattern, '', string) for string in strings]


In [17]:
prompts = remove_brackets(prompts)
prompts

['This review indicates that the statements in Sheehan and People Who Care that an expert\'s failure to consider variables other than the salient characteristic renders his opinions inadmissible are outliers. HN26 There are some common themes in the cases: statistics play an important role in a plaintiffs\' ability to make a prima facie case of pattern-or-practice discrimination; experts should carefully formulate their initial hypotheses; experts should consider variables other than the salient characteristic and their failure to do so weakens and renders less probative their conclusions. But particularly when plaintiffs have other evidence such as individual testimony about personal experiences of discrimination, the weight of decision seems to treat failure to consider other variables as a factor going to the weight of the statistical evidence and not as a factor that renders the evidence inadmissible. The court concludes that Neumark\'s failure to follow what the Seventh Circuit ap

In [18]:
def add_labels_and_definitions_to_prompt(prompt_text):
    """
    Add filtered labels and definitions to the prompt.

    Args:
        prompt_text (str): The prompt text.
        filtered_labels (list): List of strings containing labels and definitions.

    Returns:
        str: The full prompt text with filtered labels and definitions added.
    """
    # Initialize full_prompt with prompt_text
    full_prompt = f"""
    Opinion : {prompt_text}
    Question: Consider utilizing the following legal ontology classes to frame your argument:\n\n
    """
    
    # Add filtered labels and definitions to the prompt
    filtered_words = remove_stopwords(prompt_text)
    #print('done swr')
    filtered_labels = filter_words_by_substring(filtered_words,owl_df)
    #print('done fwbs')
    filtered_labels.extend(filter_label_by_substring(owl_df,'evidence'))
    #print('done flbs')
    filtered_labels.extend(filter_label_by_substring(owl_df,'causation'))
    #print('done flbs')

    for label_definition in filtered_labels:
        full_prompt += f"\n{label_definition}"

    # Add the remaining part of the prompt
    full_prompt += """
    Use these ontology classes to structure your argument and analyze whether the following opinion excerpts rely on statistical evidence? Think Step by Step.
    
    Output Format: First word should be Yes/No and then the reason next line onwards.

    Answer 
    """
    
    return full_prompt


# Testing - Causality

In [19]:
example = add_labels_and_definitions_to_prompt(prompts[1])
print(example)


    Opinion : A statistical study is not inadmissible merely because it is unable to exclude all possible causal factors other than the one of interest. But a statistical study that fails to correct for salient explanatory variables, or even to make the most elementary comparisons, has no value as causal explanation and  *538  is therefore inadmissible in a federal court. The idea that the educational deficiencies of minority students in the Rockford public schools are due primarily to discrimination by the school authorities and can be rectified by an equitable decree is at once unsubstantiated by responsible evidence and--since there is no evidence that these deficiencies are any greater than in school districts around the country that have not been held to have discriminated against minority students-- implausible.

Tracking might be adopted in order to segregate the races. The well-known correlation between race and academic performance makes tracking, even when implemented in acc

In [20]:
prompt = add_labels_and_definitions_to_prompt(prompts[0])

get_llama_response(prompt)

RuntimeError: The size of tensor a (8192) must match the size of tensor b (11558) at non-singleton dimension 3

In [None]:
responses = []

for i, prompt_text in enumerate(prompts):

    full_prompt = add_labels_and_definitions_to_prompt(prompt_text)

    response = get_llama_response(full_prompt)
    
    responses.append(response)
    
    print(f"Done for prompt {i+1}")
    # print(response)


Done for prompt 1
Done for prompt 2
Done for prompt 3




Done for prompt 4
Done for prompt 5
Done for prompt 6
Done for prompt 7
Done for prompt 8
Done for prompt 9
Done for prompt 10
Done for prompt 11
Done for prompt 12
Done for prompt 13
Done for prompt 14
Done for prompt 15
Done for prompt 16
Done for prompt 17
Done for prompt 18
Done for prompt 19
Done for prompt 20
Done for prompt 21
Done for prompt 22
Done for prompt 23
Done for prompt 24
Done for prompt 25
Done for prompt 26
Done for prompt 27
Done for prompt 28
Done for prompt 29
Done for prompt 30
Done for prompt 31
Done for prompt 32
Done for prompt 33
Done for prompt 34
Done for prompt 35
Done for prompt 36
Done for prompt 37
Done for prompt 38
Done for prompt 39
Done for prompt 40
Done for prompt 41
Done for prompt 42
Done for prompt 43
Done for prompt 44
Done for prompt 45
Done for prompt 46
Done for prompt 47
Done for prompt 48
Done for prompt 49
Done for prompt 50
Done for prompt 51
Done for prompt 52
Done for prompt 53
Done for prompt 54
Done for prompt 55
Done for prompt 56

### Parsing output

In [None]:
def check_not_hearsay(phrases):
    results = []
    # Use regex to find "Answer:" followed by "not hearsay" with case insensitivity
    pattern = r"Answer:.*not hearsay"
    for phrase in phrases:
        match = re.search(pattern, phrase, re.IGNORECASE)
        if match:
            results.append("No")
        else:
            results.append("Yes")
    return results


In [None]:
extracted_answers = check_not_hearsay(responses)

In [None]:
print(extracted_answers)

['Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes']


In [None]:
# # only if NaN or Unknowns are present
# for i in range(len(extracted_answers)):
#     if extracted_answers[i] == 'Unknown':
#         extracted_answers[i] = random.choice(['Yes', 'No'])

In [None]:
# print(extracted_answers)

In [None]:
actual_answers = test_df["answer"].tolist()
actual_answers.pop(83)

'No'

In [None]:
evaluate("hearsay", extracted_answers, actual_answers[:len(extracted_answers)])

0.5358818011257036