In [1]:
import os
os.chdir('..')
notebook_dir = os.getcwd()

In [2]:
import pandas as pd
from src import openAIHandler



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from src import openAIHandler


In [3]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
def BuildVectorDB(directory,legislation_list):
    #directory = data/legislation
    def load_legislative_sections(directory, legislation_number):
        sections = []
        for filename in os.listdir(directory):
            if filename.endswith(".txt"):
                try:
                    section_number = filename.split('-')[1].split('.')[0]  # Extract section number
                    with open(os.path.join(directory, filename), 'r') as file:
                        text = file.read().strip()  # Read the content of the file
                        sections.append({
                            "id": f"{legislation_number}_section_{section_number}",
                            "text": text,
                            "legislation_id": legislation_number
                        })
                except:
                    pass
        return sections

    docs = []
    for legislation_number in legislation_list:
        try:
            #print(legislation_number)
            legislative_sections = load_legislative_sections(
                f"{directory}/{legislation_number}", legislation_number
            )
            doc = [
                Document(page_content=sec["text"], 
                         metadata={
                             "id": sec["id"],
                             "legislation_id": sec["legislation_id"]  # Add legislation_id to metadata
                         }) 
                for sec in legislative_sections
            ]
            docs.extend(doc)
        except Exception as e:
            print(f"Error processing legislation {legislation_number}: {str(e)}")

    try:
        embeddings = openAIHandler.getEmbeddings()
        vectorstore = FAISS.from_documents(docs, embeddings)
    except Exception as e:
        print(f"Error creating vector store: {str(e)}")
        raise
    
    return vectorstore


In [4]:


def flatten_list_of_lists(list_of_lists):
    """
    Flattens a list of lists into a single list containing all the values.

    Args:
        list_of_lists (list): A list where each element is a list.

    Returns:
        list: A single list containing all the values from the input lists.
    """
    return [item for sublist in list_of_lists for item in sublist]
import pickle
with open('data/cleaned_case_legislation_map_2020data_round6.pkl', 'rb') as f:
    case_legislation_dic = pickle.load(f)
acts = list(set(flatten_list_of_lists(case_legislation_dic.values())))

In [5]:
print(acts)
vectore_store = BuildVectorDB(acts)

['1973/45', '1985/60', '1984/60', '1976/50', '1989/40', '1983/19', '1996/27', '2002/38', '1986/55', '2000/14', 'Geo5/15-16/23', '1976/36', '1964/81', '2014/6', '1981/54', '2015/9', '1983/20', '1998/42', '2003/31', 'Eliz2/8-9/65', '1986/45', '2005/9', '1989/41']


  warn_deprecated(


In [6]:
def get_relevantSection(query, legislation_filter_list):
    relevant_doc = None
    try:
        score = 1
        for legislation in legislation_filter_list:
            results = vectore_store.similarity_search_with_score(
                query=query,
                k=1,
                filter={"legislation_id": legislation}
            )
            #print(results)

            #print(legislation)
            #print(results)
            if len(results)>0:
                doc,score_r = results[0]

                if score_r < score:
                    score = score_r
                    relevant_doc = doc

                
    except Exception as e:
        print(f"Error in get_relevantSection: {e}")
        relevant_doc = None  # Return None if error occurred in search
            
    return relevant_doc

#This is a little test case 
legislation_filter_list = ['1989/41','1976/63']
query = "The Act of 1972 has effect on and after exit day as if — (a) the definitions of “the Treaties” and “the EU Treaties” given by section 1(2) to " #2018/16
query2 = "The mother is concerned about the welfare of the child in all circumstances"
query3 = "The Court of Appeal have confirmed that there is no presumption, as there is for parents, that a grandparent who has obtained the leave of the court to apply for a Child Arrangements Order should be entitled to contact unless there are cogent reasons for denying it to them  (Re A (Section 8 Order: Grandparent Application)   [1995] 2 FLR 153 )."
doc = get_relevantSection(query3, legislation_filter_list)
print(doc.metadata)

{'id': '1989/41_section_10', 'legislation_id': '1989/41'}


In [7]:
case_list = list(case_legislation_dic.keys())

In [8]:
len(case_list)

26

In [9]:
import pandas as pd
import ast
def process_case_annotations(case_number, input_dir, output_dir, case_legislation_dic):
        test_case = f'{input_dir}/{case_number}.csv'
        print(f"processing {test_case}")
        annotations_df_gpt=pd.read_csv(test_case)
        annotations_df_gpt['section_id'] = 0 
        annotations_df_gpt['section_text'] = ''
        legislation_list = case_legislation_dic[case_number]
        
        for i,row in annotations_df_gpt.iterrows():
            if (row['if_interpretation'] == 1):
                try:
                    paragraph = row['paragraphs']
                    references = row.get('references',[])
                    references = ast.literal_eval(references)
                    
                    if len(references)>0:
                        # Extract legislation sections from references if available
                        for ref in references:
                            if isinstance(ref, dict) and 'legislation_section' in ref:
                                legislation_id, section = ref['legislation_section']
                                if legislation_id:
                                    section_id = f"{legislation_id}_{section}"
                                    # Get section text using get_relevantSection
                                    relevant_doc = get_relevantSection(paragraph, [legislation_id])
                                    if relevant_doc:
                                        section_text = relevant_doc.page_content
                                        annotations_df_gpt.at[i, 'section_id'] = section_id
                                        annotations_df_gpt.at[i, 'section_text'] = section_text
                                    break
                    else:
                        # Fall back to original behavior if no references
                        relevant_doc = get_relevantSection(paragraph, legislation_list)
                        if relevant_doc:
                            section_id = relevant_doc.metadata.get("id", "unknown")
                            section_text = relevant_doc.page_content
                            annotations_df_gpt.at[i, 'section_id'] = section_id
                            annotations_df_gpt.at[i, 'section_text'] = section_text
                except:
                    pass
                    
        output_file = f'{output_dir}/ewhc_fam_2020_{case_number}_sections.csv'
        annotations_df_gpt.to_csv(output_file, index=False)
input_dir = "data/case_laws/csvs/xml_to_csv/2020"
output_dir = "data/case_laws/csvs/2020"
for case_number in case_list:
    process_case_annotations(case_number, input_dir, output_dir, case_legislation_dic)
    

processing data/case_laws/csvs/xml_to_csv/2020/1012.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/1238.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/182.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/220.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/1548.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/3005.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/1599.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/877.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/323.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/1098.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/252.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/1116.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/2741.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/2968.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/881.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/3257.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/1287.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/1903.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/3496.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/3195.csv
processing data/case_laws/csvs/xml_to_csv/2020/1346.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/1805.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/162.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/2878.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/3379.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


processing data/case_laws/csvs/xml_to_csv/2020/574.csv


  annotations_df_gpt.at[i, 'section_id'] = section_id


In [11]:

import json

def checkIfMatch(phrase,text):
    if phrase.lower() in text.lower():
        return True
    else:
        return False
def getIflegit(results,case_text,legislation_text):
    legitresults = []
    for result in results:
        case_Phrase = result['case_law_term']
        legilation_phrase = result['legislation_term']
        
        if checkIfMatch(case_Phrase, case_text) and checkIfMatch(legilation_phrase, legislation_text):
            legitresults.append(result)
    return legitresults



In [12]:
import re
def getJsonList(results_str):
    try:
        results = json.loads(results_str)
        return results
    except:
        match = re.search(r'```json\n(.*?)\n```', results_str, re.S)
        if match:
            json_string = match.group(1)
        try:
            # Parse the extracted JSON string
            json_data = json.loads(json_string)
            print("Successfully extracted JSON list:")
            return json_data

           
        except json.JSONDecodeError as e:
            print("Error parsing JSON:", e)
            return []

In [13]:
llm_chain_extraction = openAIHandler.getPhraseExtractionChain()
def processToGetTriples(case_number, llm_chain_extraction, input_folder, output_folder):
    annotations_df_gpt = pd.read_csv(f'{input_folder}/ewhc_fam_2020_{case_number}_sections.csv',index_col=False)
    annotations_df_gpt['triples_result'] = ''
    for i ,row in annotations_df_gpt.iterrows():
        para_id =row['para_id']
        case_text = row['paragraphs']
        legislation_text = row['section_text']
        section_id = row['section_id']
        
        if section_id != '0':
        #print(text)
        #print(section_text)
            try:
                RESULTS = openAIHandler.getInterPretations(legislation_text,case_text,llm_chain_extraction)
                print(para_id)
                print(section_id)
                print("===========================")

                results = getJsonList(RESULTS)
                RESULTS_legit = getIflegit(results,case_text,legislation_text)
                #print(RESULTS_legit)
                annotations_df_gpt.at[i, 'triples_result'] = RESULTS_legit
            except Exception as e:
                print(f"Error occurred: {e}")
                continue
    annotations_df_gpt.to_csv(f'{output_folder}/ewhc_fam_2020_{case_number}_sections.csv',index=False)


  warn_deprecated(


In [14]:
import time
for case in case_list:
    print(f"processing case {case}")
    output_dir = "data/case_laws/csvs/2020"
    processToGetTriples(case, llm_chain_extraction, output_dir, output_dir)
    time.sleep(30)

processing case 1012
para_16
2000/14_None
Successfully extracted JSON list:
para_17
2000/14_11
Successfully extracted JSON list:
para_31
1989/41_section_44
Successfully extracted JSON list:
para_32
1983/20_section_137
Successfully extracted JSON list:
processing case 1238
para_16
1989/41_section_49
Successfully extracted JSON list:
para_24
1989/41_1/1
Successfully extracted JSON list:
para_25
1989/41_section_23CZB
Successfully extracted JSON list:
para_27
1989/41_section_100
Successfully extracted JSON list:
para_29
1989/41_section_30
Successfully extracted JSON list:
para_30
1989/41_None
Successfully extracted JSON list:
para_31
1989/41_section_1
Successfully extracted JSON list:
para_33
1989/41_section_9
Successfully extracted JSON list:
para_44
1989/41_section_11D
para_47
1989/41_section_100
Successfully extracted JSON list:
para_64
1989/41_section_8
Successfully extracted JSON list:
para_66
1989/41_8
Successfully extracted JSON list:
para_71
1989/41_section_11N
Successfully extract

In [15]:
import ast
def getTheInterpretationDf(dataframe):
    # Filter rows where 'triples_result' is not NaN
    filtered_df = dataframe[dataframe['triples_result'].notna()]

    # Initialize a list to store the extracted data
    extracted_data = []

    # Iterate over each row in the filtered DataFrame
    for _, row in filtered_df.iterrows():
        # Parse the 'triples_result' JSON string into a list of dictionaries
        triples = ast.literal_eval(row['triples_result'])

        # Extract relevant fields from each triple
        for triple in triples:
            try:
                legislation_phrases =triple['key_phrases/concepts']
            except:
                legislation_phrases = triple['key_phrases']
                

            case_term = triple.get('case_law_term', '')
            legislation_term = triple.get('legislation_term', '')
            confidence = triple.get('confidence', '')
            reasoning = triple.get('reasoning', '')
            #legislation_phrases = triple.get('key_phrases/concepts', [])
            

            # Append the extracted data along with additional information to the list
            extracted_data.append({
                'url': row.get('case_uri', ''),
                'para_id': row.get('para_id', ''),
                'paragraphs': row.get('paragraphs', ''),
                'case_term_phrases': row.get('interpretation_phrases', ''),
                'legislation_id': row.get('section_id', ''),
                'section_text':row.get('section_text', ''),
                'case_term': case_term,
                'legislation_term': legislation_term,
                'confidence': confidence,
                'reasoning': reasoning,
                'key_phrases': legislation_phrases
            })

    # Create a new DataFrame from the extracted data
    new_dataframe = pd.DataFrame(extracted_data)

    # Return the new DataFrame
    return new_dataframe




In [3]:
import pickle
with open('data/cleaned_case_legislation_map.pkl', 'rb') as f:
    case_legislation_dic = pickle.load(f)
case_numbers = list(case_legislation_dic.keys())

In [12]:
import pandas as pd

# Read the CSV file
triples_df = pd.read_csv('data/2024-familylaw-12cases-triples.csv')

# Extract case numbers from URLs
case_numbers = []
for url in triples_df['url'].unique():
    try:
        # Assuming URLs follow a pattern like "...ewhc_fam_2024_123..."
        case_number = url.split('/')[-1]
        case_numbers.append(case_number)
    except:
        continue

# Remove duplicates and sort
case_numbers = sorted(list(set(case_numbers)))
print(f"Found {len(case_numbers)} unique case numbers")


Found 12 unique case numbers


In [16]:
import pandas as pd
def new_func(case_numbers):
    final_df = pd.DataFrame()
    for case_number in case_numbers:
        print(f"===========case {case_number}=====================")
        dataframe = pd.read_csv(f'data/case_laws/csvs/2024/ewhc_fam_2024_{case_number}_sections.csv',index_col=False)
        selected_rows_df = getTheInterpretationDf(dataframe)
        #Add the selected rows in the Final DataFrame
        final_df = pd.concat([final_df, selected_rows_df], ignore_index=True)
    return final_df


final_df = new_func(case_numbers)



In [17]:
final_df.to_csv('data/final_df_2024_round1.csv')

In [19]:
len(final_df)

247