In [11]:
import os
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI


# Read secrets and connection settings from the local .env file.
load_dotenv()

# API key handed to the Gemini LLM wrappers further down.
google_api_key = os.getenv('GOOGLE_API_KEY')

# Credentials passed to Neo4jVector when querying the graph.
url, username, password = (
    os.getenv(name)
    for name in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
)

# Connection string later assigned to neomodel's config.DATABASE_URL.
neo4j_url = os.getenv('NEO4J_URI_0')

# Name of the Neo4j database used by this notebook.
database = 'newvectordb2'

In [28]:
# Names of every extracted case field (one entry per CaseInfo field).
# This is a plain list: the original trailing comma after the closing bracket
# turned it into a 1-tuple wrapping the list, and "case_no" was listed twice.
text_node_properties = [
    "court_name",
    "court_abbreviation",
    "case_name",
    "result",
    "case_no",
    "overruled",
    "overruled_by",
    "reportable",
    "petitioner",
    "case_type",
    "respondent",
    "bench",
    "coram",
    "dated",
    "petitioner_counsel",
    "respondent_counsel",
    "act",
    "petitioners_arguments",
    "courts_reasoning",
    "summary",
    "evidence",
    "respondents_arguments",
    "issues",
    "facts",
    "conclusion",
    "legal_analysis",
    "precedent_analysis",
    "cited",
    "citations",
    "case_referred",
    "keywords",
    "headnotes",
]

In [12]:
# Module-level Gemini embedding client using the "models/embedding-001" model.
# NOTE(review): embed_text() and embedding() each build their own client with
# the same model name instead of using this one.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [30]:
def flatten_json(data, parent_key='', sep=': '):
    """Render a nested mapping as one comma-separated ``key<sep>value`` string.

    Nested dictionaries are expanded recursively, with the enclosing keys
    prefixed to each inner key (joined by ", "). List values are joined with
    ", ". Any other value is formatted as-is.
    """
    def render(key, value):
        # Top-level keys stand alone; nested keys carry their parent path.
        path = f"{key}" if parent_key == '' else f"{parent_key}, {key}"
        if isinstance(value, dict):
            return flatten_json(value, path, sep=sep)
        if isinstance(value, list):
            value = ', '.join(map(str, value))
        return f"{path}{sep}{value}"

    return ', '.join(render(key, value) for key, value in data.items())

In [1]:
from pathlib import Path

from neomodel import (
    StructuredNode,
    StringProperty,
    RelationshipTo,
    RelationshipFrom,
    VectorIndex,
    ArrayProperty,
)

import os
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain.prompts import PromptTemplate
from langchain_google_genai import GoogleGenerativeAI
from neomodel import db, config
from langchain_community.document_loaders import BSHTMLLoader

from neomodel.exceptions import UniqueProperty

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_html_files(file):
    """Load one HTML file with BSHTMLLoader.

    Returns whatever the loader's ``load()`` produces. If anything goes wrong
    the error is printed and ``None`` is returned instead of raising.
    """
    try:
        documents = BSHTMLLoader(file).load()
        print(f'file: {file} loaded successfully...')
    except Exception as err:
        print('error in loading file ', err)
        return None
    return documents

In [3]:
# Define the CaseInfo Pydantic model
# Schema of the fields extracted from a single judgment; htm_to_json hands it
# to JsonOutputParser. Only case_name and case_no have no default; every other
# field defaults to None.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic copies a model's docstring into its generated schema.
class CaseInfo(BaseModel):
    # --- identity and outcome ---
    case_name: str = Field(description="The name of the case")
    case_no: str = Field(description="The case number")
    result: str = Field(default=None, description="The result of the case")
    overruled: bool = Field(
        default=None, description="Whether the case was overruled")
    overruled_by: str = Field(
        default=None, description="The case that overruled this case")

    # --- court ---
    court_name: str = Field(default=None, description="The name of the court")
    court_abbreviation: str = Field(
        default=None, description="Abbreviation of the court name")

    # --- parties, bench and procedural details ---
    case_type: str = Field(default=None, description="The type of the case")
    petitioner: str = Field(
        default=None, description="The petitioner in the case")
    respondent: str = Field(
        default=None, description="The respondent in the case")
    coram: str = Field(
        default=None, description="The coram (judicial bench) of the case")
    petitioner_counsel: list = Field(
        default=None, description="The counsel for the petitioner")
    respondent_counsel: list = Field(
        default=None, description="The counsel for the respondent")
    act: list = Field(
        default=None, description="The relevant act or legislation")
    bench: list = Field(default=None, description="The bench of the court")
    dated: str = Field(default=None, description="The date of the case")
    reportable: bool = Field(
        default=None, description="Whether the case is reportable")

    # --- narrative / analysis ---
    evidence: list = Field(
        default=None, description="The evidence presented in the case")
    conclusion: str = Field(
        default=None, description="The conclusion of the case")
    courts_reasoning: str = Field(
        default=None, description="The court's reasoning in the case")
    precedent_analysis: str = Field(
        default=None, description="The analysis of precedents")
    legal_analysis: str = Field(
        default=None, description="The legal analysis of the case")
    respondents_arguments: str = Field(
        default=None, description="The arguments of the respondent")
    petitioners_arguments: str = Field(
        default=None, description="The arguments of the petitioner")
    issues: str = Field(
        default=None, description="The issues addressed in the case")
    facts: str = Field(default=None, description="The facts of the case")
    summary: str = Field(default=None, description="The summary of the case")

    # --- citations and indexing ---
    citations: list = Field(
        default=None,
        description="The citations or case no/names related to the case",
    )
    cited: list = Field(
        default=None,
        description="Other cases cited in this case, that case name/no",
    )
    headnotes: str = Field(default=None, description="Headnotes of the case")
    case_referred: list = Field(
        default=None, description="Other cases referred in this case")
    keywords: list = Field(
        default=None, description="Keywords describing the case")


# Template for asking the LLM to extract information.
# Placeholders:
#   {format_instructions} - output-format instructions from the JSON parser
#                           (htm_to_json supplies them as a partial variable;
#                           the template previously had no slot for them, so
#                           they were never sent to the model)
#   {document_text}       - the judgment text to extract from
prompt_template = """
You are tasked with extracting legal case information from the document below.
Extract the following fields:
- case_name: (must required)
- case_no: (must required)
- result:
- overruled: (Yes or No)
- overruled_by:
- court_name:
- court_abbreviation:
- case_type:
- petitioner:
- respondent:
- coram:
- petitioner_counsel: []
- respondent_counsel: []
- act: []
- bench: []
- dated: (must required)
- reportable:
- evidence: []
- conclusion:
- courts_reasoning:
- precedent_analysis:
- legal_analysis:
- respondents_arguments:
- petitioners_arguments:
- issues:
- facts:
- summary: (should point out all key aspects of the case)
- citations: []
- cited: []
- headnotes:
- case_referred: []
- keywords:[]

If a field is not found, return null for that field except case name, case no, \
dated. you must find case no and case year for dated and case name.

{format_instructions}

Document:
{document_text}
"""

In [4]:
def flatten_json(data, parent_key='', sep=': '):
    """Collapse a (possibly nested) dict into a single descriptive string.

    Each leaf becomes ``<key path><sep><value>``; key paths of nested dicts
    are the ancestor keys joined by ", ", list values are joined by ", ",
    and all resulting pieces are joined by ", ".

    NOTE(review): an identical function is defined in an earlier cell of this
    notebook; this later definition is the one in effect after it runs.
    """
    prefix = '' if parent_key == '' else f"{parent_key}, "
    pieces = []
    for name, item in data.items():
        path = f"{prefix}{name}"
        if isinstance(item, dict):
            # Descend, carrying the accumulated key path.
            pieces.append(flatten_json(item, path, sep=sep))
            continue
        if isinstance(item, list):
            item = ', '.join(str(element) for element in item)
        pieces.append(f"{path}{sep}{item}")
    return ', '.join(pieces)


def embedding(file):
    """Flatten a JSON-like mapping to text and embed that text.

    Returns a ``(text, vector)`` pair: the string produced by
    ``flatten_json`` and the result of ``embed_query`` on it.
    """
    client = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    flattened = flatten_json(file)
    return flattened, client.embed_query(flattened)


def htm_to_json(html_document):
    """Extract structured case information from a loaded judgment via the LLM.

    Args:
        html_document: Content interpolated into the ``{document_text}`` slot
            of ``prompt_template`` (the ingestion loop passes the result of
            ``load_html_files``).

    Returns:
        dict: The JSON object parsed from the model's reply. ``case_no`` is
        replaced by ``case_name`` when it is missing or empty.

    Raises:
        ValueError: If ``html_document`` is ``None``/empty, if the reply does
            not parse to a JSON object, or if neither ``case_no`` nor
            ``case_name`` was extracted.
    """
    # A failed load yields None; do not ask the model to "extract" from it.
    if not html_document:
        raise ValueError('no document content to convert')

    # Define the JSON output parser based on the CaseInfo model
    parser = JsonOutputParser(pydantic_object=CaseInfo)

    # Create a prompt template
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["document_text"],
        partial_variables={
            "format_instructions": parser.get_format_instructions()
        },
    )

    # Initialize the LLM
    llm = GoogleGenerativeAI(
        model="models/gemini-1.5-flash-latest", google_api_key=google_api_key
    )

    # Create the LLM chain
    chain = prompt | llm | parser

    response = chain.invoke({"document_text": html_document})
    if not isinstance(response, dict):
        raise ValueError(
            f'expected a JSON object from the model, got {type(response).__name__}'
        )
    print('converted into json successfully')

    # Fall back to the case name as identifier; use .get() because the model
    # may omit a key entirely rather than returning null for it.
    if not response.get('case_no'):
        fallback = response.get('case_name')
        if not fallback:
            raise ValueError('model returned neither case_no nor case_name')
        response['case_no'] = fallback

    return response


In [5]:
# Case Node
class Case(StructuredNode):
    """Graph node holding a case's identifying fields and outcome.

    ``create_case_graph`` fills ``text`` and ``vector`` with the case summary
    string and its embedding returned by ``get_text_embed``.
    """
    case_name = StringProperty()
    case_no = StringProperty(required=True)  # the only mandatory property
    result = StringProperty()
    overruled = StringProperty()  # create_case_graph stores str(...) of the extracted value
    overruled_by = StringProperty()
    text = StringProperty()
    vector = ArrayProperty()

    # Outgoing links. The three data relationships share the type "Has_data".
    appealed_case = RelationshipTo("AppealedCase", "Appeal_to")
    court_data = RelationshipTo("CourtData", "Has_data")
    summary_data = RelationshipTo("SummaryData", "Has_data")
    citations_data = RelationshipTo("CitationsData", "Has_data")


# Court Node
class Court(StructuredNode):
    """Graph node for a court, linked to the cases it decided.

    ``create_case_graph`` connects a Court to each Case through ``cases``.
    """
    court_name = StringProperty()
    court_abbreviation = StringProperty()
    text = StringProperty()
    vector = ArrayProperty()

    # Both relationships use the type "Is_case".
    cases = RelationshipTo("Case", "Is_case")
    appealed_cases = RelationshipTo("AppealedCase", "Is_case")


# AppealedCase Node
class AppealedCase(StructuredNode):
    """Graph node for a case that another Case was appealed to.

    Carries the same scalar properties as ``Case``.
    """
    case_name = StringProperty()
    case_no = StringProperty(required=True)
    result = StringProperty()
    overruled = StringProperty()
    overruled_by = StringProperty()
    text = StringProperty()
    vector = ArrayProperty()

    # Reverse side of Case.appealed_case. The relationship type must match
    # the one declared there ("Appeal_to"); with the previous "Appeal_from"
    # this accessor would never see links created via Case.appealed_case.
    original_case = RelationshipFrom("Case", "Appeal_to")


# CourtData Node
class CourtData(StructuredNode):
    """Graph node with the parties, bench and procedural details of a case.

    Linked from ``Case.court_data`` and cross-linked to the case's
    SummaryData and CitationsData nodes via "Neighbouring_data".
    """
    case_type = StringProperty()
    petitioner = StringProperty()
    respondent = StringProperty()
    coram = StringProperty()
    # NOTE(review): the next four are typed `list` in CaseInfo but declared as
    # strings here — confirm how the list values passed by create_case_graph
    # end up being stored.
    petitioner_counsel = StringProperty()
    respondent_counsel = StringProperty()
    act = StringProperty()
    bench = StringProperty()
    dated = StringProperty()
    reportable = StringProperty()  # create_case_graph stores str(...) of the extracted value
    case_no = StringProperty(required=True)
    text = StringProperty()
    vector = ArrayProperty()

    neighbouring_data = RelationshipTo("CitationsData", "Neighbouring_data")
    neighbouring_summary = RelationshipTo("SummaryData", "Neighbouring_data")


# SummaryData Node
class SummaryData(StructuredNode):
    """Graph node with the narrative and analysis fields of a case.

    Linked from ``Case.summary_data`` and cross-linked to the case's
    CourtData and CitationsData nodes via "Neighbouring_data".
    """
    case_no = StringProperty(required=True)
    # NOTE(review): `evidence` is typed `list` in CaseInfo but declared as a
    # string here — confirm how the list value is stored.
    evidence = StringProperty()
    conclusion = StringProperty()
    courts_reasoning = StringProperty()
    precedent_analysis = StringProperty()
    legal_analysis = StringProperty()
    respondents_arguments = StringProperty()
    petitioners_arguments = StringProperty()
    issues = StringProperty()
    facts = StringProperty()
    summary = StringProperty()
    text = StringProperty()
    vector = ArrayProperty()

    neighbouring_court = RelationshipTo("CourtData", "Neighbouring_data")
    neighbouring_citations = RelationshipTo(
        "CitationsData", "Neighbouring_data")


# CitationsData Node
class CitationsData(StructuredNode):
    """Graph node with the citation, headnote and keyword fields of a case.

    Linked from ``Case.citations_data`` and cross-linked to the case's
    SummaryData and CourtData nodes via "Neighbouring_data".
    """
    case_no = StringProperty(required=True)
    # NOTE(review): citations, cited, case_referred and keywords are typed
    # `list` in CaseInfo but declared as strings here — confirm how the list
    # values are stored.
    citations = StringProperty()
    cited = StringProperty()
    headnotes = StringProperty()
    case_referred = StringProperty()
    keywords = StringProperty()
    text = StringProperty()
    vector = ArrayProperty()

    neighbouring_summary = RelationshipTo("SummaryData", "Neighbouring_data")
    neighbouring_court = RelationshipTo("CourtData", "Neighbouring_data")


In [6]:
def embed_text(text):
    """Return the "models/embedding-001" query embedding of ``text``.

    A new embeddings client is constructed on every call.
    """
    return GoogleGenerativeAIEmbeddings(
        model="models/embedding-001"
    ).embed_query(text)

In [7]:
def get_text_embed(case_info):
    """Build the summary text and embedding for each node of one case.

    Args:
        case_info: Mapping of extracted case fields (every key read below
            must be present).

    Returns:
        tuple: ``(court_text, court_vector, case_text, case_vector,
        court_data_text, court_data_vector, summary_data_text,
        summary_data_vector, citations_data_text, citations_data_vector)``.
        Each vector is the embedding of the text that precedes it.

    Raises:
        KeyError: If ``case_info`` lacks one of the fields read here.
    """
    court_text = f"""court name: {case_info["court_name"]}, 
        court abbreviation: {case_info["court_abbreviation"]}"""
    court_vector = embed_text(court_text)

    case_text = f"""case_name= {case_info["case_name"]},
        case_no= {case_info["case_no"]},
        result= {case_info["result"]},
        overruled= {str(case_info["overruled"])},
        overruled_by= {case_info["overruled_by"]},"""
    case_vector = embed_text(case_text)

    court_data_text = f"""case_no= {case_info["case_no"]},
        case_type= {case_info["case_type"]},
        petitioner= {case_info["petitioner"]},
        respondent= {case_info["respondent"]},
        coram= {case_info["coram"]},
        petitioner_counsel= {case_info["petitioner_counsel"]},
        respondent_counsel= {case_info["respondent_counsel"]},
        act= {case_info["act"]},
        bench= {case_info["bench"]},
        dated= {case_info["dated"]},
        reportable= {str(case_info["reportable"])},"""
    # Embed this node's own text (previously case_text was embedded here).
    court_data_vector = embed_text(court_data_text)

    summary_data_text = f"""case_no= {case_info["case_no"]},
        evidence= {case_info["evidence"]},
        conclusion= {case_info["conclusion"]},
        courts_reasoning= {case_info["courts_reasoning"]},
        precedent_analysis= {case_info["precedent_analysis"]},
        legal_analysis= {case_info["legal_analysis"]},
        respondents_arguments= {case_info["respondents_arguments"]},
        petitioners_arguments= {case_info["petitioners_arguments"]},
        issues= {case_info["issues"]},
        facts= {case_info["facts"]},
        summary= {case_info["summary"]},"""
    # Embed this node's own text (previously case_text was embedded here).
    summary_data_vector = embed_text(summary_data_text)

    citations_data_text = f"""case_no= {case_info["case_no"]},
        citations= {case_info["citations"]},
        cited= {case_info["cited"]},
        headnotes= {case_info["headnotes"]},
        case_referred= {case_info["case_referred"]},
        keywords= {case_info["keywords"]},"""
    # Embed this node's own text (previously case_text was embedded here).
    citations_data_vector = embed_text(citations_data_text)

    return (
        court_text, court_vector,
        case_text, case_vector,
        court_data_text, court_data_vector,
        summary_data_text, summary_data_vector,
        citations_data_text, citations_data_vector,
    )

In [8]:
# Point neomodel at the database named by the NEO4J_URI_0 environment variable.
# NOTE(review): os.getenv returns None when the variable is unset — confirm it
# is defined in the .env file before running the ingestion cells.
neo4j_url = os.getenv("NEO4J_URI_0")
config.DATABASE_URL = neo4j_url

In [9]:
# Function to create or get a node (avoiding duplication)
def get_or_create_node(model, **properties):
    """Return the node of ``model`` matching ``properties``, creating it if absent.

    Args:
        model: Node class exposing ``nodes.get``, ``DoesNotExist`` and a
            keyword constructor whose instances have ``save()``.
        **properties: Property values used both for the lookup and, when no
            match exists, to create the node.

    Returns:
        The existing or newly saved node, or ``None`` when saving is rejected
        with ``UniqueProperty``.
    """
    try:
        return model.nodes.get(**properties)
    except model.DoesNotExist:
        pass  # nothing matched; fall through and create the node

    # The save gets its own try block: an exception raised inside an `except`
    # handler is not caught by sibling handlers, so the original
    # `except UniqueProperty` could never catch a failure from save().
    try:
        return model(**properties).save()
    except UniqueProperty:
        return None


# Function to create nodes and relationships from extracted JSON data
def create_case_graph(case_info):
    """Write one extracted case into the graph as a set of linked nodes.

    Creates (or fetches) Court, Case, CourtData, SummaryData and CitationsData
    nodes from ``case_info``, attaches the text/embedding pairs returned by
    ``get_text_embed``, connects Court -> Case and Case -> each data node, and
    cross-links the three data nodes in both directions. A relationship is
    only created when both of its end nodes were obtained.
    """
    (
        court_text, court_vector,
        case_text, case_vector,
        court_data_text, court_data_vector,
        summary_data_text, summary_data_vector,
        citations_data_text, citations_data_vector,
    ) = get_text_embed(case_info)

    def pick(*names):
        # Copy the named entries of case_info into keyword arguments.
        return {name: case_info[name] for name in names}

    # Court node
    court = get_or_create_node(
        Court,
        **pick("court_name", "court_abbreviation"),
        text=court_text,
        vector=court_vector,
    )

    # Case node (the `overruled` flag is stored in string form)
    case = get_or_create_node(
        Case,
        **pick("case_name", "case_no", "result"),
        overruled=str(case_info["overruled"]),
        overruled_by=case_info["overruled_by"],
        text=case_text,
        vector=case_vector,
    )

    # Court -[Is_case]-> Case
    if court and case:
        court.cases.connect(case)

    # CourtData node (the `reportable` flag is stored in string form)
    court_data = get_or_create_node(
        CourtData,
        **pick(
            "case_no", "case_type", "petitioner", "respondent", "coram",
            "petitioner_counsel", "respondent_counsel", "act", "bench",
            "dated",
        ),
        reportable=str(case_info["reportable"]),
        text=court_data_text,
        vector=court_data_vector,
    )

    # Case -[Has_data]-> CourtData
    if case and court_data:
        case.court_data.connect(court_data)

    # SummaryData node
    summary_data = get_or_create_node(
        SummaryData,
        **pick(
            "case_no", "evidence", "conclusion", "courts_reasoning",
            "precedent_analysis", "legal_analysis", "respondents_arguments",
            "petitioners_arguments", "issues", "facts", "summary",
        ),
        text=summary_data_text,
        vector=summary_data_vector,
    )

    # Case -[Has_data]-> SummaryData
    if case and summary_data:
        case.summary_data.connect(summary_data)

    # CitationsData node
    citations_data = get_or_create_node(
        CitationsData,
        **pick(
            "case_no", "citations", "cited", "headnotes", "case_referred",
            "keywords",
        ),
        text=citations_data_text,
        vector=citations_data_vector,
    )

    # Case -[Has_data]-> CitationsData
    if case and citations_data:
        case.citations_data.connect(citations_data)

    # Neighbouring_data links between the data nodes, one in each direction:
    # (first node, its relationship to the second, second node, its
    # relationship back to the first).
    neighbour_links = (
        (citations_data, "neighbouring_court",
         court_data, "neighbouring_data"),
        (citations_data, "neighbouring_summary",
         summary_data, "neighbouring_citations"),
        (summary_data, "neighbouring_court",
         court_data, "neighbouring_summary"),
    )
    for first, first_relation, second, second_relation in neighbour_links:
        if first and second:
            getattr(first, first_relation).connect(second)
            getattr(second, second_relation).connect(first)

    print(f"Graph data for {case_info['case_name']} created successfully.")


In [13]:
# Ingest every .htm judgment in the data directory: load -> extract -> graph.
file_dir_path = './html_data2'
# Sorted so files are processed in a reproducible order.
html_files = sorted(Path(file_dir_path).glob('*.htm'))
for file in html_files:
    try:
        html_data = load_html_files(file)
        if not html_data:
            # load_html_files already reported the failure.
            continue
        json_data = htm_to_json(html_data)
        print('json_data done')
    except Exception as e:
        print(f'Error occurred while extracting {file}: {e}')
        # Skip graph creation: json_data would otherwise be undefined on the
        # first file, or still hold the previous file's data.
        continue

    try:
        create_case_graph(json_data)
    except Exception as e:
        print(f'Error occurred while creating graph {file}: {e}')

file: html_data2\1197362.htm loaded successfully...
converted into json successfully
json_data done
Graph data for State of Bihar v. Ranjeet Choudhary & Ors. created successfully.
file: html_data2\1197363.htm loaded successfully...
converted into json successfully
json_data done
Graph data for Cr.W.J.C. No. 657 of 2016 created successfully.
file: html_data2\1197364.htm loaded successfully...
converted into json successfully
json_data done
Graph data for Sasaram Bhabua Central Co-operative Bank Limited v. Assistant Provident Fund Commissioner created successfully.
file: html_data2\1197365.htm loaded successfully...
converted into json successfully
json_data done
Graph data for Nagma Parveen v. State of Bihar created successfully.
file: html_data2\1197366.htm loaded successfully...
converted into json successfully
json_data done
Graph data for State of Bihar v. Sanjay Kumar created successfully.
file: html_data2\1197367.htm loaded successfully...
converted into json successfully
json_dat

In [37]:
from langchain_community.vectorstores import Neo4jVector

# Now we initialize from existing graph.
# The embedding client and database name come from the configuration cells at
# the top of the notebook instead of being repeated here as literals.
existing_graph = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    url=url,
    username=username,
    password=password,
    database=database,
    node_label="Case",
    text_node_properties=["text", "case_no", "case_name", "result", "overruled", "overruled_by"],
    embedding_node_property="vector",
)

In [38]:
# Sanity check: fetch the single closest Case node (k=1) for a known case name.
result = existing_graph.similarity_search("State of Bihar v. Ranjeet Choudhary & Ors", k=1)
print(result)

[Document(page_content='\ntext: case_name= State of Bihar v. Ranjeet Choudhary & Ors.,\n        case_no= Cr. Appeal (DB) No. 117 of 2013 & Cr. Appeal (DB) No. 19 of 2013,\n        result= Dismissed,\n        overruled= No,\n        overruled_by= None,\ncase_no: Cr. Appeal (DB) No. 117 of 2013 & Cr. Appeal (DB) No. 19 of 2013\ncase_name: State of Bihar v. Ranjeet Choudhary & Ors.\nresult: Dismissed\noverruled: No\noverruled_by: ')]


In [39]:
# Conversational question answering over the Case vector index; chat history
# is kept in memory under the "chat_history" key.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")
qa = ConversationalRetrievalChain.from_llm(
    llm=ChatGoogleGenerativeAI(
        model='models/gemini-1.5-flash-latest',
        api_key=google_api_key,
        temperature=0,
    ),
    retriever=existing_graph.as_retriever(),
    memory=memory,
)

In [41]:
question = "summary of the case State of Bihar v. Ranjeet Choudhary & Ors"
# Use .invoke(), as the extraction chain in htm_to_json does; calling a chain
# object directly is deprecated in LangChain.
ans = qa.invoke({"question": question})["answer"]

print(ans)

The case State of Bihar v. Ranjeet Choudhary & Ors. (Cr. Appeal (DB) No. 117 of 2013 & Cr. Appeal (DB) No. 19 of 2013) was dismissed. 

