## Automated pipeline for Sri Lankan Appeal Court Judgemnt Extraction


### Initial Imports

In [1]:
# !python3 -m pip install --upgrade pip

In [2]:
# %pip install crewai crewai-tools python-dotenv

In [3]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

import os
from dotenv import load_dotenv, find_dotenv

import os
import yaml
from crewai import Agent, Task, Crew

In [4]:
# these expect to find a .env file at the directory above the lesson.                                                                                                                     # the format for that file is (without the comment)                                                                                                                                       #API_KEYNAME=AStringThatIsTheLongAPIKeyFromSomeService                                                                                                                                     
def load_env():
    _ = load_dotenv(find_dotenv())

def get_openai_api_key():
    load_env()
    openai_api_key = os.getenv("OPENAI_API_KEY")
    return openai_api_key

### Set OpenAI Model

In [5]:
os.environ['OPENAI_MODEL_NAME'] = 'gpt-4o-mini'

### Loading Tasks and Agents YAML files

In [6]:
# Define file paths for YAML configurations
files = {
    'agents': 'config/agents.yaml',
    'tasks': 'config/tasks.yaml'
}

# Load configurations from YAML files
configs = {}
for config_type, file_path in files.items():
    with open(file_path, 'r') as file:
        configs[config_type] = yaml.safe_load(file)

# Assign loaded configurations to specific variables
agents_config = configs['agents']
tasks_config = configs['tasks']

### Create Pydantic Models for Structured Output

In [7]:
from typing import List
from pydantic import BaseModel, Field

class SummaryJson(BaseModel):
    summary: str = Field(..., title="Summary", description="The summary of the leagl document text.")

class ClassifiedSummaryJson(BaseModel):
    summary: str = Field(..., title="Summary", description="The summary of the legal document text.")
    classification: str = Field(title="Classification", description="The classification of the legal document into commercial or civil", pattern="^(civil|commercial)$")

class CatagorizedDataJson(BaseModel):
    summary: str = Field(..., title="Summary", description="The summary of the legal document text.")
    classification: str = Field(title="Classification", description="The classification of the legal document into commercial or civil", pattern="^(civil|commercial)$")
    category: str = Field(title="Category", description="The category of the legal document into 'Intellectual Property', 'Contract Dispute', 'Employment Law'")


In [8]:
class TaskEstimate(BaseModel):
    task_name: str = Field(..., description="Name of the task")
    estimated_time_hours: float = Field(..., description="Estimated time to complete the task in hours")
    required_resources: List[str] = Field(..., description="List of resources required to complete the task")

class Milestone(BaseModel):
    milestone_name: str = Field(..., description="Name of the milestone")
    tasks: List[str] = Field(..., description="List of task IDs associated with this milestone")

class ProjectPlan(BaseModel):
    tasks: List[TaskEstimate] = Field(..., description="List of tasks with their estimates")
    milestones: List[Milestone] = Field(..., description="List of project milestones")

In [9]:
from crewai_tools import FileReadTool

# Initialize the tool to read any files the agents knows or lean the path for
file_read_tool = FileReadTool()

In [10]:
from crewai_tools import FileWriterTool

# Initialize the tool
file_writer_tool = FileWriterTool()

### Create Crew, Agents and Tasks

In [None]:
# Creating Agents
summarizer_agent = Agent(
  config=agents_config['summarizer_agent']
)

categorizer_agent = Agent(
  config=agents_config['categorizer_agent']
)

filter_agent = Agent(
  config=agents_config['filter_agent']
)

classifier_agent = Agent(
  config=agents_config['classifier_agent']
)

# Creating Tasks
summarization_task = Task(
  config=tasks_config['summarization_task'],
  agent=summarizer_agent,
  tools=[file_read_tool],
  output_pydantic=SummaryJson
)

categorization_task = Task(
  config=tasks_config['categorization_task'],
  agent=categorizer_agent,
  output_pydantic=ClassifiedSummaryJson
)

filtering_task = Task(
  config=tasks_config['filtering_task'],
  agent=filter_agent,
  output_pydantic=ClassifiedSummaryJson
)

classification_task = Task(
  config=tasks_config['classification_task'],
  agent=classifier_agent,
  tools=[file_writer_tool],
  output_pydantic=CatagorizedDataJson
)

# Creating Crew
crew = Crew(
  agents=[
    summarizer_agent,
    categorizer_agent,
    filter_agent,
    classifier_agent
  ],
  tasks=[
    summarization_task,
    categorization_task,
    filtering_task,
    classification_task
  ],
  verbose=True
)

### Crew's Inputs

In [19]:
file_path = 'downloads/txt_files/2024/01/139_21_pdf.txt'

In [13]:
# from IPython.display import display, Markdown

# project = 'Website'
# industry = 'Technology'
# project_objectives = 'Create a website for a small business'
# team_members = """
# - John Doe (Project Manager)
# - Jane Doe (Software Engineer)
# - Bob Smith (Designer)
# - Alice Johnson (QA Engineer)
# - Tom Brown (QA Engineer)
# """
# project_requirements = """
# - Create a responsive design that works well on desktop and mobile devices
# - Implement a modern, visually appealing user interface with a clean look
# - Develop a user-friendly navigation system with intuitive menu structure
# - Include an "About Us" page highlighting the company's history and values
# - Design a "Services" page showcasing the business's offerings with descriptions
# - Create a "Contact Us" page with a form and integrated map for communication
# - Implement a blog section for sharing industry news and company updates
# - Ensure fast loading times and optimize for search engines (SEO)
# - Integrate social media links and sharing capabilities
# - Include a testimonials section to showcase customer feedback and build trust
# """

# # Format the dictionary as Markdown for a better display in Jupyter Lab
# formatted_output = f"""
# **Project Type:** {project}

# **Project Objectives:** {project_objectives}

# **Industry:** {industry}

# **Team Members:**
# {team_members}
# **Project Requirements:**
# {project_requirements}
# """
# # Display the formatted output as Markdown
# display(Markdown(formatted_output))

### Kicking off the crew


In [14]:
# # The given Python dictionary
# inputs = {
#   'project_type': project,
#   'project_objectives': project_objectives,
#   'industry': industry,
#   'team_members': team_members,
#   'project_requirements': project_requirements
# }

# # Run the crew
# result = crew.kickoff(
#   inputs=inputs
# )

In [20]:
inputs = {
  'file_path': file_path
}

# Run the crew
result = crew.kickoff(
  inputs=inputs
)

[1m[95m# Agent:[00m [1m[92mLegal Summarizer[00m
[95m## Task:[00m [92mSummarize the given legal judgment. Your summary should capture key details succinctly. Then export the summarized text into a json file. the file to be read is downloads/txt_files/2024/01/139_21_pdf.txt.
[00m


[1m[95m# Agent:[00m [1m[92mLegal Summarizer[00m
[95m## Thought:[00m [92mI need to read the content of the legal judgment from the specified file path to summarize it accurately.[00m
[95m## Using tool:[00m [92mRead a file's content[00m
[95m## Tool Input:[00m [92m
"{\"file_path\": \"downloads/txt_files/2024/01/139_21_pdf.txt\"}"[00m
[95m## Tool Output:[00m [92m
Page 1 of 7 
 
IN THE COURT OF APPEAL OF THE DEMOCRATIC SOCIALIST REPUBLIC OF 
SRI LANKA 
 
In the matter of an application for mandates in 
the nature of Writs of Certiorari and Mandamus 
under and in terms of Article 140 of the 
Constitution.  
1. Ajith Siyambalapitiya 
Honorary Secretary 
2. Lahiru Silva 
Sports Club Mana

In [24]:
result.pydantic

CatagorizedDataJson(summary='The case CA/WRIT/139/2021 pertains to a civil dispute regarding club membership and tournament participation.', classification='civil', category='Other')

In [26]:
import pandas as pd

# Directory containing TXT files and the path to the CSV file
txt_directory = "downloads/txt_files/2024/01/"
output_csv = "output.csv"

# List of all TXT files in the directory
txt_files = [os.path.join(txt_directory, f) for f in os.listdir(txt_directory) if f.endswith(".txt")]
print(txt_files)

# Create an empty CSV file if it doesn't exist
if not os.path.exists(output_csv):
    pd.DataFrame().to_csv(output_csv, index=False)  # Start with an empty DataFrame

['downloads/txt_files/2024/01/ca_wrt_0635_21_pdf.txt', 'downloads/txt_files/2024/01/hcc_0384_18_final_judgement_pdf.txt', 'downloads/txt_files/2024/01/ca_writ_87_22_pdf.txt', 'downloads/txt_files/2024/01/writ_138_20_pdf.txt', 'downloads/txt_files/2024/01/writ_345_21_pdf.txt', 'downloads/txt_files/2024/01/ca_wrt_0395_19_and_ca_wrt_0126_20_pdf.txt', 'downloads/txt_files/2024/01/cpa_0132_23_final_judgement_pdf.txt', 'downloads/txt_files/2024/01/rii_03_2017_judgment_summary_pdf.txt', 'downloads/txt_files/2024/01/hcc_0056_21_final_judgement_pdf.txt', 'downloads/txt_files/2024/01/ca_writ_789_23_pdf.txt', 'downloads/txt_files/2024/01/ca_phc_0038_17_final_judgement_pdf.txt', 'downloads/txt_files/2024/01/wrt_0471_19_pdf.txt', 'downloads/txt_files/2024/01/ca_writ_0591_21_pdf.txt', 'downloads/txt_files/2024/01/ca_writ_464_21_pdf.txt', 'downloads/txt_files/2024/01/139_21_pdf.txt', 'downloads/txt_files/2024/01/ca_161_2018_pdf.txt', 'downloads/txt_files/2024/01/ca_writ_0451_20_pdf.txt', 'downloads/t

In [28]:
import os
import pandas as pd

# Define the columns for the DataFrame
columns = ['summary', 'classification', 'category']

# Iteratively process each TXT file
for txt_file in txt_files:
    print(f"Processing TXT file: {txt_file}")

    with open(txt_file, 'r') as file:
        txt_data = file.read()

    # Execute the Crew with the TXT data as input
    result = crew.kickoff(inputs={'file_path': txt_file})

    # Extract the pydantic data
    pydantic_data = result.pydantic

    # Convert the pydantic data to a dictionary
    data_dict = {
        'summary': pydantic_data.summary,
        'classification': pydantic_data.classification,
        'category': pydantic_data.category
    }

    # Check if the CSV file exists and is not empty
    if os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
        # Load the existing CSV file
        existing_df = pd.read_csv(output_csv)
    else:
        # Create an empty DataFrame with the defined columns
        existing_df = pd.DataFrame(columns=columns)

    # Append the new result to the DataFrame
    new_data = pd.DataFrame([data_dict])  # Convert the dictionary to a DataFrame
    updated_df = pd.concat([existing_df, new_data], ignore_index=True)

    # Save the updated DataFrame back to the CSV file
    updated_df.to_csv(output_csv, index=False)

print(f"All TXT files have been processed and results saved to {output_csv}.")

Processing TXT file: downloads/txt_files/2024/01/ca_wrt_0635_21_pdf.txt
[1m[95m# Agent:[00m [1m[92mLegal Summarizer[00m
[95m## Task:[00m [92mSummarize the given legal judgment. Your summary should capture key details succinctly. Then export the summarized text into a json file. the file to be read is downloads/txt_files/2024/01/ca_wrt_0635_21_pdf.txt.
[00m


[1m[95m# Agent:[00m [1m[92mLegal Summarizer[00m
[95m## Thought:[00m [92mI need to read the content of the specified file to extract the legal judgment details for summarization.[00m
[95m## Using tool:[00m [92mRead a file's content[00m
[95m## Tool Input:[00m [92m
"{\"file_path\": \"downloads/txt_files/2024/01/ca_wrt_0635_21_pdf.txt\"}"[00m
[95m## Tool Output:[00m [92m
C.A. WRIT NO. 635-21 
 
 1 
 
IN THE COURT OF APPEAL OF THE  
DEMOCRATIC SOCIALIST REPUBLIC OF SRI LANKA 
 
In the matter of an application for  Writs 
of Certiorari and Mandamus under and in 
terms of Article 140 of the Constitution of 


### Usage Metrics and Costs

##### Let’s see how much it would cost each time if this crew runs at scale.


In [12]:
import pandas as pd

costs = 0.150 * (crew.usage_metrics.prompt_tokens + crew.usage_metrics.completion_tokens) / 1_000_000
print(f"Total costs: ${costs:.4f}")

# Convert UsageMetrics instance to a DataFrame
df_usage_metrics = pd.DataFrame([crew.usage_metrics.dict()])
df_usage_metrics

Total costs: $0.0010


Unnamed: 0,total_tokens,prompt_tokens,completion_tokens,successful_requests
0,6719,5132,1587,19


### Result


In [13]:
result.pydantic.dict()

{'summary': 'Contract Dispute refers to a disagreement between parties regarding the terms, execution, or obligations established in a contract. It typically arises when one party believes that the other party has not fulfilled their contractual duties. These disputes can involve various issues, including non-payment, quality of work, delays, and interpretation of contract terms. Resolution may involve negotiations, mediation, arbitration, or litigation.',
 'classification': 'commercial',
 'category': 'Contract Dispute'}

### Inspect further


In [14]:
tasks = result.pydantic.dict()['tasks']
df_tasks = pd.DataFrame(tasks)

# Display the DataFrame as an HTML table
df_tasks.style.set_table_attributes('border="1"').set_caption("Task Details").set_table_styles(
    [{'selector': 'th, td', 'props': [('font-size', '120%')]}]
)

KeyError: 'tasks'

#### Inspecting Milestones


In [17]:
milestones = result.pydantic.dict()['milestones']
df_milestones = pd.DataFrame(milestones)

# Display the DataFrame as an HTML table
df_milestones.style.set_table_attributes('border="1"').set_caption("Task Details").set_table_styles(
    [{'selector': 'th, td', 'props': [('font-size', '120%')]}]
)

Unnamed: 0,milestone_name,tasks
0,Design Completion,"['Create a Responsive Design', 'Create ""About Us"" Page', 'Design ""Services"" Page', 'Create ""Contact Us"" Page']"
1,Development Stages,"['Implement User Interface (UI)', 'Develop Navigation System', 'Implement Blog Section', 'Optimize for SEO and Loading Speed', 'Social Media Integration', 'Testimonials Section']"
2,Final Review,['Quality Assurance Testing']
3,Project Launch,['Deployment']
