In [1]:
from dotenv import load_dotenv

import torch
import lancedb
import re
import pandas as pd
from lancedb.embeddings import get_registry
from lancedb.pydantic import Vector, LanceModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): no cell visible in this notebook reads an environment variable directly;
# presumably one of the libraries consumes them — confirm this call is still needed.
load_dotenv()

def get_device():
    """Pick the torch device string used to run the embedding model.

    Preference order is CUDA first, then Apple MPS, then CPU.

    Returns:
        str: ``"cuda"``, ``"mps"`` or ``"cpu"``.
    """
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

# Embedding-function handle for the "BAAI/bge-m3" model, created through LanceDB's
# embedding registry ("sentence-transformers" entry) on the device chosen by get_device().
bge_m3_model = get_registry().get("sentence-transformers").create(name="BAAI/bge-m3", device=get_device())
# Connect to the LanceDB database at ./lancedb (relative to the notebook's working directory).
db = lancedb.connect("./lancedb")

In [3]:
def convert_vector(vector):
    """Convert a stringified vector such as ``"[0.1, -0.2]"`` into a list of floats.

    Intended for ``Series.apply`` on embedding columns that were serialized as text.

    Args:
        vector: Either a string of comma-separated numbers, optionally wrapped in
            square brackets, or any other object.

    Returns:
        A list of floats when ``vector`` is a string (an empty list for ``"[]"`` or a
        blank string); otherwise ``vector`` itself, unchanged.

    Raises:
        ValueError: If a component of a non-empty string is not a valid float.
    """
    if not isinstance(vector, str):
        return vector
    # Strip surrounding whitespace first so that " [1, 2]\n" still loses its brackets.
    inner = vector.strip().strip("[]")
    if not inner.strip():
        # "[]" would otherwise reach float("") and raise ValueError.
        return []
    return [float(component) for component in inner.split(",")]

## Regulations and Ordinances

In [7]:
class OrdinanceSchema(LanceModel):
    """Row schema for the "ordinances" LanceDB table (one row per legislation text chunk)."""

    # Chunk text; declared as the source field of the bge-m3 embedding function.
    text: str = bge_m3_model.SourceField()
    # Embedding of the chunk; the dimension is taken from the model via ndims().
    vector: Vector(dim=bge_m3_model.ndims()) = bge_m3_model.VectorField() # type: ignore
    # Language tag; the loading cell sets this to "en".
    lang: str
    # Chapter number, stored as a string (e.g. "140").
    cap_no: str
    # Section identifier within the chapter (e.g. "1", "65A", "118_2").
    section_no: str
    # Record type; every row shown in this notebook has "cap".
    type: str
    # Link to the section on hklii.hk.
    url: str
    # Title of the chapter (e.g. "Copyright Ordinance").
    cap_title: str
    # Heading of the section (e.g. "Short title").
    section_heading: str


In [8]:
# Create the "ordinances" table from OrdinanceSchema.
# mode="overwrite" replaces an existing table of the same name, so this cell can be
# re-run (e.g. Restart & Run All) instead of failing because the table already exists.
# The "judgements" table further down is created the same way.
table = db.create_table(
    name="ordinances",
    schema=OrdinanceSchema,
    mode="overwrite",
)

In [12]:
# Load the English legislation chunks; plain string literal (the f-prefix had no placeholders).
df = pd.read_csv("../legislations_chunks/en.csv")
# Embeddings come back from the CSV as text, so parse them into lists of floats.
df["vector"] = df["vector"].apply(convert_vector)
# Constant language tag for every row of this file.
df["lang"] = "en"
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57588 entries, 0 to 57587
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cap_no           57588 non-null  object
 1   cap_title        52524 non-null  object
 2   section_no       57588 non-null  object
 3   section_heading  57588 non-null  object
 4   text             57588 non-null  object
 5   url              57588 non-null  object
 6   type             57588 non-null  object
 7   vector           57588 non-null  object
 8   lang             57588 non-null  object
dtypes: object(9)
memory usage: 4.0+ MB


In [11]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,cap_no,cap_title,section_no,section_heading,text,url,type,vector,lang
0,140,Air Passenger Departure Tax Ordinance,1,Short title,Short title This Ordinance may be cited as the...,https://hklii.hk/en/legis/ord/140/s1,cap,"[0.009427718818187714, -0.002868852810934186, ...",en
1,140,Air Passenger Departure Tax Ordinance,3,Imposition of tax,Imposition of tax (1) Subject to sections 12 a...,https://hklii.hk/en/legis/ord/140/s3,cap,"[0.03467058017849922, -0.026947027072310448, -...",en
2,140,Air Passenger Departure Tax Ordinance,4,Collection of tax,Collection of tax A passenger liable to pay th...,https://hklii.hk/en/legis/ord/140/s4,cap,"[-0.0015317347133532166, -0.002358675701543688...",en
3,140,Air Passenger Departure Tax Ordinance,5,Records,Records (1) An operator shall maintain proper ...,https://hklii.hk/en/legis/ord/140/s5,cap,"[-0.002889419673010707, -0.005121254362165928,...",en
4,140,Air Passenger Departure Tax Ordinance,6,Returns,Returns (1) An operator shall furnish returns ...,https://hklii.hk/en/legis/ord/140/s6,cap,"[-0.005253892857581377, -0.01074348296970129, ...",en


In [14]:
# Show every row that has a missing value in at least one column.
df[df.isna().any(axis=1)]

Unnamed: 0,cap_no,cap_title,section_no,section_heading,text,url,type,vector,lang
1731,635,,1,Short title,Short title (1) This Ordinance may be cited as...,https://hklii.hk/en/legis/ord/635/s1,cap,"[-0.00298016844317317, 0.018290787935256958, -...",en
1732,635,,3,Commission has legal personality,Commission has legal personality The Commissio...,https://hklii.hk/en/legis/ord/635/s3,cap,"[-0.0034544302616268396, 0.002420911332592368,...",en
1733,635,,4,Secretary may make regulations,Secretary may make regulations (1) The Secreta...,https://hklii.hk/en/legis/ord/635/s4,cap,"[0.001467714086174965, 0.0005900442483834922, ...",en
1734,635,,5,Regulations—general powers,Regulations—general powers (1) Without limitin...,https://hklii.hk/en/legis/ord/635/s5,cap,"[0.0123377600684762, 0.002910113660618663, -0....",en
1735,635,,6,Regulations—fees,Regulations—fees Regulations made under sectio...,https://hklii.hk/en/legis/ord/635/s6,cap,"[0.008501002565026283, 0.01841277815401554, -0...",en
...,...,...,...,...,...,...,...,...,...
57064,600,,14,Protection for public officers acting in good ...,Protection for public officers acting in good ...,https://hklii.hk/en/legis/ord/600/s14,cap,"[-0.019473524764180183, 0.022198833525180817, ...",en
57065,600,,15,Obstruction of public officers,Obstruction of public officers A person who re...,https://hklii.hk/en/legis/ord/600/s15,cap,"[0.02196319028735161, 0.0210091732442379, -0.0...",en
57066,600,,16,Power to make regulation,Power to make regulation The Secretary for Hea...,https://hklii.hk/en/legis/ord/600/s16,cap,"[0.004034498240798712, -0.01769905537366867, 0...",en
57067,600,,17,Specification of Authority and public officer,Specification of Authority and public officer ...,https://hklii.hk/en/legis/ord/600/s17,cap,"[-0.011297176592051983, -0.014234209433197975,...",en


In [26]:
def _extract_cap_title(text):
    """Pull the ordinance title out of the text of a "Short title" section.

    Tries the "... may be cited as the <title>." wording first, then the same wording
    without "the".

    Returns:
        The captured title, or None when neither wording is found.
    """
    for pattern in (
        r"This Ordinance may be cited as the (.+?)\.",
        r"This Ordinance may be cited as (.+?)\.",
    ):
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return None


# Back-fill missing cap_title values from each chapter's own "Short title" section.
current_cap_no = None
new_cap_title = None
# Select on cap_title specifically, so a gap in any other column can never cause an
# existing title to be overwritten.
for i, row in df[df["cap_title"].isna()].iterrows():
    if row["cap_no"] != current_cap_no:
        # New chapter: drop the previous chapter's title so it cannot leak into this one.
        current_cap_no = row["cap_no"]
        new_cap_title = None
    if "short title" in row["section_heading"].lower():
        # Keep the current title when the wording is not recognised; the unguarded
        # .group(1) on a failed match used to raise AttributeError here.
        new_cap_title = _extract_cap_title(row["text"]) or new_cap_title
    if new_cap_title is not None:
        df.at[i, "cap_title"] = new_cap_title

# Any row still listed here could not be resolved automatically and needs manual attention.
df[df.isna().any(axis=1)]

Unnamed: 0,cap_no,cap_title,section_no,section_heading,text,url,type,vector,lang


In [27]:
# Spot-check positions 1731-1734, which were listed earlier with a missing cap_title.
df[1731:1735]

Unnamed: 0,cap_no,cap_title,section_no,section_heading,text,url,type,vector,lang
1731,635,Conservation of Antarctic Marine Living Resour...,1,Short title,Short title (1) This Ordinance may be cited as...,https://hklii.hk/en/legis/ord/635/s1,cap,"[-0.00298016844317317, 0.018290787935256958, -...",en
1732,635,Conservation of Antarctic Marine Living Resour...,3,Commission has legal personality,Commission has legal personality The Commissio...,https://hklii.hk/en/legis/ord/635/s3,cap,"[-0.0034544302616268396, 0.002420911332592368,...",en
1733,635,Conservation of Antarctic Marine Living Resour...,4,Secretary may make regulations,Secretary may make regulations (1) The Secreta...,https://hklii.hk/en/legis/ord/635/s4,cap,"[0.001467714086174965, 0.0005900442483834922, ...",en
1734,635,Conservation of Antarctic Marine Living Resour...,5,Regulations—general powers,Regulations—general powers (1) Without limitin...,https://hklii.hk/en/legis/ord/635/s5,cap,"[0.0123377600684762, 0.002910113660618663, -0....",en


In [28]:
# Insert all rows of df into the table currently bound to `table`.
table.add(df)

In [29]:
# Re-open the "ordinances" table by name and build a full-text-search index on its "text" column.
# use_tantivy=False is passed explicitly, so the Tantivy-based index is not requested.
# NOTE(review): per the LanceDB docs this selects the native FTS index — confirm for the installed version.
table = db.open_table("ordinances")
table.create_fts_index("text", use_tantivy=False)

In [33]:
# Sample hybrid query against the table currently bound to `table`: the full-text part is
# restricted to the "text" column, and the top 5 hits are returned as a DataFrame.
query = "Someone used a music that i created in a video and posted the video online, can i sue him"
docs = table.search(query=query, query_type="hybrid", fts_columns="text").limit(5).to_pandas()
docs

Unnamed: 0,text,vector,lang,cap_no,section_no,type,url,cap_title,section_heading,_relevance_score
0,Secondary infringement: provision of apparatus...,"[-0.026083633, 0.01828108, -0.036448967, 0.044...",en,528,34,cap,https://hklii.hk/en/legis/ord/528/s34,Copyright Ordinance,Secondary infringement: provision of apparatus...,0.016393
1,Meaning of journalistic material (1) Subject t...,"[-0.026645439, 0.00025719203, -0.029014125, 0....",en,1,82,cap,https://hklii.hk/en/legis/ord/1/s82,Interpretation and General Clauses Ordinance,Meaning of journalistic material,0.016393
2,". (2A) A person commits an offence if he, with...","[-0.028469197, 0.013512144, -0.038312923, 0.05...",en,528,118_2,cap,https://hklii.hk/en/legis/ord/528/s118,Copyright Ordinance,Offences in relation to making or dealing with...,0.016129
3,Temporary reproduction by service providers (1...,"[-0.032052238, 0.001961908, -0.015651632, 0.05...",en,528,65A,cap,https://hklii.hk/en/legis/ord/528/s65A,Copyright Ordinance,Temporary reproduction by service providers,0.016129
4,Incidental inclusion of copyright material (1)...,"[-0.020111954, 0.032726847, 0.002342802, 0.041...",en,528,40,cap,https://hklii.hk/en/legis/ord/528/s40,Copyright Ordinance,Incidental inclusion of copyright material,0.015873


## Judgements

In [4]:
import datetime

In [8]:
# Load the case metadata with precomputed summary embeddings.
# Plain string literal: the f-prefix had no placeholders.
df = pd.read_parquet("../dataProcessing/generateCaseMetas/cases_meta_with_embeddings.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100149 entries, 0 to 100149
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   case_type       100149 non-null  object        
 1   crime_name      100149 non-null  object        
 2   case_summary    100149 non-null  object        
 3   court_decision  100149 non-null  object        
 4   case_causes     100149 non-null  object        
 5   case_evidence   100149 non-null  object        
 6   case_name       100149 non-null  object        
 7   court           100149 non-null  object        
 8   date            100149 non-null  datetime64[ns]
 9   citation        100149 non-null  object        
 10  case_number     100149 non-null  object        
 11  url             100149 non-null  object        
 12  summary_vector  100149 non-null  object        
dtypes: datetime64[ns](1), object(12)
memory usage: 10.7+ MB


In [9]:
# Parse any string-encoded embeddings into lists of floats; non-string values pass through unchanged.
df["summary_vector"] = df["summary_vector"].apply(convert_vector)
# Constant language tag for every row.
df["lang"] = "en"
# Reduce the datetime64 values to plain datetime.date objects, matching the
# `date: datetime.date` field declared on JudgementsSchema.
df["date"] = pd.to_datetime(df["date"]).dt.date
df.head(5)

Unnamed: 0,case_type,crime_name,case_summary,court_decision,case_causes,case_evidence,case_name,court,date,citation,case_number,url,summary_vector,lang
0,civic,,This case involves a dispute over the valuatio...,The High Court dismissed the plaintiffs' appea...,The dispute arose over the interpretation and ...,Evidence presented at arbitration showed that ...,CHUN WO CONSTRUCTION &AMP; ENGINEERING CO LTD ...,hkca,2019-03-27,[2019] HKCA 369,CACV 431/2018,https://www.hklii.hk/en/cases/hkca/2019/369,"[-0.017158489674329758, 0.014291126281023026, ...",en
1,criminal,obtaining property by deception,"Reyes Edward S, one of the two defendants who ...",The applicant’s application for leave to appea...,Reyes Edward S and his co-defendant used a sto...,The applicant's co-defendant falsely represent...,HKSAR v. REYES EDWARD S,hkca,2019-03-28,[2019] HKCA 381,CACC 219/2018,https://www.hklii.hk/en/cases/hkca/2019/381,"[-0.013979041948914528, 0.0018763853004202247,...",en
2,criminal,abuse of process,The case involved a dispute between NG Yui Ton...,The Judge ruled that including Agenda 4 in the...,NG Yui Tong argued that including Agenda 4 vio...,The applicant requested discovery of audio rec...,NG YUI TONG v. TAIKOO SHING (MANAGEMENT) LTD,hkca,2019-03-27,[2019] HKCA 353,CAMP 172/2018,https://www.hklii.hk/en/cases/hkca/2019/353,"[-0.04085000231862068, 0.01448117010295391, -0...",en
3,criminal,disciplinary offense against a solicitor,"This case involves an appeal by a solicitor, M...",The High Court of Hong Kong Special Administra...,- Failing to file a Notice of Motion within ti...,- The appellant's failure to file a Notice of ...,A SOLICITOR v. THE LAW SOCIETY OF HONG KONG,hkca,2004-02-18,[2004] HKCA 112,CACV 302/2002,https://www.hklii.hk/en/cases/hkca/2004/112,"[-0.0473579540848732, -0.010266540572047234, -...",en
4,criminal,indecent assault,"Peter MacLennan, a Superintendent of police wi...",The Court of Appeal upheld the conviction and ...,The case was brought about by an alleged indec...,Madam Tam's testimony described the incident i...,ATTORNEY GENERAL v. PETER MACLENNAN,hkca,1996-08-07,[1996] HKCA 46,CAAR 16/1995,https://www.hklii.hk/en/cases/hkca/1996/46,"[-0.014021757058799267, -0.003302324330434203,...",en


In [10]:
class JudgementsSchema(LanceModel):
    """Row schema for the "judgements" LanceDB table (one row per case)."""

    # Case summary text; declared as the source field of the bge-m3 embedding function.
    case_summary: str = bge_m3_model.SourceField()
    # Embedding of the summary; the dimension is taken from the model via ndims().
    summary_vector: Vector(dim=bge_m3_model.ndims()) = bge_m3_model.VectorField() # type: ignore
    # Case category as stored in the data (e.g. "civic", "criminal").
    case_type: str
    # Name of the offence; blank for some rows in the preview.
    crime_name: str
    # Text describing the court's decision.
    court_decision: str
    # Text describing the causes of the case.
    case_causes: str
    # Text describing the evidence presented.
    case_evidence: str
    # Language tag; the preparation cell sets this to "en".
    lang: str
    # Court code (e.g. "hkca", "hkcfi").
    court: str
    # Date taken from the source data, stored as a plain date.
    date: datetime.date
    # Case name (e.g. "HKSAR v. REYES EDWARD S").
    case_name: str
    # Neutral citation (e.g. "[2019] HKCA 369").
    citation: str
    # Court case number (e.g. "CACV 431/2018").
    case_number: str
    # Link to the judgment on hklii.hk.
    url: str


In [14]:
# (Re)create the "judgements" table from JudgementsSchema. mode="overwrite" is passed so that
# an existing table with the same name is replaced, which keeps this cell re-runnable.
table = db.create_table(
    name = "judgements",
    schema = JudgementsSchema,
    mode = "overwrite",
) 

In [15]:
# Insert all rows of df into the table currently bound to `table`.
table.add(df)

In [16]:
# Re-open the "judgements" table by name and build a full-text-search index on "case_summary".
# use_tantivy=False is passed explicitly, so the Tantivy-based index is not requested.
# NOTE(review): per the LanceDB docs this selects the native FTS index — confirm for the installed version.
table = db.open_table("judgements")
table.create_fts_index("case_summary", use_tantivy=False)

In [18]:
# Sample hybrid query against the table currently bound to `table`: the full-text part is
# restricted to the "case_summary" column, and the top 5 hits are returned as a DataFrame.
query = "Someone used a music that i created in a video and posted the video online, can i sue him"
docs = table.search(query=query, query_type="hybrid", fts_columns="case_summary").limit(5).to_pandas()
docs

Unnamed: 0,case_summary,summary_vector,case_type,crime_name,court_decision,case_causes,case_evidence,lang,court,date,case_name,citation,case_number,url,_relevance_score
0,This case involves a dispute between Honger Mu...,"[-0.010542145, -0.034285534, -0.02712814, 0.02...",civic,,The judge found that the plaintiff was not unr...,The cause of action in this case is based on c...,The plaintiff provided evidence of the recordi...,en,hkcfi,2012-03-09,HONGER MUSIC VENTURE LTD v. LAM ANDREW,[2012] HKCFI 337,HCA 2092/2009,https://www.hklii.hk/en/cases/hkcfi/2012/337,0.016393
1,This case involves a scheme of arrangement pro...,"[0.00032095247, 0.013777306, -0.043671753, 0.0...",criminal,,The court sanctioned the scheme and made an or...,The Company was involved in legal disputes wit...,The evidence presented includes a sale and pur...,en,hkcfi,2008-12-11,RE MUSIC TRADING ON-LINE (HK) LTD,[2008] HKCFI 1102,HCMP 1541/2008,https://www.hklii.hk/en/cases/hkcfi/2008/1102,0.016393
2,This case involves a dispute between Universal...,"[-0.0037389733, 0.062849246, -0.02754063, 0.02...",civic,,"The Court dismissed both appeals, refusing to ...",The Plaintiffs alleged copyright infringement ...,The Plaintiffs provided evidence showing that ...,en,hkca,2013-02-19,"UMG RECORDINGS, INC. AND OTHERS v. PROFIT CHAR...",[2013] HKCA 85,CACV 262/2012,https://www.hklii.hk/en/cases/hkca/2013/85,0.016129
3,Cho Hung-Lung was convicted of four charges in...,"[-0.0076435893, -0.02691927, -0.030061522, 0.0...",criminal,"counterfeit video compact discs, music discs, ...",The trial judge sentenced Cho Hung-Lung to a t...,Cho Hung-Lung was convicted on four charges in...,The evidence presented included large numbers ...,en,hkca,1997-07-25,HKSAR v. CHOY HUNG LUNG,[1997] HKCA 120,CACC 227/1997,https://www.hklii.hk/en/cases/hkca/1997/120,0.016129
4,This case involves a dispute between Universal...,"[-0.0037389733, 0.062849246, -0.02754063, 0.02...",civic,,"The Court dismissed both appeals, refusing to ...",The Plaintiffs alleged copyright infringement ...,The Plaintiffs provided evidence of clear and ...,en,hkca,2013-02-19,"UMG RECORDINGS, INC. AND OTHERS v. PROFIT CHAR...",[2013] HKCA 93,CACV 263/2012,https://www.hklii.hk/en/cases/hkca/2013/93,0.015873
