In [19]:
from dotenv import load_dotenv

import torch
import lancedb
import re
import pandas as pd
from lancedb.embeddings import get_registry
from lancedb.pydantic import Vector, LanceModel

In [2]:
load_dotenv()

def get_device():
    return "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

bge_m3_model = get_registry().get("sentence-transformers").create(name="BAAI/bge-m3", device=get_device())
db = lancedb.connect("./lancedb")

In [None]:
# Find and convert string values in the 'vector' column to lists of floats
def convert_vector(vector):
    if isinstance(vector, str):
        return list(map(float, vector.strip('[]').split(',')))
    return vector

## Regulations and Ordinances

In [7]:
class OrdinanceSchema(LanceModel):
    text: str = bge_m3_model.SourceField()
    vector: Vector(dim=bge_m3_model.ndims()) = bge_m3_model.VectorField() # type: ignore
    lang: str
    cap_no: str
    section_no: str
    type: str
    url: str
    cap_title: str
    section_heading: str


In [8]:
table = db.create_table(
    name = "ordinances",
    schema = OrdinanceSchema,
) 

In [12]:
df = pd.read_csv(f"../legislations_chunks/en.csv")
df["vector"] = df["vector"].apply(convert_vector)
df["lang"] = "en"
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57588 entries, 0 to 57587
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cap_no           57588 non-null  object
 1   cap_title        52524 non-null  object
 2   section_no       57588 non-null  object
 3   section_heading  57588 non-null  object
 4   text             57588 non-null  object
 5   url              57588 non-null  object
 6   type             57588 non-null  object
 7   vector           57588 non-null  object
 8   lang             57588 non-null  object
dtypes: object(9)
memory usage: 4.0+ MB


In [11]:
df.head(5)

Unnamed: 0,cap_no,cap_title,section_no,section_heading,text,url,type,vector,lang
0,140,Air Passenger Departure Tax Ordinance,1,Short title,Short title This Ordinance may be cited as the...,https://hklii.hk/en/legis/ord/140/s1,cap,"[0.009427718818187714, -0.002868852810934186, ...",en
1,140,Air Passenger Departure Tax Ordinance,3,Imposition of tax,Imposition of tax (1) Subject to sections 12 a...,https://hklii.hk/en/legis/ord/140/s3,cap,"[0.03467058017849922, -0.026947027072310448, -...",en
2,140,Air Passenger Departure Tax Ordinance,4,Collection of tax,Collection of tax A passenger liable to pay th...,https://hklii.hk/en/legis/ord/140/s4,cap,"[-0.0015317347133532166, -0.002358675701543688...",en
3,140,Air Passenger Departure Tax Ordinance,5,Records,Records (1) An operator shall maintain proper ...,https://hklii.hk/en/legis/ord/140/s5,cap,"[-0.002889419673010707, -0.005121254362165928,...",en
4,140,Air Passenger Departure Tax Ordinance,6,Returns,Returns (1) An operator shall furnish returns ...,https://hklii.hk/en/legis/ord/140/s6,cap,"[-0.005253892857581377, -0.01074348296970129, ...",en


In [14]:
df[df.isna().any(axis=1)]

Unnamed: 0,cap_no,cap_title,section_no,section_heading,text,url,type,vector,lang
1731,635,,1,Short title,Short title (1) This Ordinance may be cited as...,https://hklii.hk/en/legis/ord/635/s1,cap,"[-0.00298016844317317, 0.018290787935256958, -...",en
1732,635,,3,Commission has legal personality,Commission has legal personality The Commissio...,https://hklii.hk/en/legis/ord/635/s3,cap,"[-0.0034544302616268396, 0.002420911332592368,...",en
1733,635,,4,Secretary may make regulations,Secretary may make regulations (1) The Secreta...,https://hklii.hk/en/legis/ord/635/s4,cap,"[0.001467714086174965, 0.0005900442483834922, ...",en
1734,635,,5,Regulations—general powers,Regulations—general powers (1) Without limitin...,https://hklii.hk/en/legis/ord/635/s5,cap,"[0.0123377600684762, 0.002910113660618663, -0....",en
1735,635,,6,Regulations—fees,Regulations—fees Regulations made under sectio...,https://hklii.hk/en/legis/ord/635/s6,cap,"[0.008501002565026283, 0.01841277815401554, -0...",en
...,...,...,...,...,...,...,...,...,...
57064,600,,14,Protection for public officers acting in good ...,Protection for public officers acting in good ...,https://hklii.hk/en/legis/ord/600/s14,cap,"[-0.019473524764180183, 0.022198833525180817, ...",en
57065,600,,15,Obstruction of public officers,Obstruction of public officers A person who re...,https://hklii.hk/en/legis/ord/600/s15,cap,"[0.02196319028735161, 0.0210091732442379, -0.0...",en
57066,600,,16,Power to make regulation,Power to make regulation The Secretary for Hea...,https://hklii.hk/en/legis/ord/600/s16,cap,"[0.004034498240798712, -0.01769905537366867, 0...",en
57067,600,,17,Specification of Authority and public officer,Specification of Authority and public officer ...,https://hklii.hk/en/legis/ord/600/s17,cap,"[-0.011297176592051983, -0.014234209433197975,...",en


In [26]:
new_cap_title = ""
for i, row in df[df.isna().any(axis=1)].iterrows():
    if "short title" in row["section_heading"].lower():
        title = re.search(r"This Ordinance may be cited as the (.+?)\.", row["text"])
        if title:
            title = title.group(1)
        else:
            title = re.search(r"This Ordinance may be cited as (.+?)\.", row["text"])
            title = title.group(1)
        new_cap_title = title
    df.at[i, "cap_title"] = new_cap_title

df[df.isna().any(axis=1)]

Unnamed: 0,cap_no,cap_title,section_no,section_heading,text,url,type,vector,lang


In [27]:
df[1731:1735]

Unnamed: 0,cap_no,cap_title,section_no,section_heading,text,url,type,vector,lang
1731,635,Conservation of Antarctic Marine Living Resour...,1,Short title,Short title (1) This Ordinance may be cited as...,https://hklii.hk/en/legis/ord/635/s1,cap,"[-0.00298016844317317, 0.018290787935256958, -...",en
1732,635,Conservation of Antarctic Marine Living Resour...,3,Commission has legal personality,Commission has legal personality The Commissio...,https://hklii.hk/en/legis/ord/635/s3,cap,"[-0.0034544302616268396, 0.002420911332592368,...",en
1733,635,Conservation of Antarctic Marine Living Resour...,4,Secretary may make regulations,Secretary may make regulations (1) The Secreta...,https://hklii.hk/en/legis/ord/635/s4,cap,"[0.001467714086174965, 0.0005900442483834922, ...",en
1734,635,Conservation of Antarctic Marine Living Resour...,5,Regulations—general powers,Regulations—general powers (1) Without limitin...,https://hklii.hk/en/legis/ord/635/s5,cap,"[0.0123377600684762, 0.002910113660618663, -0....",en


In [28]:
table.add(df)

In [29]:
table = db.open_table("ordinances")
table.create_fts_index("text", use_tantivy=False)

In [33]:
query = "Someone used a music that i created in a video and posted the video online, can i sue him"
docs = table.search(query=query, query_type="hybrid", fts_columns="text").limit(5).to_pandas()
docs

Unnamed: 0,text,vector,lang,cap_no,section_no,type,url,cap_title,section_heading,_relevance_score
0,Secondary infringement: provision of apparatus...,"[-0.026083633, 0.01828108, -0.036448967, 0.044...",en,528,34,cap,https://hklii.hk/en/legis/ord/528/s34,Copyright Ordinance,Secondary infringement: provision of apparatus...,0.016393
1,Meaning of journalistic material (1) Subject t...,"[-0.026645439, 0.00025719203, -0.029014125, 0....",en,1,82,cap,https://hklii.hk/en/legis/ord/1/s82,Interpretation and General Clauses Ordinance,Meaning of journalistic material,0.016393
2,". (2A) A person commits an offence if he, with...","[-0.028469197, 0.013512144, -0.038312923, 0.05...",en,528,118_2,cap,https://hklii.hk/en/legis/ord/528/s118,Copyright Ordinance,Offences in relation to making or dealing with...,0.016129
3,Temporary reproduction by service providers (1...,"[-0.032052238, 0.001961908, -0.015651632, 0.05...",en,528,65A,cap,https://hklii.hk/en/legis/ord/528/s65A,Copyright Ordinance,Temporary reproduction by service providers,0.016129
4,Incidental inclusion of copyright material (1)...,"[-0.020111954, 0.032726847, 0.002342802, 0.041...",en,528,40,cap,https://hklii.hk/en/legis/ord/528/s40,Copyright Ordinance,Incidental inclusion of copyright material,0.015873


## Judgements

In [35]:
import datetime

In [38]:
class JudgementsSchema(LanceModel):
    text: str = bge_m3_model.SourceField()
    vector: Vector(dim=bge_m3_model.ndims()) = bge_m3_model.VectorField() # type: ignore
    lang: str
    court: str
    date: datetime.date
    case_name: str
    citation: str
    case_number: str
    law_report_citations: list[str]
    url: str


In [39]:
table = db.create_table(
    name = "judgements",
    schema = JudgementsSchema,
) 

In [40]:
df = pd.read_csv(f"../judgements_chunks/eng_cases.csv")
df["vector"] = df["vector"].apply(convert_vector)
df["lang"] = "en"
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1367117 entries, 0 to 1367116
Data columns (total 10 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   court                 1367117 non-null  object
 1   date                  1367117 non-null  object
 2   case_name             1367117 non-null  object
 3   citation              1367117 non-null  object
 4   case_number           1367117 non-null  object
 5   law_report_citations  1367117 non-null  object
 6   url                   1367117 non-null  object
 7   content               1367117 non-null  object
 8   vector                1367117 non-null  object
 9   lang                  1367117 non-null  object
dtypes: object(10)
memory usage: 104.3+ MB


In [46]:
# df["text"] = df["content"]
df["date"] = pd.to_datetime(df["date"]).dt.date
# df.drop(columns=["content"], inplace=True)
df.head(5)

Unnamed: 0,court,date,case_name,citation,case_number,law_report_citations,url,vector,lang,text
0,hkca,2018-12-10,RE MOJUMDER SONJOY,[2018] HKCA 925_0,CACV 235/2018,[],https://www.hklii.hk/en/cases/hkca/2018/925,"[-0.03703349083662033, 0.013102861121296883, -...",en,| CACV 235/2018 \[2018] HKCA 925 **IN THE HIGH...
1,hkca,2018-12-10,RE MOJUMDER SONJOY,[2018] HKCA 925_1,CACV 235/2018,[],https://www.hklii.hk/en/cases/hkca/2018/925,"[-0.006418189499527216, 0.017247360199689865, ...",en,". 3\. The applicant subsequently applied, by a..."
2,hkca,2018-12-10,RE MOJUMDER SONJOY,[2018] HKCA 925_2,CACV 235/2018,[],https://www.hklii.hk/en/cases/hkca/2018/925,"[-0.004030972253531218, 0.008936459198594093, ...",en,| --- | --- | --- | --- | --- | --- | --- | --...
3,hkca,2018-12-10,RE CHELLAM GOVINDAN,[2018] HKCA 916_0,CACV 282/2018,[],https://www.hklii.hk/en/cases/hkca/2018/916,"[-0.05869578942656517, 0.00040816247928887606,...",en,| CACV 282/2018 \[2018] HKCA 916 **IN THE HIGH...
4,hkca,2018-12-10,RE CHELLAM GOVINDAN,[2018] HKCA 916_1,CACV 282/2018,[],https://www.hklii.hk/en/cases/hkca/2018/916,"[-0.0023257120046764612, 0.0043684346601367, -...",en,. The applicant is a national of India. He ent...


In [47]:
table.add(df)

In [48]:
table = db.open_table("judgements")
table.create_fts_index("text", use_tantivy=False)

In [49]:
query = "Someone used a music that i created in a video and posted the video online, can i sue him"
docs = table.search(query=query, query_type="hybrid", fts_columns="text").limit(5).to_pandas()
docs

Unnamed: 0,text,vector,lang,court,date,case_name,citation,case_number,law_report_citations,url,_relevance_score
0,. I agree that ordinary people may not think o...,"[-0.020229895, 0.029456206, -0.021092607, 0.02...",en,hkcfi,2004-05-06,HKSAR v. LAU YING WAI,[2004] HKCFI 297_3,HCMA 861/2003,"[[, ]]",https://www.hklii.hk/en/cases/hkcfi/2004/297,0.016393
1,. Material facts 11\. The facts are set out in...,"[0.01819038, 0.01863793, -0.04834758, 0.007260...",en,hkcfi,2023-07-28,SECRETARY FOR JUSTICE v. PERSONS CONDUCTING TH...,[2023] HKCFI 1950_3,HCA 855/2023,"[[, ', [, 2, 0, 2, 3, ], , 5, , H, K, C, , ...",https://www.hklii.hk/en/cases/hkcfi/2023/1950,0.016393
2,. These questions arise in the context of the ...,"[-0.024691418, -0.0008660974, 0.018014593, 0.0...",en,hkcfa,2007-05-18,CHAN NAI MING v. HKSAR,[2007] HKCFA 36_1,FACC 3/2007,"[[, ', [, 2, 0, 0, 7, ], , 2, , H, K, L, R, ...",https://www.hklii.hk/en/cases/hkcfa/2007/36,0.016129
3,. The files were current files with no record ...,"[-0.020476067, 0.013066936, -0.0626201, 0.0121...",en,hkdc,2019-11-25,HKSAR v. CHENG HO WING,[2019] HKDC 1333_70,DCCC 443/2018,"[[, ]]",https://www.hklii.hk/en/cases/hkdc/2019/1333,0.016129
4,". We will refer to this sound recording as ""th...","[-0.029354803, 0.03882698, -0.045285203, 0.050...",en,hkcfa,2003-12-19,TSE MUI CHUN v. HKSAR,[2003] HKCFA 12_5,FACC 4/2003,"[[, ', [, 2, 0, 0, 4, ], , 1, , H, K, L, R, ...",https://www.hklii.hk/en/cases/hkcfa/2003/12,0.015873
