In [1]:
import ast
import json

import pandas as pd

In [2]:
questions = pd.read_json("../lqb/high_quality_questions.json")
print(len(questions))


def _parse_chunk_ids(value):
    """Normalise one 'Answered in text?' cell to a list of chunk identifiers.

    - Non-string values are returned unchanged.
    - A string that looks like a list literal ("[...]") is parsed with
      ast.literal_eval, which only accepts Python literals and therefore cannot
      execute code embedded in the data file (unlike eval()).
    - Any other string is treated as a single identifier and wrapped in a list.

    Raises ValueError/SyntaxError if a bracketed string is not a valid literal.
    """
    if not isinstance(value, str):
        return value
    if value.startswith("[") and value.endswith("]"):
        return ast.literal_eval(value)
    return [value]


# Convert 'Answered in text?' column from string representation to actual list
questions["Answered in text?"] = questions["Answered in text?"].apply(_parse_chunk_ids)

questions.info()
questions.head(2)

30474
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30474 entries, 0 to 30473
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Question             30474 non-null  object
 1   Simplified_Chinese   30474 non-null  object
 2   Traditional_Chinese  30474 non-null  object
 3   URL                  30474 non-null  object
 4   page                 30474 non-null  int64 
 5   index                30474 non-null  int64 
 6   Topic                30474 non-null  object
 7   Identifier           30474 non-null  object
 8   Method               30474 non-null  object
 9   Answered in text?    30474 non-null  object
 10  Interesting Q        30474 non-null  object
 11  File                 30474 non-null  object
dtypes: int64(2), object(10)
memory usage: 2.8+ MB


Unnamed: 0,Question,Simplified_Chinese,Traditional_Chinese,URL,page,index,Topic,Identifier,Method,Answered in text?,Interesting Q,File
0,Which ordinance prescribes the scheme for comp...,哪个法令规定了强制机动车辆保险的方案？,哪個法令規定了強制機動車輛保險的方案？,https://clic.org.hk/en/topics/insurance/common...,746,281,insurance,746-S300Q1,Human,[746-S300-P1],No,CLIC_data_batch_5_total/7. Chan Chun Wing Marc...
1,What does Section 4(1) of the Motor Vehicles I...,《机动车辆保险（第三者责任）条例》第4(1)条规定了什么内容？,《機動車輛保險（第三者責任）條例》第4(1)條規定了什麼內容？,https://clic.org.hk/en/topics/insurance/common...,746,282,insurance,746-S300Q2,Human,[746-S300-P1],No,CLIC_data_batch_5_total/7. Chan Chun Wing Marc...


In [3]:
# Load the chunk-identifier -> text mapping and expose it as a two-column frame:
# the JSON keys (the index after orient="index") become "ChunkIdentifier" and the
# single value column becomes "Answer".
answer_mapping = (
    pd.read_json("../lqb/scope_to_text.json", orient="index")
    .reset_index()
    .set_axis(["ChunkIdentifier", "Answer"], axis=1)
)
answer_mapping.head(2)

Unnamed: 0,ChunkIdentifier,Answer
0,874-S000,I heard about someone who claimed that they we...
1,874-S100-P1,The Land Registry provides a “Land Search” ser...


In [4]:
def get_answers(chunk_identifiers):
    """Return the answer texts for the given chunk identifiers as one string.

    Each identifier is looked up in the module-level ``answer_mapping`` frame
    (columns "ChunkIdentifier" and "Answer"). The first matching row's answer is
    used; identifiers with no match are skipped. The collected texts are joined
    with a blank line, in the order the identifiers were given. An empty input
    (or no matches at all) yields an empty string.
    """
    collected = []
    for wanted_id in chunk_identifiers:
        hits = answer_mapping.loc[answer_mapping["ChunkIdentifier"] == wanted_id, "Answer"]
        if len(hits) > 0:
            # Only the first match is kept when an identifier appears more than once.
            collected.append(hits.iloc[0])
    return "\n\n".join(collected)

# Resolve every question's list of chunk identifiers to its concatenated answer text.
chunk_id_lists = questions["Answered in text?"]
questions["Answer"] = chunk_id_lists.map(get_answers)

questions.head(2)

Unnamed: 0,Question,Simplified_Chinese,Traditional_Chinese,URL,page,index,Topic,Identifier,Method,Answered in text?,Interesting Q,File,Answer
0,Which ordinance prescribes the scheme for comp...,哪个法令规定了强制机动车辆保险的方案？,哪個法令規定了強制機動車輛保險的方案？,https://clic.org.hk/en/topics/insurance/common...,746,281,insurance,746-S300Q1,Human,[746-S300-P1],No,CLIC_data_batch_5_total/7. Chan Chun Wing Marc...,C. Compulsory Motor Insurance. 1. Motor Vehicl...
1,What does Section 4(1) of the Motor Vehicles I...,《机动车辆保险（第三者责任）条例》第4(1)条规定了什么内容？,《機動車輛保險（第三者責任）條例》第4(1)條規定了什麼內容？,https://clic.org.hk/en/topics/insurance/common...,746,282,insurance,746-S300Q2,Human,[746-S300-P1],No,CLIC_data_batch_5_total/7. Chan Chun Wing Marc...,C. Compulsory Motor Insurance. 1. Motor Vehicl...


In [5]:
# Keep only the columns the later filtering/export cells work with.
df = questions.loc[:, ["Question", "Answer", "URL"]]
df.head(2)

Unnamed: 0,Question,Answer,URL
0,Which ordinance prescribes the scheme for comp...,C. Compulsory Motor Insurance. 1. Motor Vehicl...,https://clic.org.hk/en/topics/insurance/common...
1,What does Section 4(1) of the Motor Vehicles I...,C. Compulsory Motor Insurance. 1. Motor Vehicl...,https://clic.org.hk/en/topics/insurance/common...


In [9]:
# Select Q/A pairs whose answer contains the literal text "Cap." while the
# question itself does not mention an ordinance or regulation.
# regex=False matters: as a regex, "Cap." would also match "Capital", "Capacity", ...
mentions_cap = df["Answer"].str.contains("Cap.", regex=False)
# case=False already covers the capitalised spellings.
asks_about_legislation = df["Question"].str.contains("ordinance|regulation", case=False)
df_caps = df[mentions_cap & ~asks_about_legislation]
print(len(df_caps))

# Fixed-seed sample of up to 100 rows (capped so a small match set cannot raise).
df_caps_sample = df_caps.sample(n=min(100, len(df_caps)), random_state=42)

# pandas escapes "/" as "\/" in JSON output; undo that so URLs stay readable.
df_json = df_caps_sample.to_json(orient="records", indent=2).replace('\\/', '/')
with open("../lqb/cap_qa_pairs.json", "w", encoding="utf-8") as f:
    f.write(df_json)

2919


In [11]:
# Create batch API request to extract ordinance references.
# One request is built per sampled row; the row's index label is embedded in
# "custom_id" ("task-<label>") so results can be matched back to the row later.
# NOTE(review): the system prompt asks for a JSON *array* while response_format
# requests "json_object" — confirm the API accepts this combination.
batch_requests = [
    {
        "custom_id": "task-" + str(row_label),
        "method": "POST",
        "url": "/v4/chat/completions",
        "body": {
            "model": "glm-4-plus",  # Or your preferred model
            "messages": [
                {"role": "system", "content": "Extract all ordinance and regulation references from the text. Return a JSON array with objects containing 'cap_no' and 'section_no' for each reference. Example: [{\"cap_no\": \"155\", \"section_no\": \"12\"}, {\"cap_no\": \"32A\", \"section_no\": \"5\"}]"},
                {"role": "user", "content": answer_text}
            ],
            "response_format": {"type": "json_object"},
            "temperature": 0.1,
        }
    }
    # Series.items() yields (index label, value) without building a Series per row.
    for row_label, answer_text in df_caps_sample["Answer"].items()
]

# Write batch requests to JSONL file (one JSON document per line)
with open("../lqb/ordinance_extraction_batch.jsonl", "w", encoding="utf-8") as f:
    for req in batch_requests:
        f.write(json.dumps(req) + "\n")




In [None]:
# Parse the batch output: map each task id (the part of "custom_id" after
# "task-") to the JSON value the model returned for it.
ordinance_results = {}
with open("../lqb/ordinance_batch_output.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank/trailing lines in the JSONL file
        data = json.loads(line)
        custom_id = data["custom_id"]
        task_id = custom_id.split("-", 1)[1]  # Extract the task ID number

        try:
            content = data["response"]["body"]["choices"][0]["message"]["content"]
            # Parse the JSON string in the content
            ordinance_results[task_id] = json.loads(content)
        except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
            # TypeError: "response" or "content" is null; IndexError: "choices" is empty.
            # A single bad record falls back to an empty list instead of aborting the parse.
            print(f"Error processing line for task {task_id}: {e}")
            ordinance_results[task_id] = []

# Add the extracted ordinances to df_caps_sample (rows without a result get [])
df_caps_sample["ordinances"] = df_caps_sample.index.map(
    lambda idx: ordinance_results.get(str(idx), [])
)

# Save the updated DataFrame with ordinances
df_json_with_ordinances = df_caps_sample.to_json(orient="records", indent=2).replace('\\/', '/')
with open("cap_qa_pairs_with_ordinances.json", "w", encoding="utf-8") as f:
    f.write(df_json_with_ordinances)


# Print a sample to verify
print("\nSample with extracted ordinances:")
print(df_caps_sample[["Question", "ordinances"]].head(3))


Sample with extracted ordinances:
                                                Question  \
26648  How does the Hospital Authority differentiate ...   
27009  If I leave no will, how will my assets be dist...   
24513  How is the maximum amount of remission calcula...   

                                              ordinances  
26648  [{'cap_no': '177', 'section_no': 'Registration...  
27009  [{'cap_no': '73', 'section_no': '4(3)'}, {'cap...  
24513  [{'cap_no': '234A', 'section_no': '69(2)'}, {'...  


In [8]:
# Select Q/A pairs whose answer contains a reference shaped like
# "ABCD 123/2020" (four capital letters, whitespace, digits "/" digits),
# optionally wrapped in parentheses. All matches are exported (no sampling).
df_cases = df[df["Answer"].str.contains(r"\([A-Z]{4}\s+\d+/\d+\)|\b[A-Z]{4}\s+\d+/\d+\b")]
print(len(df_cases))

# force_ascii=False keeps non-ASCII characters verbatim, so the file must be
# written as UTF-8 explicitly rather than with the platform's default encoding.
df_json = df_cases.to_json(orient="records", indent=2, force_ascii=False).replace('\\/', '/')
with open("case_qa_pairs.json", "w", encoding="utf-8") as f:
    f.write(df_json)

32


In [55]:
# Exploratory check: Q/A pairs whose answer contains the phrase "case law".
# The column is "Answer" (capitalised) in `df`; the lowercase spelling raises
# KeyError on a fresh kernel.
# Note: this rebinds `df_cases`; the subset is only inspected, not written to disk.
df_cases = df[df["Answer"].str.contains("case law")]
print(len(df_cases))
df_cases.iloc[0]["Question"]

57


'Can decisions of a public authority be challenged by ordinary legal actions?'