In [1]:
import json
import requests
import pandas as pd
from openai import OpenAI
from bs4 import BeautifulSoup

In [3]:
# System message sent as the first entry of every chat request built in `process_info`.
# It frames the two tasks (name splitting always; biography extraction only when flagged).
system_prompt = """
You are a very helpful assistant. I will be passing you some information in Vietnamese and I will need your help in performing two tasks. First,
splitting a full name into given, middle, and last name. Second, identifying if the information I'm providing contains specific segments 
I'm interested in and, if true, structuring those segments in a JSON format for me. I will be providing more information regarding the JSON
schema after I pass you the information. Very important to note is that you will always have to perform the first task (name splitting), 
but the second task will only be performed in specific cases that I will flag to you.
"""

In [None]:
def get_context_prompt(only_name, full_name, bio):
    """
    Build the user prompt sent to the model for a single person.

    Args:
        only_name: When truthy, return the name-only prompt (name splitting and gender
            guess). When falsy, return the extended prompt that also asks the model to
            extract expertise, languages, years of experience, and public-servant status
            from the biography.
        full_name: Vietnamese full name, interpolated verbatim into the prompt.
        bio: Biographic text, interpolated verbatim into the extended prompt. Ignored
            when `only_name` is truthy.

    Returns:
        str: The prompt text, which ends by asking the model to answer with JSON only.
    """

    if only_name:

        # Name-only variant: four-key schema, no biography section.
        return f"""
        I will provide you a Vietnamese full name and I need your help in splitting this name into given name, middle name, and family name.
        Additionally, I will need you to guess the gender (male or female) based on the name that I'm giving you. You will have to base your
        guess on your knowledge of Vietnamese naming conventions.

        The Vietnamese name is: {full_name}.

        Please structure your answer following this JSON schema:
        {{
            "given_name"  : given name of the person from the full_name,
            "middle_name" : middle name of the person from the full_name,
            "family_name" : family name of the person from the full_name,
            "gender"      : gender based on the full name given.
        }}

        Please take into account the following:

        - If you are not able to provide an answer for any of these keys, please fill that key with a "UNCERTAIN" string for me to know that
        it is not possible for you to know the answer to that specific field.
        - You can answer with "NONE" if you believe that the name has no middle name.
        - Some strings passed as "full name" could be corrupted with additional information such as professional and academic titles such as
        "lawyer" or "master". Please feel free to drop them and not include them in your answer.
        - Your answer should include ONLY the resulting JSON with your answers. Exclude any additional comments from the answer please.

        Thank you and you can begin now.
        """

    # Extended variant: same name/gender task plus four fields extracted from the biography.
    return f"""
        I will provide you a Vietnamese full name and I need your help in splitting this name into given name, middle name, and family name.
        Additionally, I will need you to guess the gender (male or female) based on the name that I'm giving you. You will have to base your
        guess on your knowledge of Vietnamese naming conventions.

        The Vietnamese name is: {full_name}.

        Additionally, I have some biographic information available for this person. I will need you to read it carefully and assess if the information
        provided mentions any of the following fields:

        1. Areas of legal expertise that this person has, if mentioned in the provided text. For example, criminal, litigation, commercial,
        civil, human rights, and so on.
        2. Languages spoken by the person, if mentioned.
        3. If the person is CURRENTLY working for the government or not.
        4. Years of experience, if mentioned.

        The biographic information that we have from this person is the following: {bio}

        Please structure your answer following this JSON schema:
        {{
            "given_name"       : given name of the person from the full_name,
            "middle_name"      : middle name of the person from the full_name,
            "family_name"      : family name of the person from the full_name,
            "gender"           : gender based on the full name given,
            "expertise"        : areas of legal expertise mentioned in the biographic information,
            "languages"        : languages spoken by the person, if any,
            "years_experience" : years of experience that this person has as a lawyer if it is mentioned in the biographic information,
            "public_servant"   : if the person is a public servant, answer or fill this field as TRUE, if the person is working as a private
            lawyer please answer or fill this field as FALSE.
        }}

        Please take into account the following:

        - If you are not able to provide an answer for any of these keys, please fill that key with a "UNCERTAIN" string for me to know that
        it is not possible for you to know the answer to that specific field.
        - You can answer with "NONE" if you believe that the name has no middle name.
        - Some strings passed as "full name" could be corrupted with additional information such as professional and academic titles such as
        "lawyer" or "master". Please feel free to drop them and not include them in your answer.
        - If there is no information in the biographic information that I passed to you that helps you answer or infer the fields of
        expertise, languages, years of experience, and public servant, please answer or fill those fields with "NOT MENTIONED".
        - Your answer should include ONLY the resulting JSON with your answers. Exclude any additional comments from the answer please.

        Thank you and you can begin now.
        """

In [5]:
def _split_reasoning_and_json(answer):
    """Split a model answer into (reasoning text, JSON payload text)."""
    reasoning, marker, payload = answer.partition("</think>")
    if not marker:
        # No closing think tag at all: treat the whole answer as the payload
        # instead of failing with an IndexError.
        reasoning, payload = "", answer

    reasoning = reasoning.strip().replace("<think>\n", "")
    payload = payload.strip()

    # Keep only the outermost {...} span. This drops Markdown code fences and any
    # surrounding chatter without touching text inside the JSON values.
    start, end = payload.find("{"), payload.rfind("}")
    if start != -1 and end > start:
        payload = payload[start:end + 1]

    return reasoning, payload


def process_info(full_name, url, timeout = 30):
    """
    Fetch a profile page, ask the local model to structure it, and return one result record.

    The page at `url` is downloaded and the text of every <p> inside the first
    <div class="boxgen"> is joined into a biography. If a biography is found, the
    extended prompt is used; otherwise the name-only prompt is used and the four
    biography fields are filled with "No biography information".

    Args:
        full_name: Full name of the person, passed to the prompt and stored in the record.
        url: Profile URL to download.
        timeout: Seconds passed to `requests.get` so a stalled server cannot hang the run.

    Returns:
        dict: The JSON object produced by the model, plus the keys "full_name", "url",
        "rsn" (the model's reasoning text, empty if none was emitted) and "status"
        ("Profile processed" or "Profile missing").

    Raises:
        json.JSONDecodeError: If no parseable JSON object can be found in the answer.
        Exceptions raised by `requests.get` or by the OpenAI client are not caught here.
    """

    print(f"Processing information for individual: {full_name}")

    response = requests.get(url, timeout = timeout)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "lxml")

    try:
        bio_container = soup.find("div", class_ = "boxgen").find_all("p")
        bio_elements  = [p.text.strip() for p in bio_container]
        bio = "\n".join(bio_elements)
        status = "Profile processed"
    except AttributeError: # soup.find returned None: the page has no "boxgen" container
        print("Error... profile NOT found!!")
        bio = ""
        status = "Profile missing"

    # An empty biography (missing container or no text) falls back to the name-only prompt.
    only_name = not bio
    context_prompt = get_context_prompt(
        only_name = only_name,
        full_name = full_name,
        bio = bio
    )
    print("Bio content NOT found" if only_name else "Bio content found")

    history = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": context_prompt}
    ]

    client = OpenAI(
        base_url = "http://localhost:1234/v1",
        api_key  = "sk-1234"
    )

    chat_completion = client.chat.completions.create(
        messages = history,
        model    = "deepseek-r1-distill-qwen-32b"
    )

    # `content` may be None; normalise so the parsing below fails with a JSON error
    # rather than an AttributeError.
    answer = chat_completion.choices[0].message.content or ""
    think_content, json_content = _split_reasoning_and_json(answer)

    results_json = json.loads(json_content)
    results_json["full_name"] = full_name
    results_json["url"] = url
    results_json["rsn"] = think_content
    results_json["status"] = status

    if only_name:
        results_json["expertise"]        = "No biography information"
        results_json["languages"]        = "No biography information"
        results_json["years_experience"] = "No biography information"
        results_json["public_servant"]   = "No biography information"

    print("=============================================================================")

    return results_json
        

In [11]:
# Read the CSV and keep only the rows at positions 294 and 295.
data = pd.read_csv("../data/vietnam_danhbaluatsu.csv").iloc[294:296]

# Map each full name to its profile URL. Because names are dict keys, rows that
# share a full name collapse to the last URL seen.
target_values = {
    person: link
    for person, link in zip(data["full_name"], data["full_href"])
}

In [12]:
# Run `process_info` for every (full name, profile URL) pair and collect one result dict per person.
# Appending inside the loop means the records gathered so far stay in `processed_data` if a call raises.
processed_data = []
for name,href in target_values.items():
    r = process_info(name, href)
    processed_data.append(r)

Processing information for individual: VƯU VĂN KÍA
Bio content NOT found
Processing information for individual: TRẦN QUANG VINH
Bio content NOT found


In [14]:
# Turn the list of per-person result dicts into a table; the bare `df` renders it as the cell output.
df = pd.DataFrame(processed_data)
df

Unnamed: 0,given_name,middle_name,family_name,gender,full_name,url,rsn,status,expertise,languages,years_experience,public_servant
0,KÍA,VĂN,VƯU,UNCERTAIN,VƯU VĂN KÍA,https://www.danhbaluatsu.com/luat-su/vuu-van-kia/,"Okay, I need to split the Vietnamese name VƯU ...",Profile processed,No biography information,No biography information,No biography information,No biography information
1,Vinh,Quang,Trần,male,TRẦN QUANG VINH,https://www.danhbaluatsu.com/luat-su/tran-quan...,"Okay, so I need to help split a Vietnamese ful...",Profile processed,No biography information,No biography information,No biography information,No biography information


In [None]:
# Load a pickled object from disk and tabulate it (presumably partial results saved by an earlier run — confirm).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only load pickles this project produced.
# NOTE(review): this rebinds `df`, replacing the DataFrame built from `processed_data` earlier in the notebook.
import pickle
with open("../data/vietnam_danhbaluatsu/vietnam_danhbaluatsu_partial.pkl", "rb") as f:
    partial_data = pickle.load(f)

df = pd.DataFrame(partial_data)
df

In [None]:
# NOTE(review): presumably the full name at which an interrupted run should resume — confirm against the code that reads `resume_person`.
resume_person = "TRẦN VĂN AN"