In [30]:
import re
import os
import requests
from openai import OpenAI
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import pandas as pd

# Populate the process environment from a .env file (python-dotenv) so that the
# os.getenv('OPENAI_API_KEY') lookups in the helper functions can find the key.
load_dotenv()

True

In [9]:
def transcribe_name(name, model="gpt-4o-2024-08-06"):
    """Transcribe a name from Mongolian Cyrillic to the Latin alphabet.

    The transcription is obtained from the OpenAI chat-completions API using
    the key stored in the ``OPENAI_API_KEY`` environment variable.

    Parameters
    ----------
    name : str
        Name written in Mongolian Cyrillic.
    model : str, optional
        Chat-completion model to query. Defaults to the model that was
        previously hard-coded in this function.

    Returns
    -------
    str
        The model's reply with surrounding whitespace removed. An empty string
        is returned when `name` is None or blank (no request is sent) or when
        the reply carries no text content.
    """
    # Nothing to transcribe: skip the request rather than asking the model to
    # transcribe an empty string, which would yield an arbitrary reply.
    if name is None or not str(name).strip():
        return ""

    client = OpenAI(
        api_key = os.getenv('OPENAI_API_KEY')
    )

    history = [
        {
            "role": "system", 
            "content": """
            You are a very helpful assistant. I need your help to transcribe a name from Mongolian Cyrillic to the Latin alphabet. 
            """
        },
        {
            "role": "user",   
            "content": f"""
            Please transcribe the following name from the Mongolian Cyrillic alphabet to the Latin alphabet: {name}.
            Return ONLY the transcribed name and refrained from providing any additional information.
            """
        }
    ]
    chat_completion = client.chat.completions.create(
        messages = history,
        model    = model
    )
    feedback = chat_completion.choices[0].message.content

    # The API may return no content at all; normalise that case to "" and drop
    # the leading/trailing whitespace or newlines the model sometimes adds.
    return (feedback or "").strip()

In [14]:
def translate_practices(practices, model="gpt-4o-2024-08-06"):
    """Translate a list of legal practice areas from Mongolian to English.

    The translation is obtained from the OpenAI chat-completions API using
    the key stored in the ``OPENAI_API_KEY`` environment variable.

    Parameters
    ----------
    practices : str
        Legal areas in Mongolian, as a single piece of text.
    model : str, optional
        Chat-completion model to query. Defaults to the model that was
        previously hard-coded in this function.

    Returns
    -------
    str
        The model's reply with surrounding whitespace removed. An empty string
        is returned when `practices` is None or blank (no request is sent) or
        when the reply carries no text content.
    """
    # An empty practice list has nothing to translate; sending it anyway makes
    # the model produce an arbitrary reply that then has to be cleaned up later.
    if practices is None or not str(practices).strip():
        return ""

    client = OpenAI(
        api_key = os.getenv('OPENAI_API_KEY')
    )

    history = [
        {
            "role": "system", 
            "content": """
            You are a very helpful assistant. I need your help to translate a set of legal areas from Mongolian to English. 
            """
        },
        {
            "role": "user",   
            "content": f"""
            Please translate the following legal areas from Mongolian to English: {practices}.
            Return ONLY the translated text and refrained from providing any additional information.
            """
        }
    ]
    chat_completion = client.chat.completions.create(
        messages = history,
        model    = model
    )
    feedback = chat_completion.choices[0].message.content

    # The API may return no content at all; normalise that case to "" and drop
    # the leading/trailing whitespace or newlines the model sometimes adds.
    return (feedback or "").strip()

In [33]:
def _labelled_input_value(soup, label):
    """Return the ``value`` of the <input> that follows the <span> matching `label`.

    Returns 'Not Found' when the span or its sibling <input> is absent.
    """
    try:
        return soup.find('span', string = re.compile(label)).find_next_sibling('input').get('value')
    except AttributeError:
        return 'Not Found'


def get_lawyer_info(lawyer_href, timeout=30):
    """Scrape one lawyer profile page and return it as a flat record.

    Parameters
    ----------
    lawyer_href : str
        Path of the profile page, appended to ``https://www.mglbar.mn/``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    dict
        One entry per output column (see `lawyer_entry` below). Fields that
        cannot be located on the page are set to 'Not Found'; fields this
        scraper does not collect are set to 'NA'.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    """
    # NOTE(review): assumes lawyer_href has no leading '/'; otherwise the URL
    # gets a double slash — confirm against the hrefs on the listing page.
    lawyer_url = f'https://www.mglbar.mn/{lawyer_href}'

    # Without a timeout a stalled connection would block the whole scrape, and
    # without the status check an error page would be parsed as a profile.
    r = requests.get(lawyer_url, timeout = timeout)
    r.raise_for_status()
    s = BeautifulSoup(r.content, 'lxml')

    # Treat a missing name like every other missing field instead of crashing,
    # and do not send the placeholder to the transcription model.
    try:
        full_name = s.find('div', class_ = 'lawyer-view-name').text.strip().replace(' - ', ', ')
    except AttributeError:
        full_name = 'Not Found'
        full_name_transcribed = 'Not Found'
    else:
        full_name_transcribed = transcribe_name(full_name)

    email = _labelled_input_value(s, 'И-мэйл хаяг:')
    phone = _labelled_input_value(s, 'Утас:')

    try:
        practice_list = s.find('div', class_ = 'uk-margin-medium-top').find_all('div', class_ = 'mb-20')
        practice = ', '.join([item.text.strip() for item in practice_list])
        # An empty list of practice areas has nothing to translate.
        practice_translated = translate_practices(practice) if practice else ''

    except AttributeError:
        practice = 'Not Found'
        practice_translated = 'Not Found'

    lawyer_entry = {
        "country"         : "Mongolia",
        "title"           : "Өмгөөлөгч",
        "title_translated": "Lawyer",
        "generic_title"   : True,
        "full_name"       : full_name,
        "full_name_transcribed": full_name_transcribed,
        "gender"          : "NA",
        "email"           : email,
        "languages"       : "NA",
        "position"        : "NA",
        "organization"    : "NA",
        "phone"           : phone,
        "mobile"          : "NA",
        "practice"        : practice,
        "practice_translated" : practice_translated,
        "full_href"       : lawyer_url
    }

    return lawyer_entry


In [None]:
# Accumulator for the scraped records: the scraping loop appends one dict per
# lawyer, and the list is later turned into a DataFrame.
results = []

In [34]:
# Listing pages to crawl (inclusive) and the number of lawyers requested per page.
# NOTE(review): the crawl starts at page 2 — confirm page 1 is covered elsewhere.
FIRST_PAGE = 2
LAST_PAGE = 105
PAGE_SIZE = 20
REQUEST_TIMEOUT = 30  # seconds to wait for each listing page

for page in range(FIRST_PAGE, LAST_PAGE + 1):

    print(f'Extracting results from page {page}...')
    url = f'https://www.mglbar.mn/lawyer/advocate?page={page}&size={PAGE_SIZE}'

    # Fail fast on a stalled connection or an HTTP error instead of hanging or
    # parsing an error page as if it were a listing.
    r = requests.get(url, timeout = REQUEST_TIMEOUT)
    r.raise_for_status()
    s = BeautifulSoup(r.content, 'lxml')

    # find() returns None when the container is absent; report the page and
    # move on rather than aborting the crawl with an AttributeError.
    listing = s.find('div', class_ = 'width-1-1 mt-40 mb-40 visible')
    if listing is None:
        print(f'No lawyer listing found on page {page}; skipping.')
        continue

    lawyer_cards = listing.find_all('div', class_ = 'lawyer-item-box mb-20')

    for card in lawyer_cards:
        lawyer_href = card.find('div', class_ = 'overflow-2').find('a').get('href')
        lawyer_data = get_lawyer_info(lawyer_href)
        results.append(lawyer_data)

Extracting results from page 2...
Extracting results from page 3...
Extracting results from page 4...
Extracting results from page 5...
Extracting results from page 6...
Extracting results from page 7...
Extracting results from page 8...
Extracting results from page 9...
Extracting results from page 10...
Extracting results from page 11...
Extracting results from page 12...
Extracting results from page 13...
Extracting results from page 14...
Extracting results from page 15...
Extracting results from page 16...
Extracting results from page 17...
Extracting results from page 18...
Extracting results from page 19...
Extracting results from page 20...
Extracting results from page 21...
Extracting results from page 22...
Extracting results from page 23...
Extracting results from page 24...
Extracting results from page 25...
Extracting results from page 26...
Extracting results from page 27...
Extracting results from page 28...
Extracting results from page 29...
Extracting results from page

In [37]:
# Tabulate the scraped records (one row per lawyer), then keep an independent
# working copy so later edits do not touch the freshly built frame.
df = pd.DataFrame(data = results)
master_data = df.copy(deep = True)

In [47]:
# Rows whose practice list came back empty get an empty translation as well
# (presumably the model still returned some text for them — confirm).
has_no_practice = master_data['practice'].eq('')
master_data.loc[has_no_practice, 'practice_translated'] = ''

# Write the final table to disk without the index column.
master_data.to_csv(
    path_or_buf = '../data/mongolia_mglbar.csv',
    encoding = 'utf-8',
    index = False,
)