In [8]:
import json
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


In [9]:

def Bo_to_En_translation(url: str):
    """
    Collect the entry links listed on a Glosbe Tibetan-to-English top-words page.

    Downloads ``url``, looks for the ``<div class="page-topwords-list">``
    container and records, for every anchor inside it that carries an
    ``href``, the absolute link and the anchor's visible text.

    Errors are not propagated: every ``Exception`` is caught and reported
    through the ``Message`` and ``Response`` fields of the returned dictionary.

    Args:
        url (str): Address of the Glosbe top-words page to scrape.

    Returns:
        dict: A dictionary of the form
        {
            "Links": list of str,    # absolute URL of each entry
            "Words": list of str,    # anchor text of each entry, same order
            "Message": str,          # "Success" or a description of the error
            "Response": int or None, # status code, see below
            "source_url": str        # the ``url`` argument
        }

        ``Response`` is 200 on success, 408 when the request timed out, the
        HTTP status of the failed response for other request errors (``None``
        when no response was received), 404 when the word-list container is
        missing from the page, and 500 for any other unexpected error.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    final_response = {
        "Links": [],
        "Words": [],
        "Message": "Success",
        "Response": 200,
        "source_url": url
    }

    try:
        # A monotonic clock keeps the duration correct even if the system
        # clock is adjusted while the request is in flight.
        start_time = time.monotonic()
        # timeout is (connect, read) in seconds.
        response = requests.get(url, headers=headers, timeout=(5, 55))
        response.raise_for_status()
        elapsed = time.monotonic() - start_time

        if elapsed > 50:
            print(f"This ULR Took more then 50s: {url}")

        soup = BeautifulSoup(response.content, 'html.parser')

        all_links = []
        all_words = []
        base_link = "https://app.glosbe.com/"

        # The entries live inside a single container div; without it the page
        # does not have the expected layout, so report a parsing failure
        # instead of a "Success" with empty lists.
        word_list = soup.find("div", class_="page-topwords-list")
        if word_list is None:
            raise ValueError(
                "could not find <div class='page-topwords-list'> on the page"
            )

        for link in word_list.find_all("a"):
            href = link.get("href")
            if not href:
                # Anchor without a target: nothing to follow, skip it rather
                # than failing the whole page.
                continue
            # urljoin resolves root-relative, relative and absolute hrefs
            # alike without producing a doubled slash after the host.
            all_links.append(urljoin(base_link, href))
            all_words.append(link.get_text(strip=True))

        final_response["Links"] = all_links
        final_response["Words"] = all_words
        return final_response

    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408  # Request Timeout
        return final_response
    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the webpage: {e}"
        # e.response is None when no HTTP response was received at all.
        final_response["Response"] = getattr(e.response, 'status_code', None)
        return final_response
    except ValueError as e:
        final_response["Message"] = f"An error occurred while parsing the webpage: {e}"
        final_response["Response"] = 404
        return final_response
    except Exception as e:
        final_response["Message"] = f"An unexpected error occurred: {e}"
        final_response["Response"] = 500
        return final_response




In [10]:
# Glosbe top-words index page for the Tibetan (bo) -> English (en) pair, entries 1-1000.
url = "https://app.glosbe.com/bo/en/_topwords/1-1000"

each_senetence_link = Bo_to_En_translation(url)

# Bo_to_En_translation catches its own errors and reports them through the
# "Response" / "Message" fields, so surface a failure here instead of silently
# carrying an error payload into the cells below. Prints nothing on success.
if each_senetence_link["Response"] != 200:
    print(
        f"Scrape failed ({each_senetence_link['Response']}): "
        f"{each_senetence_link['Message']}"
    )


In [11]:
def save_json(path, file_name, data):
    """
    Write ``data`` as indented, UTF-8 encoded JSON to ``file_name`` inside ``path``.

    Args:
        path (str): Directory to write into. It is created (including parents)
            when it does not exist; an empty string means the current
            working directory. A trailing separator is optional.
        file_name (str): Name of the JSON file to create or overwrite.
        data: Any object that ``json.dump`` can serialise.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
        TypeError: If ``data`` contains objects that are not JSON serialisable.
    """
    if path:
        os.makedirs(path, exist_ok=True)
    target = os.path.join(path, file_name)

    # ensure_ascii=False writes non-ASCII text verbatim, so the file must be
    # opened as UTF-8 explicitly instead of relying on the platform default
    # encoding, which may be unable to encode it.
    with open(target, "w", encoding="utf-8") as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    # Report success only after the file has been flushed and closed.
    print(f"Successfully saved: {file_name}")

In [12]:
# Where the scraped top-words payload is written.
path = "./data/"
file_name = "glosbe_top_words_bo_en.json"

save_json(path=path, file_name=file_name, data=each_senetence_link)

Successfully saved: glosbe_top_words_bo_en.json
