In [1]:
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any, List
import time
import json

# Read each page of linguatools

In [23]:
def safe_extract(element, default=''):
    """
    Return the whitespace-stripped ``.text`` of ``element``.

    Args:
    element: Object exposing a ``.text`` string, or a falsy value (e.g. ``None``
        from a lookup that matched nothing).
    default (str): Value returned when ``element`` is falsy.

    Returns:
    str: The stripped text, or ``default``.
    """
    if not element:
        return default
    return element.text.strip()

def _empty_linguatools_record(message: str = "") -> Dict[str, Any]:
    """Build a fresh blank row record whose "Message" field carries ``message``."""
    # A new dict per call: failed rows must not alias one shared placeholder,
    # otherwise every failed row would end up showing the last error message.
    return {
        "English": {
            'Word': "",
            'POS': "",
            'Sentence': ""
        },
        "Tibetan": {
            'Word': "",
            'phonetic': "",
            'Sentence': ""
        },
        "czech": {
            'Word': "",
            'Sentence': ""
        },
        "meta_data": {
            "Comment": "",
            "Source": ""
        },
        "Message": message
    }


def _parse_linguatools_row(row) -> Dict[str, Any]:
    """Extract the English / Tibetan / Czech fields and metadata from one table row."""
    # Query the row once per selector instead of re-scanning it for every field.
    muted_spans = row.find_all('span', class_='text-muted')
    small_tags = row.find_all('small')
    large_spans = row.find_all('span', style='font-size:120%;')
    cells = row.find_all('td')

    czech_cell = cells[2] if len(cells) > 2 else None

    return {
        "English": {
            'Word': safe_extract(row.find('b')),
            'POS': safe_extract(muted_spans[0].find('small') if muted_spans else None),
            'Sentence': safe_extract(small_tags[1] if len(small_tags) > 1 else None)
        },
        "Tibetan": {
            'Word': safe_extract(large_spans[0].find('b') if large_spans else None),
            # The phonetic text is wrapped in square brackets on the page; drop them.
            'phonetic': safe_extract(
                muted_spans[1].find('small') if len(muted_spans) > 1 else None
            ).strip('[]'),
            'Sentence': safe_extract(large_spans[1] if len(large_spans) > 1 else None)
        },
        "czech": {
            'Word': safe_extract(czech_cell.find('b') if czech_cell is not None else None),
            'Sentence': safe_extract(czech_cell.find('small') if czech_cell is not None else None)
        },
        "meta_data": {
            "Comment": safe_extract(cells[3] if len(cells) > 3 else None),
            "Source": safe_extract(muted_spans[-1] if muted_spans else None)
        },
        "Message": "Success",
    }


def scrape_linguatools_translation(url: str) -> Dict[str, Any]:
    """
    Scrapes the dictionary entries table from a linguatools listing page.

    The page is fetched with a browser-like User-Agent, the ``<tbody id="entries">``
    element is located, and every ``<tr>`` inside it is converted into a record with
    "English", "Tibetan", "czech", "meta_data" and "Message" keys.

    Args:
    url (str): The URL of the linguatools page to scrape.

    Returns:
    Dict[str, Any]: A dictionary with three keys:
        "data": dict mapping the 0-based row index to its record on success,
            otherwise the empty string.
        "Message": "Success" or a description of what went wrong.
        "Response": 200 on success, 408 on timeout, the HTTP status (or 500) on
            other request errors, 404 when the entries table is absent, and 504
            for any other error raised while processing the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    final_response = {
        "data": "",
        "Message": "Success",
        "Response": 200
    }

    try:
        response = requests.get(url, headers=headers, timeout=120)
        response.raise_for_status()
        page_soup = BeautifulSoup(response.content, 'html.parser')

        table_body = page_soup.find('tbody', id='entries')
        if table_body is None:
            # Always hand back the response dict; previously this path returned None.
            final_response["Message"] = "No <tbody id='entries'> element found in the page"
            final_response["Response"] = 404
            return final_response

        all_data = {}
        for index, row in enumerate(table_body.find_all("tr")):
            try:
                record = _parse_linguatools_row(row)
            except Exception as e:
                # Keep the row slot so indices stay aligned with the table,
                # and record why this particular row could not be parsed.
                record = _empty_linguatools_record(str(e))
            all_data[index] = record

        final_response['data'] = all_data
        return final_response

    except requests.Timeout:
        final_response["Message"] = "Request timed out"
        final_response["Response"] = 408
        return final_response

    except requests.RequestException as e:
        final_response["Message"] = f"An error occurred while fetching the linguatools: {e}"
        final_response["Response"] = getattr(e.response, 'status_code', 500)
        return final_response

    except Exception as e:
        final_response["Message"] = f"An error occurred in code: {e}"
        final_response["Response"] = 504
        return final_response

    



In [3]:
def save_json(path, file_name, data):
    """
    Write ``data`` as indented JSON to the file ``path + file_name``.

    The file is opened explicitly as UTF-8: ``ensure_ascii=False`` makes
    ``json.dump`` emit non-ASCII characters verbatim, so relying on the platform
    default encoding can raise ``UnicodeEncodeError`` or produce a file that does
    not read back correctly on another machine.

    Args:
    path (str): Directory prefix. It is concatenated as-is with ``file_name``,
        so it must already end with a path separator.
    file_name (str): Name of the file to create or overwrite.
    data: Any JSON-serialisable object.
    """
    with open(path + file_name, "w", encoding="utf-8") as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    # Report only after the file has been closed, i.e. fully flushed to disk.
    print(f"Successfully saved: {file_name}")

In [4]:
def read_json(path, file_name):
    """
    Load and return the JSON document stored at ``path + file_name``.

    The file is decoded explicitly as UTF-8 so that non-ASCII text written with
    ``ensure_ascii=False`` is read back correctly regardless of the platform
    default encoding.

    Args:
    path (str): Directory prefix. It is concatenated as-is with ``file_name``,
        so it must already end with a path separator.
    file_name (str): Name of the file to read.

    Returns:
    The deserialised JSON content (JSON object keys are always strings).
    """
    with open(path + file_name, 'r', encoding="utf-8") as openfile:
        loaded_file = json.load(openfile)

    print(f"Successfully loaded: {file_name}")
    return loaded_file


## Running linguatools page 1 with 5000 rows

In [30]:
# Listing URL for page 1; the query string asks for 5000 rows per page.
url = "https://linguatools.info/?page=1&per_page=5000"
# Scrape the page; the parsed rows end up under the "data" key of the result.
linguatools_page1 = scrape_linguatools_translation(url)

In [31]:
# Number of entries stored under "data" for page 1.
len(linguatools_page1["data"])

5000

In [35]:
# Count the page 1 rows whose extraction did not succeed and report each one.
index = 0

page1_rows = linguatools_page1["data"]
for keys in page1_rows:
    if page1_rows[keys]["Message"] != "Success":
        # Per-row messages live under "data"; looking the row key up on the
        # top-level response dict would raise KeyError.
        print(keys, page1_rows[keys]["Message"])
        index += 1

print(f"Total error in extracting the page 1 : {index}")

Total error in extracting the page 1 : 0


In [36]:
# Persist the page 1 result as JSON. save_json concatenates path + file_name,
# so the trailing slash on the directory is required.
path = "./data/" 
save_json(path, file_name="linguatools_page1.json", data=linguatools_page1)


Successfully saved: linguatools_page1.json


## Running linguatools page 2 with 5000 rows

In [37]:
# Listing URL for page 2; the query string asks for 5000 rows per page.
url = "https://linguatools.info/?page=2&per_page=5000"
# Scrape the page; the parsed rows end up under the "data" key of the result.
linguatools_page2 = scrape_linguatools_translation(url)

In [38]:
# Number of entries stored under "data" for page 2.
len(linguatools_page2["data"])

3678

In [39]:
# Count the page 2 rows whose extraction did not succeed and report each one.
index = 0

page2_rows = linguatools_page2["data"]
for keys in page2_rows:
    if page2_rows[keys]["Message"] != "Success":
        # Per-row messages live under "data"; looking the row key up on the
        # top-level response dict would raise KeyError.
        print(keys, page2_rows[keys]["Message"])
        index += 1

print(f"Total error in extracting the page 2 : {index}")

Total error in extracting the page 2 : 0


In [40]:
# Persist the page 2 result as JSON. save_json concatenates path + file_name,
# so the trailing slash on the directory is required.
path = "./data/" 
save_json(path, file_name="linguatools_page2.json", data=linguatools_page2)


Successfully saved: linguatools_page2.json


# Fixing the JSON Unicode issues by re-saving the files with
- json.dump(data, outfile, ensure_ascii=False)

In [7]:
# Reload the saved page 1 file so that it can be written out again.
path = "./data/"
filename = "linguatools_page1.json"
json_data = read_json(path, filename)
# Entry count after the round trip; JSON object keys come back as strings,
# not the integer row indices used when scraping.
len(json_data["data"])

Successfully loaded: linguatools_page1.json


5000

In [9]:
# Re-write the page 1 file through save_json, which dumps with ensure_ascii=False.
save_json(path, filename, json_data)

Successfully saved: linguatools_page1.json


#### page 2


In [10]:

# Same reload-and-rewrite round trip for page 2.
# `path` is not set in this cell; it must already be defined by an earlier cell.
filename = "linguatools_page2.json"
json_data = read_json(path, filename)
# This value is discarded: it is not the cell's last expression, so nothing is displayed.
len(json_data["data"])
save_json(path, filename, json_data)

Successfully loaded: linguatools_page2.json
Successfully saved: linguatools_page2.json
