In [7]:
import requests
from bs4 import BeautifulSoup
import time
import json
from typing import Dict, Tuple, Optional
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


locale = "br"

# Path of the items file for this locale (relative to the working directory)
file_path = f"{locale}-items.json"

# Read the items document from the JSON file.
# On any failure an empty document with the expected shape ({"keys": []}) is
# used, so cells that index items["keys"] keep working instead of raising a
# TypeError on a bare list.
try:
    with open(file_path, encoding="utf-8") as f:
        content = f.read()
    if not content.strip():
        raise ValueError("The JSON file is empty.")
    items = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    items = {"keys": []}
except json.JSONDecodeError:
    # Must precede ValueError: JSONDecodeError is a ValueError subclass and
    # would otherwise never reach its own handler.
    print(f"Error: Invalid JSON content in {file_path}")
    items = {"keys": []}
except ValueError as e:
    print(f"Error: {e}")
    items = {"keys": []}

print(items["keys"][:10])

[{'w': 'ar', 'r': 42}, {'w': 'bern', 'r': 55}, {'w': 'laouen', 'r': 100}, {'w': "c'hoarvezout", 'r': 100}, {'w': 'letern', 'r': 100}, {'w': 'enaou', 'r': 100}, {'w': 'demat', 'r': 100}, {'w': 'ene', 'r': 101}, {'w': 'soubenn', 'r': 101}, {'w': 'kraoñ', 'r': 101}]


In [33]:
# Calculate average rating
def calculate_average_rating(items: Dict) -> float:
    """Print and return the mean of the "r" values found in `items`.

    Args:
        items: Iterable of item mappings. In this notebook it is called with
            the list stored under items["keys"]; elements without an "r" key
            are skipped.

    Returns:
        The arithmetic mean of the collected ratings, or 0 when no element
        carries a rating.
    """
    ratings = [entry["r"] for entry in items if "r" in entry]

    if ratings:
        avr = sum(ratings) / len(ratings)
    else:
        avr = 0

    print(f"\nAverage rating: {avr:.2f}")
    return avr

# Average rating over the currently loaded items (the function also prints it).
avr = calculate_average_rating(items["keys"])


Average rating: 824.63


In [37]:
import random
from math import floor

# Ratings strictly inside (BAND_LOW, BAND_HIGH) are replaced by
# BAND_CENTER plus an integer jitter drawn from [JITTER_MIN, JITTER_MAX].
BAND_LOW, BAND_HIGH = 530, 600
BAND_CENTER = 560
JITTER_MIN, JITTER_MAX = -30, 40
JITTER_SEED = 42

# Dedicated, seeded generator so re-running the cell on the same input
# produces the same ratings.
rng = random.Random(JITTER_SEED)

new_keys = []
for item in tqdm(items["keys"], desc="Processing items"):
    if BAND_LOW < item["r"] < BAND_HIGH:
        # randint already returns an int, so no rounding is needed.
        # Only "w" and "r" are carried over into the replacement entry.
        new_keys.append(
            {"w": item["w"], "r": BAND_CENTER + rng.randint(JITTER_MIN, JITTER_MAX)}
        )
    else:
        new_keys.append(item)

calculate_average_rating(new_keys)
# Order the items by ascending rating
new_keys.sort(key=lambda x: x["r"])
items["keys"] = new_keys

# Write the updated items back to the JSON file.
# ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of depending on the platform default.
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(items, f, indent=0, ensure_ascii=False)

Processing items:   0%|                                                                                                                | 0/62169 [00:00<?, ?it/s]

Processing items: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 62169/62169 [00:00<00:00, 1068504.44it/s]



Average rating: 856.84


In [19]:
# Concurrently fetch all entries in Meurgorf (using a thread pool) to get their frequency rating (1, 2 or 3)
import requests
from bs4 import BeautifulSoup
import time
import json
from typing import Dict, Tuple, Optional
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session() -> requests.Session:
    """Build a requests session with automatic retries and connection pooling.

    A single HTTPAdapter is mounted for both "http://" and "https://". It
    retries up to 3 times (backoff factor 1) on status codes 429, 500, 502,
    503 and 504, and is configured with pool_connections=10 and
    pool_maxsize=10.

    Returns:
        The configured session.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        ),
        pool_connections=10,
        pool_maxsize=10,
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    return http

def scrape_single_entry(n: int, session: requests.Session) -> Optional[Tuple[str, int]]:
    """
    Fetch one Meurgorf entry page and extract its headword and rating.

    The rating is the integer held in the `data-rating` attribute of the first
    element with class `rating`. The headword is the part of the page title
    that precedes the " −" separator, stripped of surrounding whitespace.

    Args:
        n: Entry number appended to the Meurgorf URL
        session: Requests session used for the GET request (10 second timeout)

    Returns:
        Tuple of (entry_name, rating), or None when the request fails, a
        required element is missing or malformed, or any other exception
        is raised
    """
    title_separator = " −"

    try:
        page = session.get(f"https://niverel.brezhoneg.bzh/br/meurgorf/{n}", timeout=10)
        page.raise_for_status()

        document = BeautifulSoup(page.content, 'html.parser')

        # Rating: must exist, be non-empty and parse as an integer
        rating_tag = document.find(class_='rating')
        raw_rating = rating_tag.get('data-rating') if rating_tag else None
        if not raw_rating:
            return None
        try:
            rating_value = int(raw_rating)
        except ValueError:
            return None

        # Headword: taken from the <title>, which must contain the separator
        title_tag = document.find('title')
        if not title_tag:
            return None
        title_text = title_tag.get_text()
        if title_separator not in title_text:
            return None

        return (title_text.split(title_separator)[0].strip(), rating_value)

    except Exception:
        # Best-effort scrape: request errors (requests.RequestException is an
        # Exception subclass) and anything unexpected both yield None.
        return None

def scrape_breton_dictionary_ratings(
    max_workers: int = 20,
    first_entry: int = 1,
    last_entry: int = 62494,
) -> Dict[str, int]:
    """
    Scrapes the Breton dictionary website to collect entries and their ratings using multi-threading.

    Every entry number in [first_entry, last_entry] is fetched with
    `scrape_single_entry`. When several entry numbers yield the same entry
    name, the highest rating seen is kept.

    NOTE(review): earlier documentation of this function said the *lowest*
    rating was kept, but the comparison has always kept the highest; the
    behavior is left unchanged here - confirm which one is intended.

    Args:
        max_workers: Maximum number of concurrent threads
        first_entry: First entry number to fetch (inclusive)
        last_entry: Last entry number to fetch (inclusive)

    Returns:
        Dict[str, int]: Dictionary mapping entry names to their highest rating
    """
    dictionary_ratings: Dict[str, int] = {}
    skipped = 0

    # One session per worker thread, created lazily on the thread's first task.
    # A task-index round-robin would let several threads use the same Session
    # at once, which requests does not guarantee to be safe.
    thread_state = threading.local()
    opened_sessions = []
    sessions_lock = threading.Lock()

    def fetch_entry(entry_num: int) -> Optional[Tuple[str, int]]:
        """Scrape one entry with the calling thread's own session."""
        session = getattr(thread_state, "session", None)
        if session is None:
            session = create_session()
            thread_state.session = session
            with sessions_lock:
                opened_sessions.append(session)
        return scrape_single_entry(entry_num, session)

    entry_numbers = range(first_entry, last_entry + 1)

    try:
        with tqdm(total=len(entry_numbers), desc="Scraping entries") as pbar:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [executor.submit(fetch_entry, n) for n in entry_numbers]

                # Results are merged in the calling thread only, so the
                # dictionary itself needs no lock.
                for future in as_completed(futures):
                    try:
                        result = future.result()
                    except Exception:
                        # Best effort: a failing task must not abort the run,
                        # but it is counted instead of silently ignored.
                        result = None

                    if result is None:
                        skipped += 1
                    else:
                        entry, rating = result
                        previous = dictionary_ratings.get(entry)
                        if previous is None or rating > previous:
                            dictionary_ratings[entry] = rating

                    pbar.update(1)
    finally:
        # Runs after the executor has joined its threads, also on error.
        for session in opened_sessions:
            session.close()

    print(f"Scraping complete. Found {len(dictionary_ratings)} unique entries.")
    print(f"Skipped {skipped} entries (request failed or rating/title not found).")
    return dictionary_ratings

def save_ratings_to_file(entries: Dict[str, int], locale: str = "br"):
    """Write the ratings mapping as UTF-8 JSON to locales/<locale>/ratings.json.

    The parent directory is created when missing, and the final path is
    printed once the file has been written.

    Args:
        entries: Mapping of entry name to rating
        locale: Locale code used as the sub-directory name
    """
    import os

    output_file_path = f"locales/{locale}/ratings.json"

    # Make sure the target directory exists before opening the file
    parent_dir, _ = os.path.split(output_file_path)
    os.makedirs(parent_dir, exist_ok=True)

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(entries, outfile, ensure_ascii=False, indent=2)

    print(f"Results saved to {output_file_path}")

if __name__ == "__main__":
    # Locale whose ratings file is written (edit here to target another one)
    locale = "br"

    # Run the full scrape with 20 worker threads; tune max_workers to what
    # both this machine and the remote server tolerate.
    entries = scrape_breton_dictionary_ratings(max_workers=20)

    # Persist the scraped ratings
    save_ratings_to_file(entries, locale)

    # Summary statistics
    print("\nStatistics:")
    print(f"Total unique entries: {len(entries)}")

    # Tally how many entries carry each rating value
    rating_counts = {}
    for rating in entries.values():
        rating_counts.setdefault(rating, 0)
        rating_counts[rating] += 1

    # Report the tally in ascending rating order
    for rating in sorted(rating_counts):
        print(f"Rating {rating}: {rating_counts[rating]} entries")

Scraping entries: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 62494/62494 [3:19:09<00:00,  5.23it/s]

Scraping complete. Found 55715 unique entries.
Results saved to locales/br/ratings.json

Statistics:
Total unique entries: 55715
Rating 1: 6867 entries
Rating 2: 47740 entries
Rating 3: 1108 entries





In [17]:
# Path of the scraped Meurgorf ratings file.
# A dedicated name is used so the items path held in `file_path` (which another
# cell opens for writing) is not overwritten by this cell.
# NOTE(review): save_ratings_to_file writes to locales/<locale>/ratings.json -
# confirm that this relative path points at the intended file.
ratings_path = "ratings.json"

# Read the ratings mapping (word -> Meurgorf rating) from the JSON file.
# On failure `entries` becomes an empty mapping; `items` is left untouched.
try:
    with open(ratings_path, encoding="utf-8") as f:
        content = f.read()
    if not content.strip():
        raise ValueError("The JSON file is empty.")
    entries = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {ratings_path}")
    entries = {}
except json.JSONDecodeError:
    # Must precede ValueError: JSONDecodeError is a ValueError subclass.
    print(f"Error: Invalid JSON content in {ratings_path}")
    entries = {}
except ValueError as e:
    print(f"Error: {e}")
    entries = {}

print(len(items["keys"]))

# Amount added to an item's "r" for each Meurgorf rating. Only rating 2
# currently changes anything; 1 and 3 are kept as explicit zero entries.
# NOTE(review): the previous inline comments described rating 1/2 as "rare"
# and the remaining case as "really frequent" - confirm before tuning.
RATING_DELTAS = {1: 0, 2: -200, 3: 0}
# Amount added when the word has no Meurgorf rating at all.
MISSING_DELTA = 200

# Number of item words that have no rating in the Meurgorf mapping.
in_devri_not_in_MG = 0

if not entries:
    # Without ratings every word would look "missing" and be shifted.
    print("No Meurgorf ratings loaded; item ratings left unchanged.")
else:
    # NOTE: this mutates items["keys"] in place, so re-running the cell
    # applies the deltas again.
    for item in items["keys"]:
        item_rating = entries.get(item["w"])

        if item_rating is None:
            in_devri_not_in_MG += 1
            print(item["w"])
            item["r"] += MISSING_DELTA
            continue

        if item_rating not in (1, 2, 3):
            # Unexpected value: report it and leave the item unchanged.
            print(item_rating)
            continue

        item["r"] += RATING_DELTAS[item_rating]

print(in_devri_not_in_MG)

print(items["keys"][:10])

62169


NameError: name 'tems' is not defined

In [9]:
def save_updated_ratings(entries: Dict, locale: str = "br"):
    """Write the items document as UTF-8 JSON to locales/<locale>/<locale>-items.json.

    Despite the parameter name, this notebook passes the whole items document
    (a dict whose "keys" entry holds the list of {"w", "r"} items), not a
    word -> rating mapping; any JSON-serializable dict is accepted.

    Args:
        entries: Document to serialize
        locale: Locale code used for both the sub-directory and the file name
    """
    import os

    output_file_path = f"locales/{locale}/{locale}-items.json"

    # Create the target directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(entries, outfile, ensure_ascii=False, indent=2)

    print(f"Results saved to {output_file_path}")

# Persist the in-memory items document using the default locale ("br").
save_updated_ratings(items)

Results saved to locales/br/br-items.json
