In [45]:
import math
import os
import shutil
import tempfile
import zipfile
from collections import defaultdict

import datasets
import requests
from lingua import LanguageDetectorBuilder


def generate_dataset(source, target, output_dir, cache_dir=None, corpus_list: list[str] | None = None):
    """Generate a parallel dataset from OPUS for source and target languages.
    
    Steps:
    1. Download Moses-preprocessed parallel corpora from OPUS for the language pair
    2. Extract files from downloaded zip archives to working directory
    3. Load text files as HuggingFace datasets
    4. Concatenate source and target language datasets
    5. Add corpus identifier to track data source
    
    Args:
        source: Source language code (e.g. 'en')
        target: Target language code (e.g. 'fr') 
        output_dir: Directory to save final dataset
        language_threshold: Minimum confidence threshold for language detection (default: 90)
        cache_dir: Optional directory to cache downloaded files (default: None, uses temp dir)
        corpus_list: Optional list of corpora to download (default: None, download all)
        
    Returns:
        HuggingFace Dataset containing parallel texts and corpus identifiers
    """
    
    # download moses dataset
    url = f"https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest"
    response = requests.get(url)
    response.raise_for_status()

   
    
    if corpus_list:
        urls = [corpus['url'] for corpus in response.json()['corpora'] if corpus['corpus'] in corpus_list]
    else:
        urls = [corpus['url'] for corpus in response.json()['corpora']]
    
    print(f"downloading {len(urls)} corpora")
    
    # Use cache_dir if provided, otherwise use temp dir
    working_dir = cache_dir if cache_dir else tempfile.mkdtemp()
    os.makedirs(working_dir, exist_ok=True)
    
    try:
        # Download each corpus file if not in cache
        downloaded_file_names = defaultdict(dict)
        for url in urls:
            filename = os.path.basename(url)
            corpus_name = url.split('/')[-4]
            filepath = os.path.join(working_dir, f"{corpus_name}-{filename}")
            
            if not os.path.exists(filepath):
                response = requests.get(url)
                response.raise_for_status()
                
                with open(filepath, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded: {filepath}")
            else:
                print(f"Using cached file: {filepath}")
            
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                file_list = zip_ref.namelist()                
                for file_name in file_list:
                    language = file_name.split('.')[-1]
                    corpus = file_name.split('.')[0]
                    if language == source or language == target:
                        extracted_path = os.path.join(working_dir, file_name)
                        if not os.path.exists(extracted_path):
                            zip_ref.extract(file_name, working_dir)
                        downloaded_file_names[corpus][language] = file_name

        # load the files as huggingface datasets
        all_datasets = []
        for corpus, file_names in downloaded_file_names.items():
            source_dataset = datasets.load_dataset('text', data_files=os.path.join(working_dir, file_names[source]))['train']
            source_dataset = source_dataset.rename_column('text', source)
            target_dataset = datasets.load_dataset('text', data_files=os.path.join(working_dir, file_names[target]))['train']
            target_dataset = target_dataset.rename_column('text', target)
            dataset = datasets.concatenate_datasets([source_dataset, target_dataset], axis=1)
            dataset = dataset.map(lambda example: {'corpus': corpus})
            all_datasets.append(dataset)
        all_datasets: datasets.Dataset = datasets.concatenate_datasets(all_datasets, axis=0)
        all_datasets = all_datasets.class_encode_column('corpus')

        # Filter out samples that are not in the source or target language
        detector = LanguageDetectorBuilder.from_all_languages().build()
        def check_language(text: str, lang_id: str) -> bool:
            language = detector.detect_language_of(text)
            if not language:
                return False
            return language.iso_code_639_1.name.lower() == lang_id
        
        original_length = len(all_datasets)
        all_datasets = all_datasets.filter(lambda example: check_language(example[source], source))
        all_datasets = all_datasets.filter(lambda example: check_language(example[target], target))
        filtered_length = len(all_datasets)
        print(f"Filtered {original_length - filtered_length} samples from {original_length} samples. It is now {filtered_length/original_length*100:.2f}% of the original dataset.")
        
        # do not shuffle to keep sentence order (some are paragraphs)
        all_datasets = all_datasets.train_test_split(test_size=0.1, stratify_by_column='corpus', shuffle=False)
        all_datasets.save_to_disk(output_dir)
        
    finally:
        # Clean up temp dir if we created one
        if not cache_dir and os.path.exists(working_dir):
            import shutil
            shutil.rmtree(working_dir)


# Build the small en-fr dataset from two OPUS corpora, reusing downloads in 'cache'.
generate_dataset(
    source='en',
    target='fr',
    output_dir='output',
    cache_dir='cache',
    corpus_list=['bible-uedin', 'ECDC'],
)

downloading 2 corpora
Using cached file: cache/OPUS-bible-uedin-en-fr.txt.zip
Using cached file: cache/OPUS-ECDC-en-fr.txt.zip


Filter:   0%|          | 0/64756 [00:00<?, ? examples/s]

Filter:   0%|          | 0/63001 [00:00<?, ? examples/s]

Filtered 2138 samples from 64756 samples. It is now 96.70% of the original dataset.


Saving the dataset (0/1 shards):   0%|          | 0/56356 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6262 [00:00<?, ? examples/s]

In [46]:
# notebook_login() doesn't work in VS Code; log in from the CLI instead: 'huggingface-cli login'
# The `if False` guard keeps this cell a no-op when it is run.
if False:
    import huggingface_hub
    huggingface_hub.notebook_login()

In [47]:
import datasets
# Reload the dataset saved under 'output' and upload it to the Hub as 'opus-en-fr-small'.
dataset = datasets.load_from_disk('output')
dataset.push_to_hub('opus-en-fr-small')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/57 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/495 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ShinnosukeU/opus-en-fr-small/commit/f26e9a92e0e68839537f0d700bea318268ff777f', commit_message='Upload dataset', commit_description='', oid='f26e9a92e0e68839537f0d700bea318268ff777f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ShinnosukeU/opus-en-fr-small', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ShinnosukeU/opus-en-fr-small'), pr_revision=None, pr_num=None)

In [48]:
# Reload the saved dataset from 'output' for the inspection cells below.
dataset = datasets.load_from_disk('output')


In [49]:
# Show the column schema of the train split.
dataset['train'].features

{'en': Value(dtype='string', id=None),
 'fr': Value(dtype='string', id=None),
 'corpus': ClassLabel(names=['ECDC', 'bible-uedin'], id=None)}

In [51]:
# Spot-check a single row of the train split.
dataset['train'][10000]

{'en': 'Open thou mine eyes, that I may behold wondrous things out of thy law.',
 'fr': 'Ouvre mes yeux, pour que je contemple Les merveilles de ta loi!',
 'corpus': 1}