# Translation

This notebook serves as a testing playground for dataset translation.

In [1]:
import time
import random
import functools
import itertools
import requests
from typing import Dict, Any, Callable, Iterable

import pandas as pd

from transformers import pipeline

## Config

Notebook-level config:
* `OUTPUT_DATA_SIZE`: number of rows to be translated per language (can be set to `None`)
* `BATCH_SIZE`: number of records to be translated at once
* `BACKUP_SIZE`: number of records backuped at once

In [25]:
OUTPUT_DATA_SIZE_LIMIT = None  # there are 76879 rows in total
BATCH_SIZE = 100
BACKUP_SIZE = 1_000

BACKUP_PATH = "./backups/"

In [26]:
def _generate_hex_id(id_length: int = 8):
    hex_string = '0123456789abcdef'
    return ''.join([random.choice(hex_string) for x in range(id_length)])

In [27]:
RUN_ID = _generate_hex_id()
RUN_ID

'3d6ba3a3'

## HuggingFace API

### Settings

In [28]:
HEADERS_DFLT = {"Authorization": "Bearer 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"}

In [29]:
API_URLS = {
    "CS": "https://api-inference.huggingface.co/models/Helsinki-NLP/opus-mt-tc-big-en-ces_slk",
    "DE": "https://api-inference.huggingface.co/models/Helsinki-NLP/opus-mt-en-de",
    "SL": "https://api-inference.huggingface.co/models/Helsinki-NLP/opus-mt-en-sla",
}

### Definitions

In [30]:
TranslationFunction = Callable[[Dict[str, str], Dict[str, Any]], Dict[str, Any]]


def _call_hf_api(
    payload: Dict[str, str], 
    api_url, 
    headers: Dict[str, Any] = None
) -> Dict[str, Any]:
    """Performs one HugigngFace API call"""
    if headers is None:
        headers = HEADERS_DFLT
        
    response = requests.post(api_url, headers=headers, json=payload)
    return response.json()


def translate_text_api(
    text: str, 
    api_translation_function: TranslationFunction,
    n_attempts: int = 10,
    sleep_time: float = 1.0,
    verbose: bool = True,
) -> str: 
    """Uses the API translation funciton to translate plain text"""
    for i in range(n_attempts):
        if verbose:
            print(f"(attempt={i + 1}): Attempting to translate `{text}`...")
        response = api_translation_function(text)
        try:
            result = response[0]["translation_text"]
            if verbose:
                print(f"successfully translated: `{text}` -> `{result}`")
            return result
        except KeyError: 
            continue
            
    raise requests.Timeout(f"Text could not be translated: `{text}`")

In [31]:
api_translate_en_to_cs = functools.partial(_call_hf_api, api_url=API_URLS["CS"])
api_translate_en_to_sl = functools.partial(_call_hf_api, api_url=API_URLS["SL"])
api_translate_en_to_de = functools.partial(_call_hf_api, api_url=API_URLS["DE"])

## HuggingFace Pipelines

### Settings

In [32]:
MODELS_DICT = {
    "CS": "Helsinki-NLP/opus-mt-tc-big-en-ces_slk",
    "DE": "Helsinki-NLP/opus-mt-en-de",
    "SL": "Helsinki-NLP/opus-mt-en-sla",
}

### Definitions

In [33]:
def create_pipeline_translator(model_name: str):
    """Returns a translation function that leverages the HF pretrained pipelines"""
    pipe = pipeline("translation", model_name)
    
    def _translate_text(text): 
        try:
            return pipe.predict(text)[0]["translation_text"]
        except (IndexError, KeyError):
            return None
    
    return _translate_text

In [34]:
pipeline_transate_en_to_cs = create_pipeline_translator(model_name=MODELS_DICT["CS"])
pipeline_transate_en_to_de = create_pipeline_translator(model_name=MODELS_DICT["DE"])
pipeline_transate_en_to_sl = create_pipeline_translator(model_name=MODELS_DICT["SL"])

## Paraphrase data

Translation of bi-texts so that we obtain paraphrasing datasets.

### Settings

In [35]:
DATA_DIR_PATH = "./data/data_filtered/"

DATA_PATHS = {
    "CS": DATA_DIR_PATH + "cz_en_filtered.csv",
    "DE": DATA_DIR_PATH + "de_en_filtered.csv",
    "SL": DATA_DIR_PATH + "sl_en_filtered.csv",
}

### Definitions

In [36]:
def load_data(path: str, limit: int = None):
    """Wrapper around data loading (all datasets are probably gonna be in the same format)"""
    df_full = pd.read_csv(path)
    if limit is None or limit >= df_full.shape[0]:
        return df_full
    drop_idx = pd.RangeIndex(start=limit, stop=df_full.shape[0])
    return df_full.drop(drop_idx)

In [44]:
def _batched(iterable: Iterable, n: int):
    "Batches data into tuples of length n. The last batch may be shorter."
    if n < 1:
        raise ValueError('n must be at least one')
    it = iter(iterable)
    while batch := tuple(itertools.islice(it, n)):
        yield batch
        
        
def _create_backup(
    data: pd.DataFrame,
    file_name: str,
    backup_dir_path: str = BACKUP_PATH,
    verbose: bool = False,
):
    """Backups the dataframe"""
    path_save = f"{backup_dir_path}{RUN_ID}_{file_name}"
    data.to_csv(path_save)
    if verbose:
        print(f"[BACKUP] saving {data.shape[0]} rows @ `{path_save}`")
        
        
def create_paraphrase_dataset(
    df_bitexts: pd.DataFrame,
    api_translation_function: Callable[[str], str],
    source_col: str, 
    original_col: str,
    batch_size: int = BATCH_SIZE,
    backup_size: int = BACKUP_SIZE,
    inplace: bool = False,
    verbose: bool = False,
) -> pd.DataFrame:
    """Translates a source column of a bitext dataset, outputting a paraphrasing dataset."""
    
    if verbose:
        time_total = 0
        display(df_bitexts.head())
        print(f"--------\nParaphrase dataset creation started: {df_bitexts.shape[0]} rows--------")
        
    df_final = pd.DataFrame(columns=["Original", "Parahprase"])
    for chunk_id, chunk_rows in enumerate(_batched(df_bitexts.iterrows(), n=backup_size)):
        try:
            if verbose:
                start = time.perf_counter()
            
            chunk_texts = [row[source_col] for _, row in chunk_rows]
            chunk_indices = [index for index, _ in chunk_rows]
            # translate current chunk
            col_translated = pd.Series(dtype="str")
            for batch in _batched(chunk_texts, n=batch_size):
                try:
                    batch_translated = pd.Series(batch, dtype="str").apply(api_translation_function)
                    col_translated = pd.concat([col_translated, batch_translated])
                except Exception as e: 
                    if verbose:
                        print(f"[Skipping current batch] Caught: {e}")                        
            col_translated.index = chunk_indices

            if verbose:
                time_diff = (time.perf_counter() - start)
                time_total += time_diff
                print(f"[create_paraphrase_dataset] chunk {chunk_id}: {col_translated.size:,} / {len(chunk_indices)} records translated in {time_diff:.2f} s.")

            # make the chunk into a DF
            df_src = df_bitexts.loc[chunk_indices, original_col]
            df_src = df_src if inplace else df_src.copy()
            df_out = pd.concat([df_src, col_translated], axis=1)
            df_out.columns = df_final.columns

            _create_backup(
                df_out, 
                file_name=f"translate_from_{source_col}_to_{original_col}_{chunk_id}",
                verbose=verbose,
            )
            
        except Exception as e:
            # TODO: anti-pattern, remove this
            if vebrose:
                print(f"[SKIPPING] Caught: {e}")
            
    
    df_final = pd.concat([df_final, df_out], axis=0)

    if verbose:
        display(df_final.head())
        print(f"[create_paraphrase_dataset] TOTAL TIME: {time_total:.2f} s.")

    return df_final

In [38]:
load_data_cs = functools.partial(load_data, path=DATA_PATHS["CS"], limit=OUTPUT_DATA_SIZE_LIMIT)
load_data_de = functools.partial(load_data, path=DATA_PATHS["DE"], limit=OUTPUT_DATA_SIZE_LIMIT)
load_data_sl = functools.partial(load_data, path=DATA_PATHS["SL"], limit=OUTPUT_DATA_SIZE_LIMIT)

### Load data

In [39]:
data_cs = load_data_cs()
data_de = load_data_de()
data_sl = load_data_sl()

### Translate

In [40]:
translate_en_to_cs_api = functools.partial(
    translate_text_api, 
    api_translation_function=api_translate_en_to_cs
)
translate_en_to_de_api = functools.partial(
    translate_text_api, 
    api_translation_function=api_translate_en_to_de
)
translate_en_to_sl_api = functools.partial(
    translate_text_api, 
    api_translation_function=api_translate_en_to_sl
)

In [45]:
df_paraphrase_de = create_paraphrase_dataset(
    df_bitexts=data_de,
    api_translation_function=pipeline_transate_en_to_de,
    source_col="English",
    original_col="German",
    verbose=True,
)

Unnamed: 0,English,German
0,You can download the tool from this guide incl...,Sie können das Tool von dieser Anleitung einsc...
1,!{Star Trek} medals are used to unlock future ...,Mit Star Trek-Medaillen lassen sich zukünftige...
2,""" Egyptian mummies were made as """" built like ...",""" Ägyptischen Mumien wurden gemacht """" gebaut ..."
3,""" Enablers"" , i.e. the basic building blocks w...",„ Grundlagen“: grundlegende Bausteine zur Förd...
4,""" I am very thankful for this USB Security sof...","""Ich bin sehr dankbar für diese USB Security-S..."


Paraphrase dataset creation started: 10 rows
[create_paraphrase_dataset] chunk 0: 2 / 2 records translated in 0.53 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_German_0`
[create_paraphrase_dataset] chunk 1: 2 / 2 records translated in 0.93 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_German_1`
[create_paraphrase_dataset] chunk 2: 2 / 2 records translated in 0.85 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_German_2`
[create_paraphrase_dataset] chunk 3: 2 / 2 records translated in 0.82 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_German_3`
[create_paraphrase_dataset] chunk 4: 2 / 2 records translated in 1.12 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_German_4`


Unnamed: 0,Original,Parahprase
8,„ Outputs“ : Vorteile für die Wirtschaft als G...,"""Outputs"", die zeigen, wie sich dies in Vortei..."
9,„Die Initiative ‚Chancen für junge Menschen‘ z...,"""Die Initiative ""Jugendchancen"" – sagte der Pr..."


[create_paraphrase_dataset] TOTAL TIME: 4.25 s.


In [42]:
df_paraphrase_sl = create_paraphrase_dataset(
    df_bitexts=data_sl,
    api_translation_function=pipeline_transate_en_to_sl,
    source_col="English",
    original_col="Slovenian",
    verbose=True,
)

Unnamed: 0,English,Slovenian
0,You can download the tool from this guide incl...,"Lahko prenesete orodje iz tega priročnika, vkl..."
1,!{Star Trek} medals are used to unlock future ...,Medalje Star Trek se uporabljajo za odklep bod...
2,""" Egyptian mummies were made as """" built like ...",""" Egipčanske mumije so bile narejene kot """" zg..."
3,""" Enablers"" , i.e. the basic building blocks w...","„potencial“ : temeljni gradniki, ki omogočajo ..."
4,""" I am very thankful for this USB Security sof...","""Zelo sem hvaležen za to USB varnostne program..."


[create_paraphrase_dataset] chunk 0: 2 / 2 records translated in 0.63 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Slovenian_0`
[create_paraphrase_dataset] chunk 1: 2 / 2 records translated in 1.31 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Slovenian_1`
[create_paraphrase_dataset] chunk 2: 2 / 2 records translated in 0.93 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Slovenian_2`
[create_paraphrase_dataset] chunk 3: 2 / 2 records translated in 0.96 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Slovenian_3`
[create_paraphrase_dataset] chunk 4: 2 / 2 records translated in 1.17 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Slovenian_4`


Unnamed: 0,Original,Parahprase
8,"„izkupiček“ : kazalniki, ki kažejo, kakšne so ...","""Outputs"", koji pokazuju, kako se to pretvara ..."
9,"„Pobuda „Priložnosti za mlade“, je povedal pre...","""Iniciativa za mlade mogućnosti – predsednik E..."


[create_paraphrase_dataset] TOTAL TIME: 4.99 s.


In [43]:
df_paraphrase_cs = create_paraphrase_dataset(
    df_bitexts=data_cs,
    api_translation_function=pipeline_transate_en_to_cs,
    source_col="English",
    original_col="Czech",
    verbose=True,
)

Unnamed: 0,English,Czech
0,You can download the tool from this guide incl...,Zde si můžete stáhnout nástroj z této příručky...
1,!{Star Trek} medals are used to unlock future ...,Medaile Star Trek se používají k otevření dalš...
2,""" Egyptian mummies were made as """" built like ...",""" Egyptských mumií byly jako """" postavený jako..."
3,""" Enablers"" , i.e. the basic building blocks w...","„ Předpoklady “, tj. základní stavební kameny,..."
4,""" I am very thankful for this USB Security sof...","""Jsem velmi vděčný za tuto USB bezpečnostní so..."


[create_paraphrase_dataset] chunk 0: 2 / 2 records translated in 1.07 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Czech_0`
[create_paraphrase_dataset] chunk 1: 2 / 2 records translated in 1.97 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Czech_1`
[create_paraphrase_dataset] chunk 2: 2 / 2 records translated in 1.69 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Czech_2`
[create_paraphrase_dataset] chunk 3: 2 / 2 records translated in 2.02 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Czech_3`
[create_paraphrase_dataset] chunk 4: 2 / 2 records translated in 2.19 s.
[BACKUP] saving 2 rows @ `./backups/3d6ba3a3_translate_from_English_to_Czech_4`


Unnamed: 0,Original,Parahprase
8,"„ Výstupy “, které ukazují, jak výše uvedené f...","""Výstupy"", které ukazují, jak se to promítá do..."
9,"„Iniciativa „Příležitosti pro mladé“,“ uvedl p...",Iniciativa Příležitosti pro mladé – předseda E...


[create_paraphrase_dataset] TOTAL TIME: 8.93 s.


---

# Playground

---

In [None]:
pipe = pipeline("translation", "Helsinki-NLP/opus-mt-en-sla")
pipe.predict("some random text")

In [None]:
data = load_data(path=DATA_PATHS["CS"], limit=None)
n_lines = data.shape[0]
avg_text_len = data.loc[:, "English"].apply(lambda x: len(x)).mean()

n_chars_total = n_lines  * avg_text_len
n_chars_total