In [1]:
import pandas as pd
import time
from random import randint
from googletrans import Translator
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError
import logging
import os

In [None]:
# File that run_translation writes the fully translated frame to.
FINAL_OUTPUT_CSV = 'full_translated_wikiner.csv'
# Number of rows handled per batch; one checkpoint CSV is written per batch.
BATCH_SIZE = 250
# Size of the thread pool that translates the sentences of a batch.
MAX_WORKERS = 6
TRANSLATION_TIMEOUT = 45  # seconds translate_timeout waits for one sentence
# Destination language codes; also the suffixes of the translated_<lang> columns.
LANGUAGES = ['be', 'sl', 'sk']

In [None]:
# Send log records of level INFO and above to translation_log.log.
# The file is opened as UTF-8 because several log messages in this notebook
# contain Cyrillic text.
logging.basicConfig(
    filename='translation_log.log',
    encoding='utf-8',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

In [None]:
# Source data; run_translation reads the 'marked_sentence' column of this frame.
# read_csv already returns a fresh, independent DataFrame, so no extra
# .copy() is needed.
df = pd.read_csv('filtered_wikiner_testsplit.csv')

In [10]:
# Summarise columns, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4214 entries, 0 to 4213
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               4214 non-null   object
 1   words            4214 non-null   object
 2   ner_tags         4214 non-null   object
 3   marked_sentence  4214 non-null   object
 4   ner_type         4214 non-null   object
 5   entity_words     4214 non-null   object
dtypes: object(6)
memory usage: 197.7+ KB


In [None]:
# Single googletrans client used by ilovetranslating.
# NOTE(review): run_translation calls ilovetranslating from several worker
# threads at once, so this one instance is shared between threads — confirm
# that Translator is safe to use concurrently.
translator = Translator()

In [None]:
def ilovetranslating(sentence, languages=LANGUAGES, retries=5):
    '''Translate ``sentence`` into every language in ``languages``.

    Uses the module-level ``translator``. A failed attempt is logged and
    retried after a random 5-15 second pause, for at most ``retries``
    attempts. Languages already translated in an earlier attempt are kept,
    so a retry only requests the ones that are still missing.

    Args:
        sentence: Text to translate.
        languages: Iterable of destination language codes.
        retries: Maximum number of attempts.

    Returns:
        dict mapping each language code to its translated text. If the input
        is not a string, is longer than 5000 characters, or the attempts are
        exhausted, every language maps to the placeholder
        ``'translation failed <sentence>'`` instead.
    '''
    failed = {lang: f'translation failed {sentence}' for lang in languages}

    # These inputs fail deterministically, so retrying them would only burn
    # up to ``retries`` sleeps of 5-15 seconds each before giving up anyway.
    if not isinstance(sentence, str):
        logging.warning(
            f'sentence is not a string ({type(sentence).__name__}), translation skipped'
        )
        return failed
    # 5000 is the limit the original code enforced — presumably the
    # per-request character limit of the translation service; confirm.
    if len(sentence) > 5000:
        logging.warning('Превышает лимит по кол-ву символов!')
        return failed

    translations = {}
    for attempt in range(1, retries + 1):
        try:
            for lang in languages:
                if lang in translations:
                    # obtained in a previous attempt, no need to ask again
                    continue
                translated = translator.translate(sentence, dest=lang).text
                if not translated:
                    raise ValueError(f'None вместо перевода, язык - {lang}')
                translations[lang] = translated
            return translations
        except Exception as e:
            logging.warning(f'попытка {attempt}/{retries}, ошибка: {e}')
            # No point in pausing when there is no further attempt.
            if attempt < retries:
                time.sleep(randint(5, 15))
    # All-or-nothing: partial results are discarded, as before.
    return failed

In [None]:
def translate_timeout(sentence):
    '''Run ``ilovetranslating(sentence)`` with a time limit.

    The call is executed in a dedicated single-worker pool and this function
    waits at most ``TRANSLATION_TIMEOUT`` seconds for its result.

    Args:
        sentence: Text passed on to ``ilovetranslating``.

    Returns:
        Whatever ``ilovetranslating`` returns.

    Raises:
        concurrent.futures.TimeoutError: no result within the time limit.
        Exception: anything raised by ``ilovetranslating`` itself.
    '''
    single_executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = single_executor.submit(ilovetranslating, sentence)
        return future.result(timeout=TRANSLATION_TIMEOUT)
    finally:
        # wait=False is essential: leaving a ``with ThreadPoolExecutor`` block
        # calls shutdown(wait=True), which joins the worker and therefore
        # blocks until the translation finishes even after the timeout fired.
        # A running thread cannot be cancelled, so a timed-out worker keeps
        # running in the background until ilovetranslating returns.
        single_executor.shutdown(wait=False)

In [None]:
def _translate_batch(executor, sentences, offset):
    '''Translate ``sentences`` concurrently, returning one dict per sentence in input order.

    ``offset`` is the position of the first sentence inside the full frame;
    it is only used to report absolute row positions in log messages.
    '''
    future_to_index = {
        executor.submit(translate_timeout, sentence): i
        for i, sentence in enumerate(sentences)
    }
    results = [{} for _ in sentences]

    # Futures finish in arbitrary order; the mapping puts each result back
    # into the slot of the sentence it belongs to.
    for future in as_completed(future_to_index):
        i = future_to_index[future]
        try:
            results[i] = future.result()
        except TimeoutError:
            logging.error(f'timeout at index {offset + i}')
            results[i] = {lang: f'timeout {sentences[i]}' for lang in LANGUAGES}
        except Exception as e:
            logging.error(f'exception at index {offset + i}: {e}')
            results[i] = {lang: f'ошибка {sentences[i]}' for lang in LANGUAGES}
    return results


def run_translation():
    '''Translate ``df['marked_sentence']`` batch by batch.

    For every batch of ``BATCH_SIZE`` rows the sentences are translated in a
    pool of ``MAX_WORKERS`` threads, the results are written into the
    ``translated_<lang>`` columns of the global ``df`` and the batch is saved
    to ``translation_checkpoint_<start>_<end>.csv``. After the last batch the
    whole frame is written to ``FINAL_OUTPUT_CSV``.

    Rows are addressed by position, so the function does not depend on
    ``df`` having a default 0..n-1 index. A sentence whose translation timed
    out or raised gets a placeholder string ('timeout ...' / 'ошибка ...')
    in every language column.
    '''
    total_rows = len(df)
    full_batches, remainder = divmod(total_rows, BATCH_SIZE)
    allbatches = full_batches + (1 if remainder else 0)
    logging.info(f'начинается перевод {len(df)} строки в {allbatches} батчей')

    # Create the result columns once, before any batch is processed.
    col_names = {lang: f'translated_{lang}' for lang in LANGUAGES}
    for col_name in col_names.values():
        if col_name not in df.columns:
            df[col_name] = ''

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for batch_num, start in enumerate(range(0, total_rows, BATCH_SIZE), 1):
            end = min(start + BATCH_SIZE, total_rows)
            sentences = df['marked_sentence'].iloc[start:end].tolist()

            results = _translate_batch(executor, sentences, start)

            # Merge into the source frame: one positional column write per
            # language instead of a label-based write per cell.
            for lang, col_name in col_names.items():
                col_pos = df.columns.get_loc(col_name)
                df.iloc[start:end, col_pos] = [
                    translations.get(lang, '') for translations in results
                ]

            # Checkpoint: the rows of this batch, including their translations.
            checkpoint_file = f'translation_checkpoint_{start}_{end}.csv'
            df.iloc[start:end].to_csv(checkpoint_file, index=False)
            logging.info(f'сохранён {checkpoint_file} для батча {batch_num}/{allbatches}')

    df.to_csv(FINAL_OUTPUT_CSV, index=False)
    logging.info(f'все батчи переведены и сохранены в {FINAL_OUTPUT_CSV}')

In [None]:
# Start the full translation run when this code is executed as the main module.
if __name__ == '__main__':
    run_translation()

CONCATENATING TRANSLATED FRAGMENTS

In [None]:
# Folder holding the per-batch CSV files to concatenate.
# Raw string: in a normal literal the "\t" of "\thesisscripts" is a TAB
# escape, which silently produced a non-existent path.
path = r'D:\thesisscripts'

In [4]:
# Names of all CSV files in ``path``. os.listdir returns entries in arbitrary
# order, so the names are sorted to make the load/concatenation order
# reproducible between runs and machines.
# NOTE(review): every .csv in the folder is picked up — make sure the folder
# contains only the fragments that should be combined.
files = sorted(file for file in os.listdir(path) if file.endswith('.csv'))

In [5]:
# Load every listed CSV into its own DataFrame, in the order of ``files``.
# A comprehension is used so that the global ``df`` is not overwritten with
# each partial frame while loading.
dfs = [pd.read_csv(os.path.join(path, file)) for file in files]

In [6]:
# Stack all loaded frames into one with a fresh 0..n-1 index.
# Note: this rebinds ``df``, replacing the frame loaded for translation above.
df = pd.concat(dfs, ignore_index=True)

In [7]:
# Inspect the combined frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5410 entries, 0 to 5409
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     5410 non-null   object
 1   words                  5410 non-null   object
 2   ner_tags               5410 non-null   object
 3   marked_sentence        5410 non-null   object
 4   ner_type               5410 non-null   object
 5   entity_words           5410 non-null   object
 6   translated_belarusian  2507 non-null   object
 7   translated_slovenian   2507 non-null   object
 8   translated_slovak      2507 non-null   object
 9   translated_be          1907 non-null   object
 10  translated_sl          1907 non-null   object
 11  translated_sk          1907 non-null   object
dtypes: object(12)
memory usage: 507.3+ KB


In [9]:
# Show five random rows (no random_state is set, so the selection differs on every run).
df.sample(5)

Unnamed: 0,id,words,ner_tags,marked_sentence,ner_type,entity_words,translated_belarusian,translated_slovenian,translated_slovak,translated_be,translated_sl,translated_sk
2726,en-doc6251-sent35,"['At', 'the', 'time', 'of', 'its', 'discovery'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","At the time of its discovery , the comet was i...",LOC,Jupiter,На момант свайго адкрыцця Камета знаходзілася ...,V času svojega odkritja je bil komet v orbiti ...,V čase svojho objavenia bola kométa na obežnej...,,,
2730,en-doc6059-sent62,"['The', 'discovery', 'of', 'his', 'fraud', ','...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","The discovery of his fraud , the displeasure o...",PER,Aetius,"Адкрыццё яго махлярства, незадавальненне імпер...","Odkritje njegove goljufije, nezadovoljstvo ces...","Objav jeho podvodu, nelibosť cisárovnej a význ...",,,
170,en-doc6252-sent14,"['Several', 'authors', 'have', 'also', 'publis...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Several authors have also published books titl...,PER,Lovecraft,,,,,,
2696,en-doc6305-sent59,"['The', 'three', 'points', 'of', 'the', 'tride...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",The three points of the trident represent in [...,LOC,Barbados,Тры пункты трызуба ўяўляюць у [Барбадаса] тры ...,Tri točke Tridenta na [Barbadosu] predstavljaj...,Tri body Tridenta predstavujú v [Barbados] tri...,,,
4554,en-doc5948-sent0,"['Its', '23-year', 'run', 'made', 'The', 'Ed',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Its 23-year run made The Ed Sullivan Show one ...,LOC,U.S.,,,,Яго 23-гадовы прабег зрабіў шоу Эда Салівана а...,Zaradi 23-letnega teka je Ed Sullivan pokazal ...,Jeho 23-ročný beh urobil z výstavy Ed Sullivan...


In [10]:
# The fragments use two naming schemes for the same translations. For each
# long-named column, keep its existing values and fill only the missing ones
# from the matching short-code column.
merge_pairs = {
    'translated_belarusian': 'translated_be',
    'translated_slovenian': 'translated_sl',
    'translated_slovak': 'translated_sk',
}
for long_col, short_col in merge_pairs.items():
    df[long_col] = df[long_col].combine_first(df[short_col])

In [14]:
# Remove the short-code columns now that their values live in the long-named ones.
# Not idempotent: running this a second time raises KeyError because the
# columns no longer exist.
df = df.drop(columns=['translated_be', 'translated_sl', 'translated_sk'])

In [18]:
# Final check of the combined frame before saving.
# NOTE(review): the recorded output below lists 4414 rows with index labels
# up to 5409, while the concatenation produced 5410 rows. No visible cell
# removes rows (execution counts 15-17 are missing), so that filtering step
# cannot be reproduced from this notebook — confirm and restore it.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4414 entries, 0 to 5409
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     4414 non-null   object
 1   words                  4414 non-null   object
 2   ner_tags               4414 non-null   object
 3   marked_sentence        4414 non-null   object
 4   ner_type               4414 non-null   object
 5   entity_words           4414 non-null   object
 6   translated_belarusian  4414 non-null   object
 7   translated_slovenian   4414 non-null   object
 8   translated_slovak      4414 non-null   object
dtypes: object(9)
memory usage: 344.8+ KB


In [19]:
# Save the combined translations. index=False matches the other to_csv calls
# in this notebook and keeps the leftover, non-contiguous row index from
# being written as an extra unnamed column.
df.to_csv('combined_translated_templates.csv', index=False)

In [None]:
#from google.cloud import translate_v2 as translate

In [1]:
# Connectivity check: translate a single word from English to Spanish.
# NOTE(review): this re-imports Translator and rebinds the module-level
# ``translator``; its execution count (In [1]) suggests it was run as a
# scratch cell before the rest of the notebook — consider removing it.
from googletrans import Translator

translator = Translator()
result = translator.translate('Hello', src='en', dest='es')
print(result.text)  # print the translated text

Hola
