In [4]:
import torch
# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
# device = 'cpu'  # uncomment to force CPU execution
print("Device:", device)

Device: cuda:0


In [5]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Candidate English -> French models tried so far:
# pipe_translate = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr", device=device) # Bad perfs
# pipe_translate = pipeline("translation", model="jbochi/madlad400-3b-mt", batch_size=16, device=device) # Very slow, haven't found the right conf to make it work properly
# Selected model: decent translation quality and quite fast.
pipe_translate = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-tc-big-en-fr",
    batch_size=16,
    device=device,
)

# Quick smoke test on a single sentence; the pipeline returns a list of
# dicts keyed by 'translation_text'.
test_text = "Hello, I'm Quentin, how are you today? I have a medical problem I'd like to discuss."
translation = pipe_translate(test_text)
print("Translation:", translation)

Translation: [{'translation_text': "Bonjour, je suis Quentin, comment allez-vous aujourd'hui? J'ai un problème médical que je voudrais discuter."}]


In [11]:
import zipfile
from bs4 import BeautifulSoup
from tqdm import tqdm

# Debug switches: when `debug` is True, only the HTML document whose zero-based
# position among the archive's HTML files equals `debug_chapter` is translated
# and written; every other HTML document is skipped entirely.
debug = False
debug_chapter = 15
debug_i = -1

# file_name = "epubs/King of Scars by Leigh Bardugo.epub"
file_name = "epubs/Rule of Wolves.epub"
output_name = file_name + ("_traduction_debug.epub" if debug else "_traduction.epub")

# Archive members with these suffixes are parsed and translated; everything
# else (images, stylesheets, metadata, ...) is copied through unchanged.
html_suffixes = (".html", ".xhtml", ".htm")


def _split_sentences(text):
    """Split paragraph text on ". " and give each piece its full stop back.

    Only the pieces that actually lost a "." to the split get one restored, so
    text ending in "?", "!" or without punctuation (e.g. a title) is not given
    a spurious dot. Empty / whitespace-only pieces are dropped.

    Example: "How are you? I am fine. Thanks." ->
             ["How are you? I am fine.", "Thanks."]
    """
    pieces = text.split(". ")
    sentences = [piece + "." for piece in pieces[:-1]] + pieces[-1:]
    return [sentence for sentence in sentences if sentence.strip()]


def _normalize_uppercase(sentence):
    """Lowercase fully upper-case words, then re-capitalise the sentence start.

    Words written only in capitals (typical of chapter openings) otherwise do
    not get translated. Words are handled one by one, so lowercasing a word
    never touches other words, and tokens with a single letter (the pronoun
    "I", the article "A") are left alone.

    Example: "DIMA HEARD THE BARN DOORS slam, and I ran" ->
             "Dima heard the barn doors slam, and I ran"
    """
    words = sentence.split(" ")
    changed = False
    for idx, word in enumerate(words):
        if word.isupper() and sum(ch.isalpha() for ch in word) > 1:
            words[idx] = word.lower()
            changed = True
    if not changed:
        return sentence
    sentence = " ".join(words)
    return sentence[0].upper() + sentence[1:]


# Context managers guarantee both archives are closed (and the output's central
# directory is written) even if translation raises part-way through.
with zipfile.ZipFile(file_name, "r") as input_archive:
    with zipfile.ZipFile(output_name, "w") as output_archive:
        file_list = input_archive.infolist()
        print(file_list)

        for info in tqdm(file_list):
            content = input_archive.read(info)

            if not info.filename.endswith(html_suffixes):
                # For the other file types, simply copy the original content,
                # keeping the compression method the entry had in the input.
                output_archive.writestr(info.filename, content, compress_type=info.compress_type)
                continue

            debug_i += 1
            if debug and debug_i != debug_chapter:
                continue

            soup = BeautifulSoup(content, "html.parser")
            for paragraph in soup.find_all("p"):  # , ["tx", "txt"]):
                sentences = [_normalize_uppercase(s) for s in _split_sentences(paragraph.text)]
                if not sentences:
                    # No text to translate (spacer or image-only paragraph):
                    # leave the element untouched so images are preserved.
                    continue

                translated = pipe_translate(sentences)

                # Build a real <p class="txt"> element instead of injecting raw
                # markup as a string, so special characters in the translation
                # are escaped on output.
                # NOTE: inline markup inside the paragraph (spans, images next
                # to text) is still replaced by the plain translated text.
                new_paragraph = soup.new_tag("p")
                new_paragraph["class"] = "txt"
                new_paragraph.string = " ".join(piece["translation_text"] for piece in translated)
                paragraph.replace_with(new_paragraph)

            # Default formatter escapes &, < and > so the document stays well-formed.
            output_archive.writestr(info.filename, soup.prettify("utf-8"), compress_type=info.compress_type)

[<ZipInfo filename='mimetype' filemode='?rw-------' file_size=20>, <ZipInfo filename='META-INF/' compress_type=deflate filemode='?rwxr-xr-x' compress_size=2>, <ZipInfo filename='META-INF/container.xml' compress_type=deflate filemode='?rw-------' file_size=244 compress_size=154>, <ZipInfo filename='images/' compress_type=deflate filemode='?rwxr-xr-x' compress_size=2>, <ZipInfo filename='images/00005.jpeg' compress_type=deflate filemode='-rw-r--r--' file_size=28157 compress_size=25289>, <ZipInfo filename='images/00007.gif' compress_type=deflate filemode='-rw-r--r--' file_size=1768 compress_size=1522>, <ZipInfo filename='images/00009.jpeg' compress_type=deflate filemode='-rw-r--r--' file_size=11716 compress_size=10902>, <ZipInfo filename='images/00004.jpeg' compress_type=deflate filemode='-rw-r--r--' file_size=56712 compress_size=56573>, <ZipInfo filename='images/00003.jpeg' compress_type=deflate filemode='-rw-r--r--' file_size=6448 compress_size=5536>, <ZipInfo filename='images/00002.jpe

