Skip to content

Commit

Permalink
Add pronunciation customization feature
Browse files Browse the repository at this point in the history
-You can now use the files in the "SSML_Customization" folder to specify how certain words should be pronounced.

-You are now also able to add a list of phrases to not translate.

-Fixed output directory in audio_builder.py
  • Loading branch information
ThioJoe committed Jan 17, 2023
1 parent 89d5575 commit 98ca741
Show file tree
Hide file tree
Showing 10 changed files with 152 additions and 16 deletions.
5 changes: 5 additions & 0 deletions SSML_Customization/Example - interpret-as.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Text,interpret-as Type,Case Sensitive (True/False),Format (Optional)
RJ45,characters,TRUE,
SFP,characters,TRUE,
GUI,characters,TRUE,
MSI,characters,TRUE,
29 changes: 29 additions & 0 deletions SSML_Customization/READ THIS.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
This folder contains the following three pronunciation customization files by default.

• dont_translate_phrases.txt
- You can add a list of phrases or words you do not want to be translated.
- This will work for both Google Translate and DeepL

• interpret-as.csv (Azure Only)
- You can use SSML parameters to customize how specific words or phrases are pronounced
- See this article for documentation: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation#say-as-element
- Note: The script will match the phrases in the TRANSLATED text. You may therefore wish to also add these phrases to 'dont_translate_phrases.txt'.
- The first row contains the titles of each column - Do not change anything in the first row!
- Descriptions of each column:
• Text: The word or phrase that will be pronounced how you specify, if it is found in the text to be spoken
• interpret-as Type: The way in which the word/phrase will be pronounced. See documentation link above. (Some examples include: characters, cardinal, ordinal)
• Case Sensitive (True/False): Whether to only modify the pronunciation if the word/phrase matches exactly, being case sensitive
• Format (Optional): Only applicable to some types, such as 'date', 'time', and others. Otherwise leave blank. See documentation link above for details
- See 'Example - interpret-as.csv' for an example of how to use this file
- This will only apply if using Azure TTS, not Google

• aliases.csv (Azure Only)
- Lets you effectively change what should be spoken instead of a certain word or phrase
- Example: If the text to be spoken contains "BTW" you can have it say "by the way"
    - Note: It does NOT actually replace the text, but only changes how the voice will pronounce it
- The first row contains the titles of each column - Do not change anything in the first row!
    - Descriptions of each column:
- Original Text: The original word or phrase to match
- Alias: The word or phrase to speak instead of the original text
- Case Sensitive (True/False): Whether it must be an exact match including capital/lower case. If nothing is entered, will default to False
- This will only apply if using Azure TTS, not Google
1 change: 1 addition & 0 deletions SSML_Customization/aliases.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Original Text,Alias,Case Sensitive (True/False)
3 changes: 3 additions & 0 deletions SSML_Customization/dont_translate_phrases.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Add one word or phrase per line that you do not want to be translated. The original word will be left as-is in the translated srt files.
# Don't include punctuation. This list will NOT be case sensitive
# Lines beginning with a # will be ignored
1 change: 1 addition & 0 deletions SSML_Customization/interpret-as.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Text,interpret-as Type,Case Sensitive (True/False),Format (Optional)
67 changes: 66 additions & 1 deletion TTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
import zipfile
import io
import copy
import re
from urllib.request import urlopen

import auth
import azure_batch
import utils
from utils import parseBool

# Read config files
Expand Down Expand Up @@ -43,6 +45,63 @@ def get_voices():
voices_json = json.dumps(voices)
return voices_json


# ======================================== Pronunciation Correction Functions ================================================

# Load the user's 'say-as' pronunciation overrides (Azure SSML only) from CSV.
# Each row becomes a dict keyed by the CSV header names (see utils.csv_to_dict).
interpretAsOverrideFile = os.path.join('SSML_Customization', 'interpret-as.csv')
interpretAsEntries = utils.csv_to_dict(interpretAsOverrideFile)

# Load the user's alias overrides (original phrase -> phrase to speak instead) from CSV.
aliasOverrideFile = os.path.join('SSML_Customization', 'aliases.csv')
aliasEntries = utils.csv_to_dict(aliasOverrideFile)

def add_all_pronunciation_overrides(text):
    """Apply every user pronunciation customization to *text*.

    Runs the interpret-as (say-as) overrides first, then the alias
    overrides, and returns the resulting text.
    """
    return add_alias_tags(add_interpretas_tags(text))

def add_interpretas_tags(text):
    """Wrap user-specified words/phrases in Azure SSML <say-as> tags.

    For each entry loaded from interpret-as.csv, every occurrence of the
    entry's phrase in *text* is wrapped in a <say-as> tag using the entry's
    interpret-as type and optional format attribute. Matching honors the
    entry's case-sensitivity flag. Returns the modified text.
    """
    for entryDict in interpretAsEntries:
        # Get entry info
        entryText = entryDict['Text']
        entryInterpretAsType = entryDict['interpret-as Type']
        isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
        entryFormat = entryDict['Format (Optional)']

        # Create say-as tag ('format' only applies to some types, e.g. date/time)
        if entryFormat == "":
            sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}">'
        else:
            sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}" format="{entryFormat}">'

        # Escape the phrase so regex metacharacters in it (e.g. "C++", "A.I.")
        # are matched literally instead of crashing or mis-matching.
        # Find the word, with optional punctuation after, and optional quotes before or after
        findWordRegex = rf'(\b["\']?{re.escape(entryText)}[.,!?]?["\']?\b)'
        if isCaseSensitive:
            # Uses group reference, so remember regex must be in parentheses
            text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text)
        else:
            text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text, flags=re.IGNORECASE)
    return text

def add_alias_tags(text, aliasList=None):
    """Replace user-specified words/phrases with their spoken aliases.

    Args:
        text: The text that will be spoken.
        aliasList: Optional list of alias entry dicts shaped like the rows of
            aliases.csv. Defaults to the entries loaded from aliases.csv.

    Returns:
        The text with each matched original phrase replaced by its alias.
    """
    if aliasList is None:
        aliasList = aliasEntries
    for entryDict in aliasList:
        # Get entry info
        entryText = entryDict['Original Text']
        entryAlias = entryDict['Alias']
        # An empty case-sensitivity cell defaults to case-insensitive matching
        if entryDict['Case Sensitive (True/False)'] == "":
            isCaseSensitive = False
        else:
            isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])

        # Escape the phrase so regex metacharacters in it are matched literally.
        # Find the word, with optional punctuation after, and optional quotes before or after
        findWordRegex = rf'\b["\']?{re.escape(entryText)}[.,!?]?["\']?\b'
        # Use a callable replacement so backslashes in the alias are inserted
        # literally instead of being interpreted as regex group references.
        replaceWithAlias = lambda match: entryAlias
        if isCaseSensitive:
            text = re.sub(findWordRegex, replaceWithAlias, text)
        else:
            text = re.sub(findWordRegex, replaceWithAlias, text, flags=re.IGNORECASE)
    return text

# =============================================================================================================================

# Build API request for google text to speech, then execute
def synthesize_text_google(text, speedFactor, voiceName, voiceGender, languageCode, audioEncoding=audioEncoding):
# Keep speedFactor between 0.25 and 4.0
Expand Down Expand Up @@ -111,7 +170,10 @@ def synthesize_text_azure(text, speedFactor, voiceName, languageCode):
if not azureSentencePause == 'default' and azureSentencePause.isnumeric():
pauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{azureSentencePause}ms"/>'
else:
pauseTag = ''
pauseTag = ''

# Process text using pronunciation customization set by user
text = add_all_pronunciation_overrides(text)

# Create SSML syntax for Azure TTS
ssml = f"<speak version='1.0' xml:lang='{languageCode}' xmlns='http://www.w3.org/2001/10/synthesis' " \
Expand Down Expand Up @@ -181,6 +243,9 @@ def create_request_payload(remainingEntriesDict):
else:
pauseTag = ''

# Process text using pronunciation customization set by user
text = add_all_pronunciation_overrides(text)

# Create the SSML for each subtitle
ssml = f"<speak version='1.0' xml:lang='{language}' xmlns='http://www.w3.org/2001/10/synthesis' " \
"xmlns:mstts='http://www.w3.org/2001/mstts'>" \
Expand Down
8 changes: 4 additions & 4 deletions TitleTranslator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from utils import parseBool
GOOGLE_TTS_API, GOOGLE_TRANSLATE_API = auth.first_authentication()

outputFolder = "output"
outputFolder = "Outputs"

import langcodes
import sys
Expand All @@ -51,7 +51,7 @@

# Parse the description for hyperlinks and put the tags <span class="notranslate"></span> around them
# This prevents Google Translate from translating the links
description = re.sub(r'(https?://[^\s]+)', r' <span class="notranslate">\1</span> ', description)
description = re.sub(r'(https?://[^\s]+)', r' <span class="notranslate">\1</span> ', description, flags=re.IGNORECASE)

# Use span class="notranslate" to prevent translating certain characters
for char in noTranslateList:
Expand Down Expand Up @@ -202,7 +202,7 @@ def translate(originalLanguage, singleLangDict, translationList):
langData['translated_description'].insert(i, '')

# Write the translated text to a file
with open(outputFolder + '/Translated Titles and Descriptions.txt', 'w', encoding='utf-8') as f:
with open(os.path.join(outputFolder , 'Translated Titles and Descriptions.txt'), 'w', encoding='utf-8') as f:
for langNum, langData in batchSettings.items():
title_translated = langData['translated_title']
description_translated = langData['translated_description']
Expand All @@ -227,6 +227,6 @@ def translate(originalLanguage, singleLangDict, translationList):
langData['translated_description'] = '\n'.join(langData['translated_description'])

# Write the translated items to a json file
with open(outputFolder + '/Translated Items.json', 'w', encoding='utf-8') as f:
with open(os.path.join(outputFolder , 'Translated Items.json'), 'w', encoding='utf-8') as f:
json.dump(batchSettings, f, indent=4)

7 changes: 4 additions & 3 deletions audio_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@
from pydub.silence import detect_leading_silence
import langcodes

# MOVE THIS INTO A VARIABLE AT SOME POINT
outputFolder = "output"

# Set working folder
workingFolder = "workingFolder"

Expand All @@ -36,6 +33,10 @@
tts_service = cloudConfig['CLOUD']['tts_service']
debugMode = parseBool(config['SETTINGS']['debug_mode'])

# MOVE THIS INTO A VARIABLE AT SOME POINT
outputDirectory = "Outputs"
outputFolder = os.path.join(outputDirectory , os.path.splitext(os.path.basename(originalVideoFile))[0] + ' (Output)')

def trim_clip(inputSound):
trim_leading_silence: AudioSegment = lambda x: x[detect_leading_silence(x) :]
trim_trailing_silence: AudioSegment = lambda x: trim_leading_silence(x.reverse()).reverse()
Expand Down
26 changes: 18 additions & 8 deletions translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Imports
import auth
from utils import parseBool
import utils

import configparser
from operator import itemgetter
Expand Down Expand Up @@ -39,13 +40,15 @@
outputDirectory = "Outputs"
outputFolder = os.path.join(outputDirectory , os.path.splitext(os.path.basename(originalVideoFile))[0] + ' (Output)')

# ---------------------------------------------------------------------------------------

# Add span tags around certain words to exclude them from being translated
dontTranslateList = [] # Placeholder for now
noTranslateOverrideFile = os.path.join('SSML_Customization', 'dont_translate_phrases.txt')
dontTranslateList = utils.txt_to_list(noTranslateOverrideFile)

def add_notranslate_tags(dontTranslateList, text):
#dontTranslateList = [word.lower() for word in dontTranslateList]
def add_notranslate_tags(text, phraseList=None):
    """Wrap do-not-translate phrases in <span class="notranslate"> tags.

    The translation service leaves text inside these spans untranslated
    (the file's README states this works for Google Translate and DeepL).
    Matching is always case-insensitive.

    Args:
        text: The text that will be sent for translation.
        phraseList: Optional list of phrases to protect. Defaults to the
            phrases loaded from dont_translate_phrases.txt.

    Returns:
        The text with each matched phrase wrapped in a notranslate span.
    """
    if phraseList is None:
        phraseList = dontTranslateList
    for word in phraseList:
        # Escape the phrase so regex metacharacters in it are matched literally.
        # Find the word, with optional punctuation after, and optional quotes before or after
        findWordRegex = rf'(\b["\']?{re.escape(word)}[.,!?]?["\']?\b)'
        text = re.sub(findWordRegex, r' <span class="notranslate">\1</span> ', text, flags=re.IGNORECASE)
    return text

Expand All @@ -57,6 +60,11 @@ def remove_notranslate_tags(text):
# Note: This function was almost entirely written by GPT-3 after feeding it my original code and asking it to change it so it
# would break up the text into chunks if it was too long. It appears to work

def process_response_text(text):
    """Post-process one translated string: decode HTML entities, then strip
    the notranslate span tags that were added before translation."""
    return remove_notranslate_tags(html.unescape(text))

# Translate the text entries of the dictionary
def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):
targetLanguage = langDict['targetLanguage']
Expand All @@ -68,11 +76,13 @@ def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):

for key in inputSubsDict:
originalText = inputSubsDict[key]['text']
textToTranslate.append(originalText)
# Add the text to the list of text to be translated, and also add the span tags around the words that shouldn't be translated
textToTranslate.append(add_notranslate_tags(originalText))

# Calculate the total number of utf-8 codepoints
codepoints = 0
for text in textToTranslate:
text = add_notranslate_tags(text)
codepoints += len(text.encode("utf-8"))

# If the codepoints are greater than 28000, split the request into multiple
Expand Down Expand Up @@ -108,7 +118,7 @@ def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):
).execute()

# Extract the translated texts from the response
translatedTexts = [html.unescape(response['translations'][i]['translatedText']) for i in range(len(response['translations']))]
translatedTexts = [process_response_text(response['translations'][i]['translatedText']) for i in range(len(response['translations']))]

# Add the translated texts to the dictionary
# Divide the dictionary into chunks of 100
Expand All @@ -125,7 +135,7 @@ def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):
result = auth.DEEPL_API.translate_text(chunk, target_lang=targetLanguage, formality=formality)

# Extract the translated texts from the response
translatedTexts = [html.unescape(result[i].text) for i in range(len(result))]
translatedTexts = [process_response_text(result[i].text) for i in range(len(result))]

# Add the translated texts to the dictionary
for i in range(chunkSize):
Expand All @@ -151,7 +161,7 @@ def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):
#'glossaryConfig': {}
}
).execute()
translatedTexts = [html.unescape(response['translations'][i]['translatedText']) for i in range(len(response['translations']))]
translatedTexts = [process_response_text(response['translations'][i]['translatedText']) for i in range(len(response['translations']))]

# Add the translated texts to the dictionary
for i, key in enumerate(inputSubsDict):
Expand Down
21 changes: 21 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import csv

# Interprets a string as a boolean. Returns True or False
def parseBool(string):
if type(string) == str:
if string.lower() == 'true':
Expand All @@ -11,3 +14,21 @@ def parseBool(string):
return False
else:
raise ValueError('Not a valid boolean string')

# Returns a list of dictionaries from a csv file. Where the key is the column name and the value is the value in that column
# The column names are set by the first row of the csv file
# Returns a list of dictionaries from a csv file, where each key is a column
# name (taken from the first row) and each value is that row's cell value.
def csv_to_dict(csvFilePath):
    # utf-8-sig strips a leading BOM (e.g. from Excel) so it does not
    # pollute the first column name.
    with open(csvFilePath, "r", encoding='utf-8-sig') as csvFile:
        return list(csv.DictReader(csvFile))

# Returns a list of strings from a txt file. Ignores empty lines and lines that start with '#'
# Returns a list of stripped lines from a txt file. Blank lines and lines
# whose first non-whitespace character is '#' (comments) are skipped.
def txt_to_list(txtFilePath):
    # utf-8-sig tolerates a BOM at the start of the file.
    with open(txtFilePath, "r", encoding='utf-8-sig') as txtFile:
        strippedLines = (rawLine.strip() for rawLine in txtFile)
        return [line for line in strippedLines if line and not line.startswith('#')]

0 comments on commit 98ca741

Please sign in to comment.