Skip to content

Commit

Permalink
Add pronunciation customization feature
Browse files Browse the repository at this point in the history
-You can now use the files in the "SSML_Customization" folder to specify how certain words should be pronounced.

-You are now also able to add a list of phrases to not translate.

-Fixed output directory in audio_builder.py
  • Loading branch information
ThioJoe committed Jan 17, 2023
1 parent 89d5575 commit 98ca741
Show file tree
Hide file tree
Showing 10 changed files with 152 additions and 16 deletions.
5 changes: 5 additions & 0 deletions SSML_Customization/Example - interpret-as.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Text,interpret-as Type,Case Sensitive (True/False),Format (Optional)
RJ45,characters,TRUE,
SFP,characters,TRUE,
GUI,characters,TRUE,
MSI,characters,TRUE,
29 changes: 29 additions & 0 deletions SSML_Customization/READ THIS.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
This folder contains the following three pronunciation customization files by default.

• dont_translate_phrases.txt
- You can add a list of phrases or words you do not want to be translated.
- This will work for both Google Translate and DeepL

• interpret-as.csv (Azure Only)
- You can use SSML parameters to customize how specific words or phrases are pronounced
- See this article for documentation: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation#say-as-element
- Note: The script will match the phrases in the TRANSLATED text. You may therefore wish to also add these phrases to 'dont_translate_phrases.txt'.
- The first row contains the titles of each column - Do not change anything in the first row!
- Descriptions of each column:
• Text: The word or phrase that will be pronounced how you specify, if it is found in the text to be spoken
• interpret-as Type: The way in which the word/phrase will be pronounced. See documentation link above. (Some examples include: characters, cardinal, ordinal)
• Case Sensitive (True/False): Whether to only modify the pronunciation if the word/phrase matches exactly, being case sensitive
• Format (Optional): Only applicable to some types, such as 'date', 'time', and others. Otherwise leave blank. See documentation link above for details
- See 'Example - interpret-as.csv' for an example of how to use this file
- This will only apply if using Azure TTS, not Google

• aliases.csv (Azure Only)
- Lets you effectively change what should be spoken instead of a certain word or phrase
- Example: If the text to be spoken contains "BTW" you can have it say "by the way"
    - Note: It does NOT actually replace the text, but only changes how the voice will pronounce it
- The first row contains the titles of each column - Do not change anything in the first row!
    - Descriptions of each column:
- Original Text: The original word or phrase to match
- Alias: The word or phrase to speak instead of the original text
- Case Sensitive (True/False): Whether it must be an exact match including capital/lower case. If nothing is entered, will default to False
- This will only apply if using Azure TTS, not Google
1 change: 1 addition & 0 deletions SSML_Customization/aliases.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Original Text,Alias,Case Sensitive (True/False)
3 changes: 3 additions & 0 deletions SSML_Customization/dont_translate_phrases.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Add one word or phrase per line that you do not want to be translated. The original word will be left as-is in the translated srt files.
# Don't include punctuation. This list will NOT be case sensitive
# Lines beginning with a # will be ignored
1 change: 1 addition & 0 deletions SSML_Customization/interpret-as.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Text,interpret-as Type,Case Sensitive (True/False),Format (Optional)
67 changes: 66 additions & 1 deletion TTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
import zipfile
import io
import copy
import re
from urllib.request import urlopen

import auth
import azure_batch
import utils
from utils import parseBool

# Read config files
Expand Down Expand Up @@ -43,6 +45,63 @@ def get_voices():
voices_json = json.dumps(voices)
return voices_json


# ======================================== Pronunciation Correction Functions ================================================

# Load the user's 'say-as' pronunciation overrides (Azure SSML only) from CSV.
# Each row becomes a dict keyed by the CSV header names (see utils.csv_to_dict).
interpretAsOverrideFile = os.path.join('SSML_Customization', 'interpret-as.csv')
interpretAsEntries = utils.csv_to_dict(interpretAsOverrideFile)

# Load the user's alias overrides (original phrase -> phrase to speak instead) from CSV.
aliasOverrideFile = os.path.join('SSML_Customization', 'aliases.csv')
aliasEntries = utils.csv_to_dict(aliasOverrideFile)

def add_all_pronunciation_overrides(text):
    """Apply every user pronunciation customization to *text*.

    Runs the interpret-as (say-as) overrides first, then the alias
    overrides, and returns the resulting text.
    """
    return add_alias_tags(add_interpretas_tags(text))

def add_interpretas_tags(text):
    """Wrap user-specified words/phrases in Azure SSML <say-as> tags.

    For each entry loaded from interpret-as.csv, every occurrence of the
    entry's phrase in *text* is wrapped in a <say-as> tag using the entry's
    interpret-as type and optional format attribute. Matching honors the
    entry's case-sensitivity flag. Returns the modified text.
    """
    for entryDict in interpretAsEntries:
        # Get entry info
        entryText = entryDict['Text']
        entryInterpretAsType = entryDict['interpret-as Type']
        isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
        entryFormat = entryDict['Format (Optional)']

        # Create say-as tag ('format' only applies to some types, e.g. date/time)
        if entryFormat == "":
            sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}">'
        else:
            sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}" format="{entryFormat}">'

        # Escape the phrase so regex metacharacters in it (e.g. "C++", "A.I.")
        # are matched literally instead of crashing or mis-matching.
        # Find the word, with optional punctuation after, and optional quotes before or after
        findWordRegex = rf'(\b["\']?{re.escape(entryText)}[.,!?]?["\']?\b)'
        if isCaseSensitive:
            # Uses group reference, so remember regex must be in parentheses
            text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text)
        else:
            text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text, flags=re.IGNORECASE)
    return text

def add_alias_tags(text, aliasList=None):
    """Replace user-specified words/phrases with their spoken aliases.

    Args:
        text: The text that will be spoken.
        aliasList: Optional list of alias entry dicts shaped like the rows of
            aliases.csv. Defaults to the entries loaded from aliases.csv.

    Returns:
        The text with each matched original phrase replaced by its alias.
    """
    if aliasList is None:
        aliasList = aliasEntries
    for entryDict in aliasList:
        # Get entry info
        entryText = entryDict['Original Text']
        entryAlias = entryDict['Alias']
        # An empty case-sensitivity cell defaults to case-insensitive matching
        if entryDict['Case Sensitive (True/False)'] == "":
            isCaseSensitive = False
        else:
            isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])

        # Escape the phrase so regex metacharacters in it are matched literally.
        # Find the word, with optional punctuation after, and optional quotes before or after
        findWordRegex = rf'\b["\']?{re.escape(entryText)}[.,!?]?["\']?\b'
        # Use a callable replacement so backslashes in the alias are inserted
        # literally instead of being interpreted as regex group references.
        replaceWithAlias = lambda match: entryAlias
        if isCaseSensitive:
            text = re.sub(findWordRegex, replaceWithAlias, text)
        else:
            text = re.sub(findWordRegex, replaceWithAlias, text, flags=re.IGNORECASE)
    return text

# =============================================================================================================================

# Build API request for google text to speech, then execute
def synthesize_text_google(text, speedFactor, voiceName, voiceGender, languageCode, audioEncoding=audioEncoding):
# Keep speedFactor between 0.25 and 4.0
Expand Down Expand Up @@ -111,7 +170,10 @@ def synthesize_text_azure(text, speedFactor, voiceName, languageCode):
if not azureSentencePause == 'default' and azureSentencePause.isnumeric():
pauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{azureSentencePause}ms"/>'
else:
pauseTag = ''
pauseTag = ''

# Process text using pronunciation customization set by user
text = add_all_pronunciation_overrides(text)

# Create SSML syntax for Azure TTS
ssml = f"<speak version='1.0' xml:lang='{languageCode}' xmlns='http://www.w3.org/2001/10/synthesis' " \
Expand Down Expand Up @@ -181,6 +243,9 @@ def create_request_payload(remainingEntriesDict):
else:
pauseTag = ''

# Process text using pronunciation customization set by user
text = add_all_pronunciation_overrides(text)

# Create the SSML for each subtitle
ssml = f"<speak version='1.0' xml:lang='{language}' xmlns='http://www.w3.org/2001/10/synthesis' " \
"xmlns:mstts='http://www.w3.org/2001/mstts'>" \
Expand Down
8 changes: 4 additions & 4 deletions TitleTranslator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from utils import parseBool
GOOGLE_TTS_API, GOOGLE_TRANSLATE_API = auth.first_authentication()

outputFolder = "output"
outputFolder = "Outputs"

import langcodes
import sys
Expand All @@ -51,7 +51,7 @@

# Parse the description for hyperlinks and put the tags <span class="notranslate"></span> around them
# This prevents Google Translate from translating the links
description = re.sub(r'(https?://[^\s]+)', r' <span class="notranslate">\1</span> ', description)
description = re.sub(r'(https?://[^\s]+)', r' <span class="notranslate">\1</span> ', description, flags=re.IGNORECASE)

# Use span class="notranslate" to prevent translating certain characters
for char in noTranslateList:
Expand Down Expand Up @@ -202,7 +202,7 @@ def translate(originalLanguage, singleLangDict, translationList):
langData['translated_description'].insert(i, '')

# Write the translated text to a file
with open(outputFolder + '/Translated Titles and Descriptions.txt', 'w', encoding='utf-8') as f:
with open(os.path.join(outputFolder , 'Translated Titles and Descriptions.txt'), 'w', encoding='utf-8') as f:
for langNum, langData in batchSettings.items():
title_translated = langData['translated_title']
description_translated = langData['translated_description']
Expand All @@ -227,6 +227,6 @@ def translate(originalLanguage, singleLangDict, translationList):
langData['translated_description'] = '\n'.join(langData['translated_description'])

# Write the translated items to a json file
with open(outputFolder + '/Translated Items.json', 'w', encoding='utf-8') as f:
with open(os.path.join(outputFolder , 'Translated Items.json'), 'w', encoding='utf-8') as f:
json.dump(batchSettings, f, indent=4)

7 changes: 4 additions & 3 deletions audio_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@
from pydub.silence import detect_leading_silence
import langcodes

# MOVE THIS INTO A VARIABLE AT SOME POINT
outputFolder = "output"

# Set working folder
workingFolder = "workingFolder"

Expand All @@ -36,6 +33,10 @@
tts_service = cloudConfig['CLOUD']['tts_service']
debugMode = parseBool(config['SETTINGS']['debug_mode'])

# MOVE THIS INTO A VARIABLE AT SOME POINT
outputDirectory = "Outputs"
outputFolder = os.path.join(outputDirectory , os.path.splitext(os.path.basename(originalVideoFile))[0] + ' (Output)')

def trim_clip(inputSound):
trim_leading_silence: AudioSegment = lambda x: x[detect_leading_silence(x) :]
trim_trailing_silence: AudioSegment = lambda x: trim_leading_silence(x.reverse()).reverse()
Expand Down
26 changes: 18 additions & 8 deletions translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Imports
import auth
from utils import parseBool
import utils

import configparser
from operator import itemgetter
Expand Down Expand Up @@ -39,13 +40,15 @@
outputDirectory = "Outputs"
outputFolder = os.path.join(outputDirectory , os.path.splitext(os.path.basename(originalVideoFile))[0] + ' (Output)')

# ---------------------------------------------------------------------------------------

# Add span tags around certain words to exclude them from being translated
dontTranslateList = [] # Placeholder for now
noTranslateOverrideFile = os.path.join('SSML_Customization', 'dont_translate_phrases.txt')
dontTranslateList = utils.txt_to_list(noTranslateOverrideFile)

def add_notranslate_tags(dontTranslateList, text):
#dontTranslateList = [word.lower() for word in dontTranslateList]
def add_notranslate_tags(text, phraseList=None):
    """Wrap do-not-translate phrases in <span class="notranslate"> tags.

    The translation service leaves text inside these spans untranslated
    (the file's README states this works for Google Translate and DeepL).
    Matching is always case-insensitive.

    Args:
        text: The text that will be sent for translation.
        phraseList: Optional list of phrases to protect. Defaults to the
            phrases loaded from dont_translate_phrases.txt.

    Returns:
        The text with each matched phrase wrapped in a notranslate span.
    """
    if phraseList is None:
        phraseList = dontTranslateList
    for word in phraseList:
        # Escape the phrase so regex metacharacters in it are matched literally.
        # Find the word, with optional punctuation after, and optional quotes before or after
        findWordRegex = rf'(\b["\']?{re.escape(word)}[.,!?]?["\']?\b)'
        text = re.sub(findWordRegex, r' <span class="notranslate">\1</span> ', text, flags=re.IGNORECASE)
    return text

Expand All @@ -57,6 +60,11 @@ def remove_notranslate_tags(text):
# Note: This function was almost entirely written by GPT-3 after feeding it my original code and asking it to change it so it
# would break up the text into chunks if it was too long. It appears to work

def process_response_text(text):
    """Post-process one translated string: decode HTML entities, then strip
    the notranslate span tags that were added before translation."""
    return remove_notranslate_tags(html.unescape(text))

# Translate the text entries of the dictionary
def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):
targetLanguage = langDict['targetLanguage']
Expand All @@ -68,11 +76,13 @@ def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):

for key in inputSubsDict:
originalText = inputSubsDict[key]['text']
textToTranslate.append(originalText)
# Add the text to the list of text to be translated, and also add the span tags around the words that shouldn't be translated
textToTranslate.append(add_notranslate_tags(originalText))

# Calculate the total number of utf-8 codepoints
codepoints = 0
for text in textToTranslate:
text = add_notranslate_tags(text)
codepoints += len(text.encode("utf-8"))

# If the codepoints are greater than 28000, split the request into multiple
Expand Down Expand Up @@ -108,7 +118,7 @@ def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):
).execute()

# Extract the translated texts from the response
translatedTexts = [html.unescape(response['translations'][i]['translatedText']) for i in range(len(response['translations']))]
translatedTexts = [process_response_text(response['translations'][i]['translatedText']) for i in range(len(response['translations']))]

# Add the translated texts to the dictionary
# Divide the dictionary into chunks of 100
Expand All @@ -125,7 +135,7 @@ def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):
result = auth.DEEPL_API.translate_text(chunk, target_lang=targetLanguage, formality=formality)

# Extract the translated texts from the response
translatedTexts = [html.unescape(result[i].text) for i in range(len(result))]
translatedTexts = [process_response_text(result[i].text) for i in range(len(result))]

# Add the translated texts to the dictionary
for i in range(chunkSize):
Expand All @@ -151,7 +161,7 @@ def translate_dictionary(inputSubsDict, langDict, skipTranslation=False):
#'glossaryConfig': {}
}
).execute()
translatedTexts = [html.unescape(response['translations'][i]['translatedText']) for i in range(len(response['translations']))]
translatedTexts = [process_response_text(response['translations'][i]['translatedText']) for i in range(len(response['translations']))]

# Add the translated texts to the dictionary
for i, key in enumerate(inputSubsDict):
Expand Down
21 changes: 21 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import csv

# Interprets a string as a boolean. Returns True or False
def parseBool(string):
if type(string) == str:
if string.lower() == 'true':
Expand All @@ -11,3 +14,21 @@ def parseBool(string):
return False
else:
raise ValueError('Not a valid boolean string')

# Returns a list of dictionaries from a csv file. Where the key is the column name and the value is the value in that column
# The column names are set by the first row of the csv file
# Returns a list of dictionaries from a csv file, where each key is a column
# name (taken from the first row) and each value is that row's cell value.
def csv_to_dict(csvFilePath):
    # utf-8-sig strips a leading BOM (e.g. from Excel) so it does not
    # pollute the first column name.
    with open(csvFilePath, "r", encoding='utf-8-sig') as csvFile:
        return list(csv.DictReader(csvFile))

# Returns a list of strings from a txt file. Ignores empty lines and lines that start with '#'
# Returns a list of stripped lines from a txt file. Blank lines and lines
# whose first non-whitespace character is '#' (comments) are skipped.
def txt_to_list(txtFilePath):
    # utf-8-sig tolerates a BOM at the start of the file.
    with open(txtFilePath, "r", encoding='utf-8-sig') as txtFile:
        strippedLines = (rawLine.strip() for rawLine in txtFile)
        return [line for line in strippedLines if line and not line.startswith('#')]

0 comments on commit 98ca741

Please sign in to comment.