In [21]:
import os
import re

import matplotlib.pyplot as plt
import parselmouth
import textgrids
from parselmouth.praat import call

In [22]:
# Paths to the required files (all relative to the notebook's working directory)
son = './data/Logatomes.wav'                # recording of the logatomes
grille = './data/Logatomes.TextGrid'        # TextGrid segmentation of that recording
synthese = './data/results/resultat.wav'    # base name for the synthesized output files
sentences_file = './data/sentences.txt'     # one candidate sentence per line
chemin_dico = './data/dico_UTF8.txt'        # tab-separated word -> SAMPA dictionary

# The recording and the TextGrid are loaded by the next cell, which handles a
# missing file; loading them here as well only parsed both files twice.

In [23]:
# Load the sound file and the TextGrid used for segmentation.
# Both paths are checked up front so that a missing file always produces the
# same clear FileNotFoundError, independent of the exception type each loader
# uses internally. The error is raised (instead of calling exit()) so that the
# notebook run stops at this cell with a normal traceback.
for chemin_requis in (son, grille):
    if not os.path.isfile(chemin_requis):
        raise FileNotFoundError(f"Erreur de chargement des fichiers : {chemin_requis}")

sound = parselmouth.Sound(son)
segmentation = textgrids.TextGrid(grille)

In [24]:
# Retrieve the 'phonemes' tier of the TextGrid. Consecutive intervals of this
# tier are paired up to form diphones by the functions defined below.
diphones = segmentation['phonemes']

# Short excerpt (0 s to 0.01 s) of the recording, created as a seed for concatenation.
# NOTE(review): `start` is not referenced by any code visible in this notebook — confirm before removing.
start = sound.extract_part(0, 0.01, parselmouth.WindowShape.RECTANGULAR, 1, False)
# Symbols treated as long vowels (SAMPA notation); read by contains_long_vowel
long_vowels = ['E', 'a', 'o', 'C', 'A', 'I', '2']
# Symbols treated as vowels; read by elongating_vowel_consonant
vowels = ['e', 'E', 'a', 'A', 'O', 'o', '9', '2', 'i', 'y', 'u', 'I']
# Consonants treated as lengthening the vowel before them; read by elongating_vowel_consonant
elongating_consonants = ['R', 'v', 'z', 'Z']

In [25]:
# Initializing and filling the SAMPA pronunciation dictionary
def initialiser_dico(chemin_dico):
    """
    Load the word -> SAMPA pronunciation dictionary from a text file.

    Each non-blank line must contain exactly two tab-separated fields: the
    word and its SAMPA transcription. Blank lines are skipped. If a word
    appears several times, the last occurrence wins.

    The file is decoded as UTF-8 (a leading byte-order mark is tolerated) so
    that accented words are read identically on every platform instead of
    depending on the locale's default encoding.

    Args:
    chemin_dico (str): Path to the file containing the SAMPA dictionary.

    Returns:
    dict: A dictionary where keys are words and values are their SAMPA transcriptions.

    Raises:
    ValueError: If a non-blank line does not contain exactly two tab-separated fields.
    OSError: If the file cannot be opened.
    """
    dico = {}
    # 'utf-8-sig' behaves like 'utf-8' but drops a BOM, which would otherwise
    # become part of the first key.
    with open(chemin_dico, 'r', encoding='utf-8-sig') as f:
        for numero, line in enumerate(f, start=1):
            entree = line.strip()
            if not entree:
                # A stray empty line (typically at the end of the file) is not an error
                continue
            champs = entree.split('\t')
            if len(champs) != 2:
                raise ValueError(
                    f"{chemin_dico}, line {numero}: expected 'word<TAB>SAMPA', got {entree!r}"
                )
            key, value = champs
            dico[key] = value
    return dico

In [26]:
def should_make_liaison(mot_phonetique, mot_suivant, dico):
    """
    Decide whether a liaison is inserted between a word and the next one.

    This is a deliberately simplified rule. A liaison is requested when both
    conditions hold:
      * the string given for the current word ends with "e", "es" or "ent";
      * the dictionary entry of the next word starts with one of
        "a", "e", "i", "o", "u", "y".

    A next word that is absent from the dictionary never triggers a liaison.

    Args:
    mot_phonetique (str): The string representing the current word (the caller
        passes its phonetic form).
    mot_suivant (str): The next word, used as a key into `dico`.
    dico (dict): A dictionary mapping words to their phonetic representations.

    Returns:
    bool: True if a liaison should be made, False otherwise.
    """
    terminaisons = ("e", "es", "ent")
    initiales = ("a", "e", "i", "o", "u", "y")

    if not mot_phonetique.endswith(terminaisons):
        return False
    # Unknown words map to "", which starts with none of the initials
    transcription_suivante = dico.get(mot_suivant, "")
    return transcription_suivante.startswith(initiales)

In [27]:
def conversion_SAMPA_avec_liaisons(phrase):
    """
    Convert a sentence to its SAMPA phonetic representation, inserting
    liaisons and pause markers.

    Processing steps:
      1. '?', '.' and '!' are removed.
      2. The sentence is split on whitespace (runs of spaces do not create
         empty words).
      3. Each word, stripped of commas, is looked up in the SAMPA dictionary
         loaded from `chemin_dico`. If it is not found and contains an
         uppercase letter, the lowercase form is looked up. A word that is
         still unknown is kept as written.
      4. A word that ended with a comma is followed by the pause marker "_".
      5. Otherwise, if `should_make_liaison` accepts the pair (current
         transcription, next word), a "z" is inserted. The next word is
         stripped of its comma first so that the dictionary lookup can succeed.
      6. Everything is joined without separators and wrapped in "_" on both sides.

    Args:
    phrase (str): The original sentence.

    Returns:
    str: The sentence in SAMPA, delimited by leading and trailing "_".
    """
    # Remove special characters '?', '.', and '!'
    phrase = re.sub(r'[?.!]', '', phrase)

    dico = initialiser_dico(chemin_dico)
    # split() without argument ignores repeated/leading/trailing whitespace,
    # so no empty pseudo-word can sit between two real words
    mots = phrase.split()
    morceaux = []
    dernier = len(mots) - 1

    for i, mot in enumerate(mots):
        virgule_presente = mot.endswith(',')
        mot_sans_virgule = mot.replace(',', '')

        mot_phonetique = dico.get(mot_sans_virgule, mot_sans_virgule)
        # Not found as written: retry in lowercase (e.g. capitalized first word)
        if mot_phonetique == mot_sans_virgule and any(c.isupper() for c in mot_sans_virgule):
            mot_phonetique = dico.get(mot_sans_virgule.lower(), mot_sans_virgule)

        morceaux.append(mot_phonetique)

        if virgule_presente:
            # A comma is rendered as a pause and blocks any liaison
            morceaux.append("_")
        elif i < dernier:
            # Dictionary keys carry no punctuation, so drop the comma that may
            # be attached to the next word before testing for a liaison
            mot_suivant = mots[i + 1].replace(',', '')
            if should_make_liaison(mot_phonetique, mot_suivant, dico):
                morceaux.append("z")  # Add 'z' for the liaison

    return "_" + "".join(morceaux) + "_"


In [28]:
def extraction_diphone(diphones, diphone1, diphone2, sound):
    """
    Cut one diphone out of the recording.

    The tier is scanned for the first pair of adjacent intervals labelled
    `diphone1` then `diphone2`. The excerpt runs from the middle of the first
    interval to the middle of the second one, each boundary being moved to the
    nearest zero crossing (channel 1) to limit clicks at the splice points.

    Args:
    diphones (list): Intervals of the phoneme tier; each exposes `text`, `xmin` and `xmax`.
    diphone1 (str): Label of the first phoneme.
    diphone2 (str): Label of the second phoneme.
    sound (parselmouth.Sound): The recording the tier describes.

    Returns:
    parselmouth.Sound: The extracted excerpt, or None when the pair does not
                       occur in the tier.
    """
    for position in range(1, len(diphones)):
        gauche, droite = diphones[position - 1], diphones[position]
        if gauche.text != diphone1 or droite.text != diphone2:
            continue

        # Temporal centre of each of the two phonemes
        centre_gauche = (gauche.xmin + gauche.xmax) / 2
        centre_droite = (droite.xmin + droite.xmax) / 2

        # Snap both cut points to a zero crossing
        debut = sound.get_nearest_zero_crossing(centre_gauche, 1)
        fin = sound.get_nearest_zero_crossing(centre_droite, 1)

        # First match wins: return the excerpt as a new sound object
        return sound.extract_part(debut, fin, parselmouth.WindowShape.RECTANGULAR, 1, False)

    # No adjacent pair carries the requested labels
    return None

In [29]:
def contains_long_vowel(diphone):
    """
    Tell whether any symbol listed in `long_vowels` occurs in the diphone.

    Args:
    diphone (str): A diphone string to be checked.

    Returns:
    bool: True if at least one long-vowel symbol is a substring of `diphone`,
          False otherwise.
    """
    return any(symbole in diphone for symbole in long_vowels)

In [30]:
def elongating_vowel_consonant(diphone):
    """
    Check whether the diphone is a vowel followed by an elongating consonant.

    The first symbol must be listed in `vowels` and the second one in
    `elongating_consonants`. The result is used to decide whether the
    duration of the diphone is kept at full length during synthesis.

    Args:
    diphone (str): A diphone string to be checked (two symbols; any further
        symbols are ignored).

    Returns:
    bool: True if the diphone starts with a vowel immediately followed by an
          elongating consonant, False otherwise (including strings shorter
          than two symbols).
    """
    # Only strings too short to hold two symbols are rejected. The previous
    # guard (`<= 2`) also rejected every regular two-symbol diphone, so the
    # function could never return True for the diphones built by the caller.
    if len(diphone) < 2:
        return False
    return diphone[0] in vowels and diphone[1] in elongating_consonants


In [31]:
def generate_possible_diphones_list(diphones):
    """
    Collect every diphone available in the tier.

    A diphone is the concatenation of the labels of two adjacent intervals.
    Despite the function's name, the result is a set, so each diphone appears
    once however often it occurs in the tier.

    Args:
    diphones (list): Intervals of the phoneme tier; each exposes a `text` label.

    Returns:
    set: The distinct diphone strings; empty when the tier has fewer than two intervals.
    """
    return {
        diphones[rang].text + diphones[rang + 1].text
        for rang in range(len(diphones) - 1)
    }

In [32]:
def modif_duree(extrait, diphone, sentence_type, start_of_sentence=False, end_of_sentence=False):
    """
    Rescale the duration of one diphone excerpt.

    A Praat Manipulation object is built from the excerpt, its duration tier
    is emptied, and a single point holding the scaling factor is placed at the
    middle of the excerpt before overlap-add resynthesis.

    The scaling factor is chosen as follows (later rules override earlier ones):
      * 0.90 by default;
      * 1.00 if the diphone contains a long vowel or is a vowel followed by
        an elongating consonant;
      * 1.05 for the last diphone of a question;
      * 0.85 for the last diphone of a declaration;
      * 0.80 for every diphone of an exclamation.

    Args:
    extrait (parselmouth.Sound): The sound object to be manipulated.
    diphone (str): The diphone (pair of phonemes) represented in the sound segment.
    sentence_type (str or None): 'question', 'declaration', 'exclamation', or
        anything else for no sentence-specific rule.
    start_of_sentence (bool, optional): Accepted for the caller's convenience;
        currently has no effect on the result.
    end_of_sentence (bool, optional): Flag indicating if the segment is at the end of a sentence.

    Returns:
    parselmouth.Sound: The sound object with modified duration.
    """
    # Build the manipulation object and start from an empty duration tier
    manipulation = call(extrait, "To Manipulation", 0.01, 75, 600)
    tier_duree = call(manipulation, "Extract duration tier")
    call(tier_duree, "Remove points between", 0, extrait.duration)

    # Phonetic rule: lengthening contexts keep their full duration
    contexte_long = contains_long_vowel(diphone) or elongating_vowel_consonant(diphone)
    facteur = 1.00 if contexte_long else 0.90

    # Prosodic rules take precedence over the phonetic one
    if sentence_type == "exclamation":
        facteur = 0.80
    elif end_of_sentence and sentence_type == "question":
        facteur = 1.05
    elif end_of_sentence and sentence_type == "declaration":
        facteur = 0.85

    # One point in the middle of the excerpt carries the factor
    call(tier_duree, "Add point", extrait.duration / 2, facteur)
    call([manipulation, tier_duree], "Replace duration tier")
    return call(manipulation, "Get resynthesis (overlap-add)")

In [33]:
def modif_pitch_commas(extrait, comma_adjustment=False):
    """
    Raise the pitch of a diphone excerpt that precedes a comma pause.

    A Praat Manipulation object is built from the excerpt. When
    `comma_adjustment` is set, a pitch point of 215 Hz (reference 195 Hz + 20)
    is added at the end of the excerpt and all frequencies of the pitch tier
    are multiplied by 1.07. The tier is then put back and the sound is
    resynthesized with overlap-add. Without `comma_adjustment` the pitch tier
    is put back unchanged; the excerpt still goes through resynthesis.

    Args:
    extrait (parselmouth.Sound): The sound object to be manipulated.
    comma_adjustment (bool, optional): Flag indicating if there's a comma adjustment needed (increased pitch).

    Returns:
    parselmouth.Sound: The resynthesized sound object.
    """
    manipulation = call(extrait, "To Manipulation", 0.01, 75, 600)
    tier_pitch = call(manipulation, "Extract pitch tier")

    if comma_adjustment:
        pitch_reference = 195   # Reference pitch in Hz
        facteur_virgule = 1.07  # Global raise applied before a comma
        call(tier_pitch, "Add point", extrait.duration, pitch_reference + 20)
        call(tier_pitch, "Multiply frequencies", extrait.xmin, extrait.xmax, facteur_virgule)

    # Put the (possibly edited) tier back and resynthesize
    call([manipulation, tier_pitch], "Replace pitch tier")
    return call(manipulation, "Get resynthesis (overlap-add)")

In [34]:
def modif_pitch(extrait, sentence_type, base_pitch=195):
    """
    Apply a sentence-level pitch contour according to the sentence type.

    Pitch points are added to the pitch tier of the sound at fixed fractions
    of its duration, as offsets (in Hz) from `base_pitch`:
      * question:    +10 at 25 %, +20 at 50 %, +35 at 75 %, +75 at the end (rising);
      * declaration:  -5 at 25 %, -10 at 50 %, -25 at 75 %, -45 at the end (falling);
      * exclamation: +20 at 25 %, +30 at 50 %, +50 at the end.
    Any other sentence type adds no point. The sound is then resynthesized
    with overlap-add.

    Args:
    extrait (parselmouth.Sound): The sound object containing the whole sentence.
    sentence_type (str or None): The type of the sentence ('question', 'declaration', 'exclamation').
    base_pitch (int or float, optional): Reference pitch in Hz the offsets are added to.

    Returns:
    parselmouth.Sound: The sound object with the modified pitch.
    """
    # (fraction of the duration, offset from base_pitch in Hz), in insertion order
    contours = {
        "question": ((0.25, 10), (0.5, 20), (0.75, 35), (1.0, 75)),
        "declaration": ((0.25, -5), (0.5, -10), (0.75, -25), (1.0, -45)),
        "exclamation": ((0.25, 20), (0.5, 30), (1.0, 50)),
    }

    manipulation = call(extrait, "To Manipulation", 0.01, 75, 600)
    tier_pitch = call(manipulation, "Extract pitch tier")

    for fraction, decalage in contours.get(sentence_type, ()):
        call(tier_pitch, "Add point", extrait.duration * fraction, base_pitch + decalage)

    # Put the edited tier back into the manipulation object and resynthesize
    call([tier_pitch, manipulation], "Replace pitch tier")
    return call(manipulation, "Get resynthesis (overlap-add)")

In [35]:
def determine_sentence_type(sentence):
    """
    Determine the type of a sentence from its final punctuation mark.

    Trailing whitespace (spaces, newline) is ignored, so a sentence typed with
    a space after the final mark or read with its line ending is still
    recognized.

    Args:
    sentence (str): The sentence to be analyzed for its type.

    Returns:
    str or None: 'question' for a final '?', 'declaration' for a final '.',
        'exclamation' for a final '!', and None in every other case
        (including an empty sentence).
    """
    types_par_ponctuation = (
        ("?", "question"),
        (".", "declaration"),
        ("!", "exclamation"),
    )
    # Look at the last visible character, not at trailing blanks
    texte = sentence.rstrip()
    for ponctuation, type_phrase in types_par_ponctuation:
        if texte.endswith(ponctuation):
            return type_phrase
    return None

In [36]:
def _read_sentences(sentences_file):
    """Return the non-blank lines of the sentence file, stripped, in file order."""
    # Decoded explicitly as UTF-8 so accented words do not depend on the
    # platform's default encoding
    with open(sentences_file, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file if line.strip()]


def _prepare_sentence(sentence):
    """Return the (SAMPA representation, sentence type) pair for one sentence."""
    sentence_type = determine_sentence_type(sentence)
    return conversion_SAMPA_avec_liaisons(sentence), sentence_type


def ask_sentence(sentences_file):
    """
    Prompt the user to input or choose the sentence(s) to synthesize.

    Three options are offered:
      1. type a sentence (it is stripped and converted to lowercase);
      2. pick one sentence from the numbered list read from `sentences_file`;
      3. process every sentence of `sentences_file`.
    The menu is shown again until a valid option is entered, and for option 2
    the selection is asked again until it is a number within the list. Blank
    lines of the file are ignored.

    Each selected sentence is converted to SAMPA with liaisons and paired with
    its type ('question', 'declaration', 'exclamation' or None).

    Args:
    sentences_file (str): The file path to the file containing a list of sentences.

    Returns:
    list: A list of tuples, each containing a sentence in SAMPA representation
        and its type. The list is empty when option 2 or 3 is chosen and the
        file holds no sentence.

    Raises:
    OSError: If `sentences_file` cannot be opened (options 2 and 3).
    """
    menu = (
        "1. Enter your own sentence. \n"
        "2. Choose a sentence from the list. \n"
        "3. Process all sentences from the file sentence.txt.\n"
        "Enter your choice (1, 2, or 3): "
    )

    # Ask again in a loop (rather than by recursion) until the option is valid
    while True:
        choice = input(menu).strip()
        if choice in ("1", "2", "3"):
            break
        print("Invalid option. Please select 1, 2 or 3.")

    # Option 1: free text typed by the user
    if choice == "1":
        sentence = input("Enter a sentence: ").strip().lower()
        return [_prepare_sentence(sentence)]

    sentences = _read_sentences(sentences_file)

    # Option 3: the whole file
    if choice == "3":
        return [_prepare_sentence(sentence) for sentence in sentences]

    # Option 2: one sentence picked from the numbered list
    if not sentences:
        print(f"No sentence found in {sentences_file}.")
        return []
    for i, sent in enumerate(sentences):
        print(f"{i + 1}. {sent}")
    while True:
        answer = input(f"Choose a sentence (1-{len(sentences)}): ")
        try:
            selection = int(answer)
        except ValueError:
            selection = 0  # falls into the out-of-range branch below
        # Reject 0 and negatives explicitly: as list indices they would
        # silently select a sentence counted from the end
        if 1 <= selection <= len(sentences):
            break
        print(f"Invalid selection. Please enter a number between 1 and {len(sentences)}.")
    return [_prepare_sentence(sentences[selection - 1])]

In [37]:
def main():
    """
    Synthesize the sentence(s) chosen by the user with a diphone-based approach.

    For every sentence, each pair of adjacent SAMPA symbols is looked up in
    the recording; the excerpts found have their duration adjusted, receive a
    pitch raise before a comma pause, and are concatenated. If every diphone
    of the sentence exists in the TextGrid, the sentence-level pitch contour
    is applied and the result is saved as '<synthese base name>_sentence_<n>.wav'
    (the output directory is created if needed). Otherwise the missing
    diphones are reported and nothing is saved for that sentence.

    Any exception is reported with a message instead of being propagated.
    """
    try:
        # Generate a list of all possible diphones
        all_diphones = generate_possible_diphones_list(diphones)

        # Ask the user to enter a sentence or choose one from the file
        sentences_to_process = ask_sentence(sentences_file)

        # Output files share the base name of `synthese` (extension removed)
        base_name = os.path.splitext(synthese)[0]

        for index, (sampa_sentence, sentence_type) in enumerate(sentences_to_process):
            print("\n")
            print(f"Processing sentence {index+1}: SAMPA representation - {sampa_sentence}")

            user_diphones = set()
            segments = []
            last_position = len(sampa_sentence) - 2  # index of the final diphone

            for position in range(len(sampa_sentence) - 1):
                diphone1 = sampa_sentence[position]
                diphone2 = sampa_sentence[position + 1]
                diphone = diphone1 + diphone2
                user_diphones.add(diphone)

                # Extract the diphone from the sound
                extrait = extraction_diphone(diphones, diphone1, diphone2, sound)
                # Explicit None test: extraction_diphone returns None when the
                # diphone is absent, and the truth value of a sound object is
                # not what should be tested here
                if extrait is None:
                    continue

                extrait = modif_duree(
                    extrait,
                    diphone,
                    sentence_type,
                    start_of_sentence=(position == 0),
                    end_of_sentence=(position == last_position),
                )

                # A diphone ending with "_" inside the sentence precedes a comma pause
                comma_adjustment = diphone.endswith("_") and position != last_position
                extrait = modif_pitch_commas(extrait, comma_adjustment=comma_adjustment)

                segments.append(extrait)

            # Check for any missing diphones in the TextGrid and save the synthesized sound
            missing_diphones = user_diphones.difference(all_diphones)
            if missing_diphones:
                print(f"Diphones missing in the TextGrid for sentence {index+1}:", missing_diphones)
                continue

            print(f"All diphones for sentence {index+1} are present in the TextGrid.")
            if not segments:
                continue

            # Concatenate once at the end: joining inside the loop re-copied
            # the whole accumulated signal for every diphone
            if len(segments) == 1:
                sound_concat = segments[0]
            else:
                sound_concat = parselmouth.Sound.concatenate(segments)
            sound_concat = modif_pitch(sound_concat, sentence_type)

            # Save the synthesized sound to a file
            file_name = f"{base_name}_sentence_{index+1}.wav"
            os.makedirs(os.path.dirname(file_name) or ".", exist_ok=True)
            sound_concat.save(file_name, 'WAV')
            print(f"Synthesis for sentence {index+1} complete. File saved as {file_name}")

    except Exception as e:
        print(f"Une erreur est survenue : {e}")

In [38]:
# Execute the main function: prompts for the sentence(s) to synthesize and
# prints one progress report per sentence (see the output below this cell).
# The guard keeps main() from running if this code is imported as a module.
if __name__ == '__main__':
    main()



Processing sentence 1: SAMPA representation - _lotOnapORtynfRES9RbiIv@nySasAlaSal9REstival_
All diphones for sentence 1 are present in the TextGrid.
Synthesis for sentence 1 complete. File saved as ./data/results/resultat_sentence_1.wav


Processing sentence 2: SAMPA representation - _lotOnapORtyndus9RbiIv@nySasAlaSal9REstival_
All diphones for sentence 2 are present in the TextGrid.
Synthesis for sentence 2 complete. File saved as ./data/results/resultat_sentence_2.wav


Processing sentence 3: SAMPA representation - _lesitRujORnlepORSselebRAlasEzCd@lotOn_
All diphones for sentence 3 are present in the TextGrid.
Synthesis for sentence 3 complete. File saved as ./data/results/resultat_sentence_3.wav


Processing sentence 4: SAMPA representation - _lesitRujORnlepORS_selebRAmanjifik@mAlasEzCd@lotOn_
All diphones for sentence 4 are present in the TextGrid.
Synthesis for sentence 4 complete. File saved as ./data/results/resultat_sentence_4.wav


Processing sentence 5: SAMPA representation