## libraries used throughout the program

In [1]:
import re
import os
from subprocess import call
import json
import pandas as pd
import sys
from pydub import AudioSegment

## this method is used to split a paragraph into sentences
- this creates one entry per sentence in the json file
- the resulting sentences are later also used to cut the matching audio of each sentence out of the original mp3 file

In [2]:
def splitParagraphIntoSentences(paragraph):
    """Normalise one transcript paragraph and split it into sentences.

    Before splitting, the text is cleaned up so that periods which do not end
    a sentence cannot trigger a split and so that transcript markup is reduced:

    - "U.S.", "Mr.", "Mrs." and "Ms." lose their periods
    - "Operator:" is shortened to "O:"
    - "[ph]" markers are removed
    - parenthesised groups made of digits and colons that start with a digit,
      e.g. "(00:12:34)", are removed
    - speaker tags of the form "<Q ...>: " / "<A ...>: " become "Q: " / "A: "

    Parameters
    ----------
    paragraph : str
        The text to clean and split.

    Returns
    -------
    list of str
        The pieces obtained by splitting after every '.', '!' or '?' that is
        directly followed by one whitespace character (that character is
        consumed). The list may contain empty strings, e.g. a trailing one
        when the paragraph ends with a period followed by a newline.
    """
    # (pattern, replacement) pairs, applied in this order
    substitutions = (
        (r'U\.S\.', 'US'),
        (r'Mr\.', 'Mr'),
        (r'Mrs\.', 'Mrs'),
        (r'Ms\.', 'Ms'),
        (r'Operator:', 'O:'),
        (r'\[ph\]', ''),
        # matches the same strings as the former '(\d+\:*)+' group, but without
        # nested quantifiers, so an unclosed "(123456..." cannot cause
        # exponential backtracking
        (r'\(\d[\d:]*\)', ''),
        # non-greedy: stop at the FIRST ">: " so that several speaker tags in
        # one paragraph do not swallow the text between them
        (r'<Q.*?>:\s', 'Q: '),
        (r'<A.*?>:\s', 'A: '),
    )
    for pattern, replacement in substitutions:
        paragraph = re.sub(pattern, replacement, paragraph)

    # split after sentence-ending punctuation, but only if it is followed by
    # a whitespace character (raw string: no invalid escape sequences)
    sentenceEnders = re.compile(r'(?<=[.!?])\s')
    return sentenceEnders.split(paragraph)

## here the text gets preprocessed the first time and stored in a new file in a subfolder
- the original file has a new line after every 11th word
- with preprocessing these newlines are being removed
- this means that one whole conversation of one person gets stored as one paragraph in the new file
## stop the execution after here to make manual adjustments to the text

In [3]:
def processTextFileToParagraph(path, text_file_name, process_folder, text_file):
    """Join the hard-wrapped lines of a transcript into one line per paragraph.

    Reads ``path + text_file_name + '.txt'`` and writes the result to
    ``process_folder + text_file + 'PreprocessToParagraphs.txt'``.

    Every non-empty input line is written as a single space followed by the
    line without its trailing newline, so consecutive lines end up on one
    output line. Empty lines are copied unchanged and therefore act as the
    paragraph separators of the output file.

    Parameters
    ----------
    path : str
        Folder prefix of the original transcript (concatenated as-is, so it
        must already end with a path separator).
    text_file_name : str
        Name of the transcript file without the '.txt' extension.
    process_folder : str
        Folder prefix for the output file (concatenated as-is).
    text_file : str
        Base name used for the output file.
    """
    source_name = path + text_file_name + '.txt'
    target_name = process_folder + text_file + 'PreprocessToParagraphs.txt'

    # the source is opened first, so a missing transcript does not leave an
    # empty output file behind
    with open(source_name) as f, open(target_name, "w") as f1:
        for line in f:
            # NOTE: this loop used to call line.replace(".", ". ") and throw
            # the result away (str.replace returns a new string), so it never
            # had any effect. The dead statement was removed instead of being
            # "activated": inserting a space after every period would break
            # abbreviations such as "U.S." and decimal numbers.
            if line in ('\n', '\r\n'):
                # empty line: keep it as the paragraph break
                f1.write(line)
            else:
                # glue the line to the current paragraph
                f1.write(' ' + line.rstrip('\n'))

## here the text gets processed a second time and stored again in a new file in the same folder
- this step is splitting the paragraph back to sentences
- the json file will include then one sentence per entry

In [4]:
def processTextFileToSentences(process_folder, text_file):
    """Split the paragraph file into a file with one sentence per line.

    Reads ``process_folder + text_file + 'PreprocessToParagraphs.txt'`` and
    writes ``process_folder + text_file + 'PreprocessToSentences.txt'``.

    Paragraphs that contain one of the boilerplate markers in ``not_needed``
    are dropped, unless the paragraph also contains ' Q&A' (with a leading
    space), in which case it is kept. Every kept paragraph is split with
    ``splitParagraphIntoSentences``; each non-empty piece is written on its
    own line with runs of whitespace collapsed to single spaces.

    Parameters
    ----------
    process_folder : str
        Folder prefix of both files (concatenated as-is).
    text_file : str
        Base name shared by the input and the output file.
    """
    # boilerplate markers; defined once instead of once per input line
    not_needed = ('MANAGEMENT DISCUSSION SECTION',
                  'Wire: Bloomberg Transcripts',
                  'CallSource: ',
                  'For more event information and transcripts',
                  'Q&A',
                  'This transcript may not be 100 percent',
                  'Copyright (c)',
                  'END OF STORY')

    source_name = process_folder + text_file + "PreprocessToParagraphs.txt"
    target_name = process_folder + text_file + 'PreprocessToSentences.txt'

    with open(source_name) as f, open(target_name, "w") as f1:
        for line in f:
            is_boilerplate = any(x in line for x in not_needed)
            if is_boilerplate and ' Q&A' not in line:
                continue

            for s in splitParagraphIntoSentences(line):
                # compare by value: "is not ''" tested object identity, which
                # only works by accident and raises a SyntaxWarning
                if s != '':
                    # collapse repeated white space; a piece consisting only
                    # of white space still produces an empty output line,
                    # exactly as before
                    f1.write(' '.join(s.split()) + '\n')

## This method creates the json file for the earlier specified original voice mp3
#### with the help of aeneas the mp3 gets analyzed and aligned with the preprocessed sentences from the file

In [5]:
def createJSON(path, file, process_folder, text_file):
    """Run the aeneas ``execute_task`` tool to align an mp3 with its sentences.

    The tool is started as ``python -m aeneas.tools.execute_task`` in a
    subprocess and is given the audio file, the sentence file produced by the
    preprocessing step, the task configuration and the name of the JSON file
    it should write. The exit status of the subprocess is not inspected.

    Parameters
    ----------
    path : str
        Folder of the original mp3 file.
    file : str
        File name of the original mp3 file.
    process_folder : str
        Folder prefix of the sentence file and of the resulting JSON file.
    text_file : str
        Base name shared by the sentence file and the JSON file.
    """
    # original mp3 file
    audio_path = path + "/" + file
    # sentences for the mp3 file
    sentences_path = process_folder + text_file + "PreprocessToSentences.txt"
    # json file that will contain the sentences with their begin and end time
    json_path = process_folder + text_file + ".json"
    task_config = "task_language=eng|os_task_file_format=json|is_text_type=plain"

    command = ["python", "-m", "aeneas.tools.execute_task",
               audio_path, sentences_path, task_config, json_path]
    call(command)

## This method loops through the json file again and extracts the begin and end time of each sentence
- round to 2 decimals
## the filename contains the original filename and the created id in the json file to keep the files apart and prevent overwriting
- also a maximum difference between begin and end time of each sentence gets calculated for future purposes

- this script runs for quite a long time creating small mp3 files for each sentence

In [6]:
def createMP3withJSON(path, file, mp3_folder, process_folder, text_file):
    """Cut the original mp3 into one mp3 file per aligned sentence.

    Reads the 'fragments' list from ``process_folder + text_file + '.json'``
    and, for every fragment whose first text line is non-empty, exports the
    audio between the fragment's 'begin' and 'end' values (multiplied by 1000
    before slicing) to ``mp3_folder + text_file + '_' + <id> + '.mp3'``.
    Progress is written to stdout on a single, continuously overwritten line.

    Parameters
    ----------
    path : str
        Folder prefix of the original mp3 file (concatenated as-is).
    file : str
        File name of the original mp3 file.
    mp3_folder : str
        Folder prefix for the exported sentence files.
    process_folder : str
        Folder prefix of the JSON file.
    text_file : str
        Base name of the JSON file and prefix of the exported file names.
    """
    # read the alignment and close the file again before the long export loop
    with open(process_folder + text_file + ".json") as json_file:
        fragments = json.load(json_file)['fragments']

    # load the complete recording once; every sentence is sliced out of it
    audio = AudioSegment.from_mp3(path + file)

    for fragment in fragments:
        # skip fragments without text (also guards against an empty list)
        lines = fragment['lines']
        if not lines or not lines[0]:
            continue

        fragment_id = fragment['id']
        sys.stdout.write("\r" + file + " File ID: " + fragment_id)
        sys.stdout.flush()

        # begin and end time, scaled by 1000 for slicing
        timeBegin = float(fragment['begin']) * 1000
        timeEnd = float(fragment['end']) * 1000

        # cut the sentence out of the recording and store it as its own file
        sentence = audio[timeBegin:timeEnd]
        sentence.export(mp3_folder + text_file + "_" + fragment_id + ".mp3", format="mp3")
    print(" - Done")
        

# start the process 
## extract the mp3 file names from the defined path
- the path containing all the mp3 files
- get the filename from the path
- path contains the path to the audio files
- file contains the name of file
- replace_strings contains everything that the filename of the audio contains but the filename of the respective text file does not
- the two empty quotes before and after text_file can be filled in if the filename of the text file contains something the audio filename does not

In [7]:
# folder that contains the original mp3 files and their transcripts
path = 'BT/'
for files in os.listdir(path):
    # everything that is not an mp3 recording is ignored
    if not files.endswith(".mp3"):
        continue

    # keep only the file name itself
    file = os.path.basename(files)

    # name of the text file: audio name without extension, lower case,
    # white spaces removed
    text_file = os.path.splitext(file)[0]
    text_file = text_file.lower()
    text_file = text_file.replace(" ", "")

    # parts of the audio name that do not occur in the text file name
    # (remember to write them lower case); empty entries are ignored
    replace_strings = ["nokia","-","20"]
    for replace_string in replace_strings:
        if not replace_string:
            continue
        text_file = text_file.replace(replace_string, "")
    # fill in the quotes to add additional information to the name
    text_file = "" + text_file + ""

    # one subfolder per recording, with an "mp3" folder for the sentences;
    # both are created only if they do not exist yet
    process_folder = path + text_file + '/'
    mp3_folder = process_folder + "mp3/"
    for folder in (process_folder, mp3_folder):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # fill in the quotes if the transcript file name needs extra text
    text_file_name = "" + text_file + ""

    # step 1 and 2: preprocess the transcript of this recording
    processTextFileToParagraph(path, text_file_name, process_folder, text_file)
    processTextFileToSentences(process_folder, text_file)

    # step 3: align audio and sentences, result is stored as json
    createJSON(path, file, process_folder, text_file)

    # step 4: export one mp3 per sentence
    createMP3withJSON(path, file, mp3_folder, process_folder, text_file)
        

btq416call.mp3 File ID: f000443 - Done
btq416qna.mp3 File ID: f000448 - Done
