## libraries used throughout the program

In [8]:
import re
import os
from subprocess import call
import json
import pandas as pd
import math
import sys

## this method is used to split a paragraph into sentences
- this creates one entry per sentence in the json file
- the resulting sentences are later used to cut one clip per sentence out of the original mp3 file

In [9]:
# a sentence boundary is a single whitespace character directly after '.', '!' or '?'
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s')


def splitParagraphIntoSentences(paragraph):
    """Split one paragraph of transcript text into a list of sentences.

    Before splitting, the paragraph is normalised:

    - "U.S.", "Mr.", "Mrs." and "Ms." lose their periods so they are not
      mistaken for sentence ends,
    - "Operator:" is shortened to "O:",
    - a speaker tag of the form "<Q...>: " or "<A...>: " is collapsed to
      "Q: " or "A: ".

    The text is then split at every whitespace character that directly
    follows ".", "!" or "?".

    Parameters
    ----------
    paragraph : str
        The paragraph to split.

    Returns
    -------
    list of str
        The sentences in their original order. Entries may be empty strings,
        e.g. when the paragraph ends with a period followed by whitespace.
    """
    # drop the periods of common abbreviations and shorten the operator tag
    paragraph = re.sub(r'U\.S\.', 'US', paragraph)
    paragraph = re.sub(r'Mr\.', 'Mr', paragraph)
    paragraph = re.sub(r'Mrs\.', 'Mrs', paragraph)
    paragraph = re.sub(r'Ms\.', 'Ms', paragraph)
    paragraph = re.sub(r'Operator:', 'O:', paragraph)
    # collapse a question/answer speaker tag (Q&A section) to a bare "Q: " / "A: ".
    # The match is non-greedy so it stops at the end of the tag itself; a greedy
    # ".*" would run to the LAST ">: " in the paragraph and delete the text in between.
    paragraph = re.sub(r'<Q.*?>:\s', 'Q: ', paragraph)
    paragraph = re.sub(r'<A.*?>:\s', 'A: ', paragraph)
    # split the paragraph after every '.', '!' or '?' that is followed by a white space
    return _SENTENCE_BOUNDARY.split(paragraph)

## here the file names and paths are defined
- the original mp3 file name gets defined here and the path where the file can be found
- the path and file name are used later on again

In [10]:
# directory that holds the original mp3 file
path = 'AUDIO/soundFiles/public/mp3/Nokia/'
# base name of the original mp3 file (without extension)
file = 'Nokia Q1-2017'

# substrings that get stripped from the short name; remember to write them in
# lower case, because the name is already lower-cased when they are applied
replaceStrings = ["nokia","-","20"]

# short name: the lower-cased file name with every space removed ...
textFile = "".join(file.lower().split(" "))
# ... and with each non-empty entry of replaceStrings removed, in list order
for replaceString in replaceStrings:
    if not replaceString:
        continue
    textFile = textFile.replace(replaceString, "")
# add additional information in front in case something got replaced
textFile = "n" + textFile

# working sub-folder named after the short name, plus an mp3 folder inside it;
# both are created only if they do not exist yet
processFolder = path + textFile + '/'
mp3_folder = processFolder + "mp3/"
for folder in (processFolder, mp3_folder):
    if not os.path.exists(folder):
        os.makedirs(folder)

# name of the transcript text file; extra text could be added around it here
textFileName = textFile

## here the text gets preprocessed the first time and stored in a new file in a subfolder
- the original file has a new line after every 11th word
- with preprocessing these newlines are being removed
- this means that one whole conversation of one person gets stored as one paragraph in the new file
## stop the execution after here to make manual adjustments to the text

In [11]:
# read the transcript of the current mp3 file and write the preprocessed text
# into a new file in the process folder
with open(path + textFileName + '.txt') as f, \
        open(processFolder + textFile + 'PreprocessToParagraphs.txt', "w") as f1:
    for line in f:
        # str.replace returns a new string, so the former bare
        # `line.replace(".", ". ")` never changed the line. Insert the missing
        # space only where a sentence end is glued to the next sentence
        # (lowercase letter, period, uppercase letter); a blanket replace would
        # also tear apart decimals such as "5.4" and the "U.S." abbreviation
        # that splitParagraphIntoSentences relies on finding intact.
        line = re.sub(r'(?<=[a-z])\.(?=[A-Z])', '. ', line)
        if line in ['\n', '\r\n']:
            # an empty line is written unchanged, so it ends the running paragraph
            f1.write(line)
        else:
            # a text line loses its newline and is appended to the running paragraph
            f1.write(line.rstrip('\n'))

## here the text gets processed a second time and stored again in a new file in the same folder
- this step is splitting the paragraph back to sentences
- the json file will include then one sentence per entry

In [12]:
# markers of transcript boilerplate; a line containing any of them is dropped
# (defined once instead of being rebuilt for every line)
notNeeded = ['MANAGEMENT DISCUSSION SECTION',
             'Wire: Bloomberg Transcripts',
             'CallSource: ',
             'For more event information and transcripts',
             'Q&A',
             'This transcript may not be 100 percent',
             'Copyright (c)',
             'END OF STORY']

# read the just created paragraph file and write the sentence file next to it
with open(processFolder + textFile + "PreprocessToParagraphs.txt") as f, \
        open(processFolder + textFile + 'PreprocessToSentences.txt', "w") as f1:
    for line in f:
        # keep the line if it holds no boilerplate marker, or if it contains
        # ' Q&A' (with a leading space) despite 'Q&A' being a marker
        if not any(x in line for x in notNeeded) or ' Q&A' in line:
            # split paragraph into sentences
            sentences = splitParagraphIntoSentences(line)
            for s in sentences:
                # compare by value: `is not ''` tests object identity, which only
                # works by accident of string interning and raises a
                # SyntaxWarning on newer Python versions
                if s != '':
                    # collapse runs of white space and write one sentence per line
                    f1.write(' '.join(s.split()) + '\n')

## here the json file gets created for the earlier specified original voice mp3
#### with the help of aeneas the mp3 gets analyzed and aligned with the preprocessed sentences from the file

In [13]:
# pieces of the aeneas command line
aeneas_audio = path + file + ".mp3"  # original mp3 file
aeneas_text = processFolder + textFile + "PreprocessToSentences.txt"  # sentences for the mp3 file
aeneas_config = "task_language=eng|os_task_file_format=json|is_text_type=plain"
aeneas_json = processFolder + textFile + ".json"  # json file containing the sentences and begin and end time

# run the aeneas execute_task tool as a module; the call stays the last
# expression of the cell so its return code is displayed as the cell output
call(["python", "-m", "aeneas.tools.execute_task",
      aeneas_audio, aeneas_text, aeneas_config, aeneas_json])

0

## loop through the json file again and extract the begin and end time of each sentence
- the duration (end time minus begin time) is rounded up to 2 decimals
## the filename contains the original filename and the created id in the json file to keep the files apart and prevent overwriting
- also a maximum difference between begin and end time of each sentence gets calculated for future purposes

- this script runs for quite a long time creating small mp3 files for each sentence

In [14]:
# load the json file created in the aeneas step
with open(processFolder + textFile + ".json") as json_file:
    json_object = json.load(json_file)

# transform the parsed json to a dataframe; the loop below reads its
# 'fragments' column, one entry per sentence
df = pd.DataFrame.from_dict(json_object, orient='columns')

# longest clip duration seen so far
maxDiff = 0
for sentence in df['fragments']:
    # duration between begin and end time, rounded UP to 2 decimals
    diff = math.ceil((float(sentence['end']) - float(sentence['begin'])) * 100) / 100
    maxDiff = max(maxDiff, diff)
    # fragments whose first text line is empty get no mp3 file
    if not sentence['lines'][0]:
        continue
    # show the id currently being processed, overwriting the previous one
    sys.stdout.write("\rFile ID: " + sentence['id'])
    sys.stdout.flush()
    # cut the sentence out of the original mp3; the output file name carries
    # the fragment id so the clips do not overwrite each other
    ffmpeg_command = [
        "ffmpeg",
        "-ss", str(sentence['begin']),  # begin time
        "-t", str(diff),  # duration of the clip
        "-i", path + file + ".mp3",  # original mp3 file
        mp3_folder + textFile + sentence['id'] + ".mp3",  # mp3 file for sentence
    ]
    call(ffmpeg_command)
print(" - Done")
        

File ID: f000409 - Done
