# Requirements
- aeneas
- pydub

## libraries used throughout the program

In [1]:
import re
import os
from subprocess import call, Popen
import json
import pandas as pd
import sys
from pydub import AudioSegment
from pprint import pprint
import shlex
import psycopg2

## This method splits a paragraph into sentences
- this creates one entry per sentence in the JSON file
- each entry is then also used to extract that one sentence from the original mp3 file

In [13]:
def splitParagraphIntoSentences(paragraph):
    """Normalise a transcript paragraph and split it into sentences.

    Before splitting, abbreviations whose period would be mistaken for a
    sentence end are rewritten ("U.S." -> "US", "Mr." -> "Mr", ...),
    "Operator:" is shortened to "O:", "[ph]" markers and parenthesised
    timestamps such as "(12:30:05)" are removed, and "<Q ...>: " /
    "<A ...>: " speaker tags are replaced by "Q: " / "A: ".

    :param paragraph: the paragraph text.
    :return: list of sentences; the text is split at every whitespace
        character that directly follows '.', '!' or '?'.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'U\.S\.', 'US'),
        (r'Mr\.', 'Mr'),
        (r'Mrs\.', 'Mrs'),
        (r'Ms\.', 'Ms'),
        (r'Operator:', 'O:'),
        (r'\[ph\]', ''),
        # A digit followed by digits/colons inside parentheses.  Matches the
        # same strings as the former r'\((\d+\:*)+\)' but has no nested
        # quantifier, so an unclosed "(1234..." cannot trigger exponential
        # backtracking.
        (r'\(\d[\d:]*\)', ''),
        # Non-greedy, so each tag ends at its own ">: " rather than at the
        # last one found on the line.
        (r'<Q.*?>:\s', 'Q: '),
        (r'<A.*?>:\s', 'A: '),
    )
    for pattern, replacement in substitutions:
        paragraph = re.sub(pattern, replacement, paragraph)

    # Split after every sentence-ending character, but only when it is
    # followed by whitespace (so "3.5" or "US" stay intact).
    return re.split(r'(?<=[.!?])\s', paragraph)

## Here the text is preprocessed a first time and stored in a new file in a subfolder
- the original file has a newline after every 11th word
- the preprocessing removes these newlines
- this means that everything one person says in one turn is stored as one paragraph in the new file

## The text is then processed a second time and stored in another new file in the same folder
- this step splits each paragraph into sentences
- the JSON file will then contain one sentence per entry

In [14]:
def processTextFile(path, process_folder, text_file):
    """Turn a raw transcript into a one-sentence-per-line text file.

    Two files are written into ``process_folder``:

    * ``<text_file>PreprocessToParagraphs.txt`` - consecutive non-empty
      lines of ``path + text_file + '.txt'`` are joined into one line (each
      original line is prefixed with a single space); empty lines are kept
      and therefore terminate a paragraph.
    * ``<text_file>PreprocessToSentences.txt`` - every kept paragraph split
      into sentences, one per line, with runs of whitespace collapsed.

    :param path: folder of the raw ``.txt`` file (concatenated as-is, so it
        has to end with a path separator).
    :param process_folder: output folder (also concatenated as-is).
    :param text_file: base name of the transcript without extension.
    """
    # Paragraphs containing any of these markers are transcript boilerplate
    # and are dropped.
    not_needed = ('MANAGEMENT DISCUSSION SECTION',
                  'Wire: Bloomberg Transcripts',
                  'CallSource: ',
                  'Event Date: ',
                  'Event Description: ',
                  'Source: ',
                  'For more event information and transcripts',
                  'Q&A',
                  'This transcript may not be 100 percent',
                  'Copyright (c)',
                  'END OF STORY')

    source_path = path + text_file + '.txt'
    paragraphs_path = process_folder + text_file + 'PreprocessToParagraphs.txt'
    sentences_path = process_folder + text_file + 'PreprocessToSentences.txt'

    # First pass: undo the hard line wrapping of the original transcript.
    # NOTE: the former ``line.replace(".", ". ")`` discarded its result and
    # never had any effect; it was removed rather than "activated" because
    # that would break abbreviations ("U.S.") and decimals ("3.5").
    with open(source_path) as f, open(paragraphs_path, "w") as f1:
        for line in f:
            if line not in ('\n', '\r\n'):
                # only remove the newline if the line is not empty
                f1.write(' ' + line.rstrip('\n'))
            else:
                f1.write(line)

    # Second pass: drop boilerplate paragraphs and split the rest.
    with open(paragraphs_path) as f, open(sentences_path, "w") as f1:
        for line in f:
            # A paragraph containing ' Q&A' is always kept, even if it also
            # contains one of the markers above.
            if not any(x in line for x in not_needed) or ' Q&A' in line:
                for s in splitParagraphIntoSentences(line):
                    # compare by value (truthiness), not identity with ''
                    if s:
                        # remove double white spaces from the text
                        f1.write(' '.join(s.split()) + '\n')

## This method creates the json file for the earlier specified original voice mp3
#### with the help of aeneas the mp3 gets analyzed and aligned with the preprocessed sentences from the file

In [16]:
def createJSON(path, file, process_folder, text_file):
    """Align an mp3 with its sentence file using aeneas.

    Runs ``python -m aeneas.tools.execute_task`` as a child process and lets
    it write ``process_folder + text_file + '.json'``.

    :param path: folder containing the original mp3 file.
    :param file: file name of the mp3.
    :param process_folder: folder holding the sentence file and receiving
        the JSON result (concatenated as-is).
    :param text_file: base name shared by the sentence file and the JSON.
    :return: exit status of the aeneas process (0 on success).  Earlier
        versions returned ``None``; callers that ignore the result are
        unaffected.
    """
    command = ["python",
               "-m",
               "aeneas.tools.execute_task",
               path + "/" + file,  # original mp3 file
               process_folder + text_file + "PreprocessToSentences.txt",  # sentences for the mp3 file
               "task_language=eng|os_task_file_format=json|is_text_type=plain",
               process_folder + text_file + ".json"]  # json file containing the sentences and begin and end time
    status = call(command)
    if status != 0:
        # Do not fail silently: without the JSON the following steps break.
        sys.stderr.write("aeneas exited with status " + str(status) + " for " + file + "\n")
    return status

## This method loops through the JSON file again and extracts the begin and end time of each sentence
- round to 2 decimals

## the filename contains the original filename and the created id in the json file to keep the files apart and prevent overwriting
- also a maximum difference between begin and end time of each sentence gets calculated for future purposes

- this script runs for quite a long time creating small mp3 files for each sentence

In [17]:
def createMP3withJSON(path, file, mp3_folder, process_folder, text_file):
    """Cut one mp3 per aligned sentence out of the original recording.

    Reads ``process_folder + text_file + '.json'`` and, for every entry of
    its ``fragments`` list whose first text line is non-empty, exports the
    audio between the entry's ``begin`` and ``end`` (seconds) to
    ``mp3_folder + text_file + '_' + <fragment id> + '.mp3'``.

    :param path: folder of the original mp3 (concatenated as-is).
    :param file: file name of the original mp3.
    :param mp3_folder: output folder for the sentence mp3 files.
    :param process_folder: folder containing the alignment JSON.
    :param text_file: base name of the JSON and prefix of the output files.
    """
    # The fragment list is all that is needed from the JSON file.
    with open(process_folder + text_file + ".json") as json_file:
        fragments = json.load(json_file)['fragments']

    # Load the recording once; named ``audio`` so that it does not shadow
    # the ``call`` function imported from subprocess.
    audio = AudioSegment.from_mp3(path + file)

    for fragment in fragments:
        lines = fragment['lines']
        # skip entries that carry no text
        if not lines or not lines[0]:
            continue
        sys.stdout.write("\r" + file + " File ID: " + fragment['id'])
        sys.stdout.flush()
        # begin and end are given in seconds, slicing works in milliseconds
        timeBegin = float(fragment['begin']) * 1000
        timeEnd = float(fragment['end']) * 1000
        # export this segment to a new file
        sentence = audio[timeBegin:timeEnd]
        sentence.export(mp3_folder + text_file + "_" + fragment['id'] + ".mp3", format="mp3")
    print(" - Done")
        

### This method creates the small fixed-length splits (3 seconds here, or any other interval) of the big file

In [2]:
def splitAudio(path, file, filename, split_folder, split_interval):
    """Cut an mp3 into consecutive pieces of ``split_interval`` seconds.

    The pieces are written to
    ``split_folder + filename + '_' + <n> + '.mp3'`` with ``n`` counting
    from 1; the last piece is whatever audio remains.

    :param path: folder of the original mp3 (concatenated as-is).
    :param file: file name of the original mp3.
    :param filename: prefix for the exported pieces.
    :param split_folder: output folder (concatenated as-is).
    :param split_interval: length of one piece in seconds; must be > 0.
    :raises ValueError: if ``split_interval`` is not positive (the loop
        below would otherwise never terminate).
    """
    if split_interval <= 0:
        raise ValueError("split_interval must be positive, got " + str(split_interval))

    # length of one piece in milliseconds
    step_ms = split_interval * 1000
    # load the recording; named ``audio`` so that it does not shadow the
    # ``call`` function imported from subprocess
    audio = AudioSegment.from_mp3(path + file)
    total_ms = len(audio)

    timeBegin = 0.0
    iteration = 1
    # as long as the begin time is smaller than the length of the file
    while timeBegin < total_ms:
        sys.stdout.write("\r" + str(split_interval) + " Seconds " + file + " Iteration: " + str(iteration))
        sys.stdout.flush()
        # create a little segment of the file and export it to a new file
        segment = audio[timeBegin:timeBegin + step_ms]
        segment.export(split_folder + filename + "_" + str(iteration) + ".mp3", format="mp3")
        # move on to the next piece
        timeBegin += step_ms
        iteration += 1
    print(" - Done")

### This method will add the question ID to the json file

In [None]:
def addQuestionIDToJson(path, file):
    """Tag every fragment of a JSON file with a hierarchical question id.

    Loads ``path + file + '.json'``, walks its ``fragments`` and, for every
    fragment whose first text line is non-empty, stores
    ``<company>.<yearQuartile>.<part>.<question>.<answer>`` under
    ``question_id``.  "O: " starts a new part, "Q: " a new question and
    "A: " a new answer.  The result is handed to ``updateJSON`` for
    ``data_sentiment.json``.
    """
    with open(path + file + ".json") as json_file:
        data = json.load(json_file)

    part_no = 0
    question_no = 0
    answer_no = 0
    for fragment in data['fragments']:
        text = fragment['lines'][0]
        # fragments without text get no id
        if not text:
            continue

        sys.stdout.write("\r" + "Add Question ID " + fragment['sentenceMP3'] + " File ID: " + fragment['id'])
        sys.stdout.flush()

        # an operator line opens a new Q&A part and resets both counters
        if "O: " in text:
            part_no, question_no, answer_no = part_no + 1, 0, 0
        # a question resets the answer counter
        if "Q: " in text:
            question_no, answer_no = question_no + 1, 0
        if "A: " in text:
            answer_no += 1

        counters = str(part_no) + "." + str(question_no) + "." + str(answer_no)
        fragment["question_id"] = fragment['company'] + "." + fragment['yearQuartile'] + "." + counters

    updateJSON(data, "data_sentiment.json")

### Sentiment analysis on the sentences

In [None]:
def RateSentiment(sentiString,
                  jar_path="C:/Python27/SentiStrengthCom.jar",
                  data_path="C:/Python27/sentistrength/"):
    """Return the SentiStrength rating of a string.

    The text is piped to ``java -jar <jar_path> stdin sentidata <data_path>``
    with every space replaced by '+'.

    :param sentiString: the text to rate.
    :param jar_path: location of the SentiStrength jar (defaults to the
        previously hard-coded path).
    :param data_path: location of the SentiStrength data folder (defaults to
        the previously hard-coded path).
    :return: the tool's output with surrounding whitespace stripped and the
        tab between the two ratings removed, e.g. "1\\t-5" becomes "1-5".
    """
    # The file only imports ``call`` and ``Popen`` from subprocess, so the
    # module name itself has to be imported here.
    import subprocess

    # Passing a list avoids shell-style splitting problems with paths that
    # contain spaces.
    command = ["java", "-jar", jar_path, "stdin", "sentidata", data_path]
    p = subprocess.Popen(command,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

    # communicate via stdin the string to be rated; all spaces become '+'
    payload = sentiString.replace(" ", "+")
    if not isinstance(payload, bytes):
        # pipes carry bytes (Python 3 str / Python 2 unicode need encoding)
        payload = payload.encode("utf-8")
    stdout_text, stderr_text = p.communicate(payload)
    if not isinstance(stdout_text, str):
        stdout_text = stdout_text.decode("utf-8")

    # remove the tab spacing between the positive and negative ratings
    return stdout_text.rstrip().replace("\t", "")

In [None]:
def getSentencesFromJSON(jsonFilePath, company):
    """Rate every sentence of an alignment JSON and store the result.

    Each entry of ``fragments`` receives the keys ``positive_sentiment``,
    ``negative_sentiment``, ``sentenceMP3`` (the JSON path), ``company``,
    ``year``, ``quartile``, ``yearQuartile`` and ``question_id``; the
    updated data is then passed to ``updateJSON`` for
    ``data_sentiment.json``.

    Quarter and year are taken from the part of ``jsonFilePath`` between
    ``company + 'q'`` and ``'qna'`` (or ``'call'``): the first character is
    the quarter, the rest the two-digit year, e.g. ``...q317call...``
    yields quartile "3" and year "2017".

    :param jsonFilePath: path of the JSON file to process.
    :param company: company code that precedes the 'q' in the file name.
    :raises ValueError: if the path does not contain the expected markers.
    """
    with open(jsonFilePath) as json_file:
        data = json.load(json_file)
    fragments = data["fragments"]
    if fragments:
        pprint(fragments[0])

    # The period is the same for every fragment, so parse it once and fail
    # early (before any sentiment call) if the path does not match.
    prefix = company + 'q'
    end_marker = 'qna' if 'qna' in jsonFilePath else 'call'
    start = jsonFilePath.find(prefix)
    end = jsonFilePath.find(end_marker)
    if start == -1 or end == -1:
        raise ValueError("cannot derive year and quarter from " + jsonFilePath +
                         " for company " + company)
    # len(prefix) instead of a fixed 3 so that company codes of any length
    # work, not only two-letter ones.
    period = jsonFilePath[start + len(prefix):end]
    quartile = period[0]
    year = '20' + period[1:]

    QnAPartCounter = 0
    QuestionCounter = 0
    AnswerCounter = 0
    for v in fragments:
        sentence = v['lines'][0]
        pos, neg = RateSentiment(sentence).split('-')
        print(sentence + "," + pos + "," + neg)
        v['positive_sentiment'] = pos
        v['negative_sentiment'] = neg
        v['sentenceMP3'] = jsonFilePath
        v['company'] = company
        v['year'] = year
        v['quartile'] = quartile
        v['yearQuartile'] = year + quartile
        # an operator line opens a new Q&A part and resets both counters
        if "O: " in sentence:
            QnAPartCounter += 1
            QuestionCounter = 0
            AnswerCounter = 0
        # a question resets the answer counter
        if "Q: " in sentence:
            QuestionCounter += 1
            AnswerCounter = 0
        if "A: " in sentence:
            AnswerCounter += 1
        v["question_id"] = company + "." + year + quartile + "." + str(QnAPartCounter) + "." + str(QuestionCounter) + "." + str(AnswerCounter)
    updateJSON(data, "data_sentiment.json")

In [None]:
def updateJSON(newdata, outFilePath):
    """Merge ``newdata`` into the JSON file at ``outFilePath``.

    If the file does not exist yet, or holds no fragments, ``newdata`` is
    written as-is.  Otherwise the fragments of ``newdata`` are appended to
    the fragments already stored and the combined data is written back.

    :param newdata: dict with a ``fragments`` list.
    :param outFilePath: path of the JSON file to create or extend.
    """
    existing = None
    # A missing file is treated like an empty one instead of crashing.
    if os.path.exists(outFilePath):
        with open(outFilePath) as infile:
            existing = json.load(infile)

    if not existing or not existing.get("fragments"):
        merged = newdata
        message = "\n\n\ndata file created: "
    else:
        existing["fragments"].extend(newdata["fragments"])
        merged = existing
        message = "\n\n\n==================data file updated: "

    with open(outFilePath, 'w') as outfile:
        json.dump(merged, outfile, sort_keys=True, indent=4)
    print(message + outFilePath)

In [None]:
def pywalkerMP3(path, company):
    """Run ``getSentencesFromJSON`` on every JSON file below ``path``.

    :param path: root folder; all of its subfolders are visited as well.
    :param company: company code passed on to ``getSentencesFromJSON``.
    """
    for root, dirs, files in os.walk(path):
        for file_ in files:
            # Match the extension only, so that names merely containing
            # ".json" (e.g. "x.json.bak") are not picked up.
            if file_.endswith('.json'):
                filename = os.path.join(root, file_)
                print(filename)
                getSentencesFromJSON(filename, company)

# start the process 
## extract the mp3 file names from the defined path
- the path containing all the mp3 files
- get the filename from the path
- path contains the path to the audio files
- file contains the name of file
- replace_strings contains everything that the file name of the audio contains but the file name of the respective text file does not
- the two quoted strings before and after text_file can be used in case the file name of the text file contains something the audio file name does not

In [18]:
# path to the original mp3 files; it has to end with '/' because the helper
# functions build file names by plain string concatenation
path = 'Nokia/process/'
# declare the size of the split (in seconds)
seconds_split = 3.0
# everything the audio file name contains but the text file name does not;
# remember to write things lower case here, the name is lower-cased first
# NOTE(review): removing "20" would also mangle a year such as 2020.
replace_strings = ["nokia", "-", "20"]
# set once at least one mp3 has been processed
processed_any = False

for files in os.listdir(path):
    if not files.endswith(".mp3"):
        continue

    # get the plain file name
    file = os.path.basename(files)
    # by default remove white spaces from the text file name
    text_file = os.path.splitext(file)[0].lower().replace(" ", "")
    for replace_string in replace_strings:
        # only remove other things if stated
        if replace_string:
            text_file = text_file.replace(replace_string, "")
    # add additional information in case something got replaced
    text_file = "n" + text_file + ""

    # create a subfolder for the original mp3 file containing the filename
    # if the folder does not yet exist
    process_folder = path + text_file + '/'
    mp3_folder = process_folder + "mp3/"
    if not os.path.exists(process_folder):
        os.makedirs(process_folder)
    if not os.path.exists(mp3_folder):
        os.makedirs(mp3_folder)

    # create the split folder for the small files; this used to reference
    # an undefined name ``filename`` and raised a NameError
    split_folder = process_folder + str(seconds_split) + "secSplit/"
    if not os.path.exists(split_folder):
        os.makedirs(split_folder)

    # call the preprocessing for each mp3 file
    processTextFile(path, process_folder, text_file)

    # create the json for this mp3 file
    createJSON(path, file, process_folder, text_file)

    # split the mp3 into sentences
    createMP3withJSON(path, file, mp3_folder, process_folder, text_file)

    # call the function to split the audio files in 3 sec parts
    splitAudio(path, file, text_file, split_folder, seconds_split)

    processed_any = True

# Run the sentiment pass once, after all files are done.  Inside the loop it
# walked the whole folder again for every mp3 and appended the same
# fragments to the data file over and over.
# NOTE(review): the company code 'BT' does not match the "n..." file names
# produced above - confirm the intended code.
if processed_any:
    pywalkerMP3(path, 'BT')
        

Nokia Q3-2017call.mp3 File ID: f000240 - Done
Nokia Q2-2012qna.mp3 File ID: f000285 - Done
Nokia Q1-2012call.mp3 File ID: f000218 - Done
Nokia Q3-2011qna.mp3 File ID: f000324 - Done
Nokia Q4-2014qna.mp3 File ID: f000336 - Done
Nokia Q1-2013call.mp3 File ID: f000173 - Done
Nokia Q2-2017call.mp3 File ID: f000181 - Done
Nokia Q1-2014qna.mp3 File ID: f000257 - Done
Nokia Q3-2013qna.mp3 File ID: f000322 - Done
Nokia Q3-2017qna.mp3 File ID: f000341 - Done
Nokia Q2-2015call.mp3 File ID: f000221 - Done
Nokia Q1-2011call.mp3 File ID: f000225 - Done
Nokia Q4-2012qna.mp3 File ID: f000320 - Done
Nokia Q2-2014qna.mp3 File ID: f000330 - Done
Nokia Q4-2014call.mp3 File ID: f000211 - Done
Nokia Q3-2014call.mp3 File ID: f000173 - Done
Nokia Q3-2015call.mp3 File ID: f000186 - Done
Nokia Q4-2015call.mp3 File ID: f000189 - Done
Nokia Q2-2014call.mp3 File ID: f000206 - Done
Nokia Q1-2012qna.mp3 File ID: f000293 - Done
Nokia Q3-2015qna.mp3 File ID: f000412 - Done
Nokia Q1-2017qna.mp3 File ID: f000224 - Done