## Import all the required packages

In [1]:
import os
import csv
import re
import shutil
from collections import defaultdict
from collections import OrderedDict
import spacy
nlp = spacy.load('en_core_web_sm')

## Set up the directories

In [2]:
# Base directory for every input and output path below: the current working
# directory at the time this statement runs.
AMI_dir = os.getcwd()

def ensure_dir(file_path):
    """Create the parent directory of ``file_path`` if it does not exist yet.

    Only the directory part of ``file_path`` (as returned by
    ``os.path.dirname``) is created, so a directory is created for itself
    only when the path ends with a separator.

    Args:
        file_path: Path to a file, or a directory path ending in a separator.
    """
    directory = os.path.dirname(file_path)
    # A bare file name has no directory part, and os.makedirs('') would raise.
    if directory and not os.path.exists(directory):
        # exist_ok covers the case where the directory appears between the
        # existence check and the creation.
        os.makedirs(directory, exist_ok=True)

def save_file(text, direc, filename):
    """Write ``text`` to the file at ``direc + filename``.

    ``ensure_dir(direc)`` is called first. ``direc`` is concatenated directly
    with ``filename``, so it has to end with a path separator.

    Args:
        text: String content to write.
        direc: Destination directory, including the trailing separator.
        filename: Name of the file inside ``direc``.
    """
    ensure_dir(direc)
    # The context manager closes the handle even when write() raises.
    with open(direc + filename, 'w') as out:
        out.write(text)
    
# Corpus input folder and the two output folders, all located under AMI_dir.
# os.path.join with a trailing '' keeps the trailing separator that the rest of
# the file relies on (paths are built by plain concatenation) while using the
# separator of the current platform instead of a hard-coded backslash.
ami_folder = os.path.join(AMI_dir, 'ami_public_manual_1.6.2', '')
dialog_acts = os.path.join(AMI_dir, 'DialogActFiles', '')
decisions = os.path.join(AMI_dir, 'DecisionFiles', '')

# Both output folders are created up front, before anything is written.
ensure_dir(dialog_acts)
ensure_dir(decisions)

## Extract all the sentences along with their corresponding Dialogue Acts (DAs)

### Each meeting transcript will be stored with each line representing a Dialogue Act. 

### Format of each line will be: Speaker    Start Time    Sentence    DA_Class

In [3]:
def load_csv():
    '''
    Read every dialogue-act file of the AMI corpus and write one text file
    per meeting into the dialog_acts folder.

    Files in the "dialogueActs" folder whose name ends with 'act.xml' are
    grouped by meeting (the part of the file name before the first '.').
    For each meeting the words and dialogue acts of every speaker file are
    loaded, turned into rows by create_csv(), sorted by the numeric value of
    the second field and written as '<meeting>.dialog-act.txt', one row per
    line with the fields joined by single spaces.

    Prints the number of meetings that were processed.
    '''
    da_dir = os.path.join(ami_folder, "dialogueActs")
    # os.listdir() returns names in arbitrary order; sorting keeps the order
    # of rows with equal start times reproducible between runs.
    dialogue_act_files = sorted(
        name for name in os.listdir(da_dir) if name.endswith('act.xml')
    )

    # meeting id -> list of per-speaker file prefixes (text before 'dialog')
    group_speaker_files = defaultdict(list)
    for name in dialogue_act_files:
        meeting = name.split('.')[0]
        group_speaker_files[meeting].append(name.split('dialog')[0])

    for meeting, speaker_prefixes in group_speaker_files.items():
        # A fresh dict per meeting so rows never leak from one meeting to the next.
        dialogs = {}
        for prefix in speaker_prefixes:
            dialogs[prefix] = OrderedDict()
            load_words(dialogs, prefix)
            load_dialog_acts(dialogs, prefix)

        rows = create_csv(dialogs)

        out_path = os.path.join(dialog_acts, meeting + '.dialog-act.txt')
        with open(out_path, mode='w+') as out:
            for row in sorted(rows, key=lambda r: float(r[1])):
                out.write(' '.join(row) + '\n')

    print("Number of DA annotated meetings: {}".format(len(group_speaker_files)))
        
# Filler words that are dropped from the dialogue-act transcripts.
_DA_DISFLUENCIES = frozenset([
    "um", "uhm", "mm", "mmm", "ehm", "mmh", "uhmm", "ah", "hmm", "hm-hmm",
    "mm-hmm", "uh", "blah", "bah", "ye",
    "Um", "Uhm", "Mm", "Mmm", "Ehm", "Mmh", "Uhmm", "Ah", "Hmm", "Hm-hmm",
    "Mm-hmm", "Uh", "Blah", "Bah", "Ye",
])

# Substring rewrites, applied strictly in this order (later rules see the
# output of earlier ones, so the order is part of the behaviour).
_DA_REPLACEMENTS = (
    ("'m", " am"),
    ("'ve", " have"),
    ("'s", " is"),
    ("'S", "Is"),
    ("'re", " are"),
    ("'ll", " will"),
    ("'d", " would"),
    ("n't", " not"),
    ("n'", " not"),
    ("'em", "them"),
    ("'tis", "it is"),
    ("it'", "it is"),
    ("'Kay", "Okay"),
    ("'kay", "okay"),
    ("'cause", "because"),
    ("'Cause", "Because"),
    ("'cau", "cause"),
    ("'Cau", "Cause"),
    ("'til", "until"),
    ("y'all", "you all"),
    ("ya'", "you"),
    ("'bout", "about"),
    ("'Bout", "About"),
    ("'Fraid", "Afraid"),
    ("'fraid", "afraid"),
    ("'round", "around"),
    ("gonna", "going to"),
    ("wanna", "want to"),
    ("kinda", "kind of"),
    ("_", ""),
    ("..", "."),
)


def _clean_da_word(word_value):
    """Expand contractions and remove '_', '..' and '-' noise from one word."""
    # The "an'" -> "and" rule used to sit after the "n'" -> " not" rule and
    # therefore never fired ("an'" became "a not"). It is applied here as an
    # exact-token match so that no other token changes.
    if word_value == "an'":
        return "and"
    for old, new in _DA_REPLACEMENTS:
        word_value = word_value.replace(old, new)
    # A hyphen becomes a space in words longer than three characters and is
    # simply removed from shorter ones.
    if len(word_value) > 3:
        return word_value.replace("-", " ")
    return word_value.replace("-", "")


def load_words(dialogs, dialog_name):
    '''
    Read the word-by-word transcript "<dialog_name>words.xml" from the
    corpus "words" folder and fill dialogs[dialog_name].

    Only lines containing both "<w" and "starttime" are used. For every such
    word that is not a disfluency, dialogs[dialog_name][word_id] is set to
    [speaker, starttime, cleaned_word], where word_id is the last
    '.'-separated part of the id attribute and speaker is the second
    '.'-separated part of the text following 'id="'. starttime is the start
    time of the first word line seen since the previous '.' word (or since
    the beginning of the file), so all words up to the next '.' share it.

    NOTE(review): a second function named load_words with a different
    signature is defined further down in this file; once that definition has
    been executed, load_csv() can no longer reach this one.
    '''
    sentence_start = True  # True until the first word after a '.' is seen
    words_path = os.path.join(ami_folder, "words", dialog_name + "words.xml")

    with open(words_path) as wfile:
        for line in wfile:
            # Skipping non-word lines here also keeps the '.' check below from
            # running on an unset or stale word_value.
            if "<w" not in line or 'starttime' not in line:
                continue

            after_id = line.split("id=\"")[1]
            word_id = after_id.split("\"")[0].split(".")[-1]
            speaker = after_id.split('.')[1]
            word_value = line.split(">")[1].split("<")[0]
            word_value = word_value.strip().replace("&#39;", "'")

            if sentence_start:
                starttime = line.split("starttime=\"")[1].split("\"")[0]
                sentence_start = False

            if word_value not in _DA_DISFLUENCIES:
                word_value = _clean_da_word(word_value)
                dialogs[dialog_name][word_id] = [speaker, starttime, word_value]

            if word_value == '.':
                sentence_start = True
                                                 
def load_dialog_acts(dialogs, dialog_name):
    '''
    Read "<dialog_name>dialog-act.xml" from the corpus "dialogueActs" folder
    and append the dialogue-act type to every word it covers.

    A "<nite:pointer" line sets the current act type (the text inside
    'da-types.xml#id(...)'). A following "<nite:child" line names either one
    word id or a 'first..last' range; the current act type is appended to the
    entry of each of those words in dialogs[dialog_name]. Word ids without an
    entry (for example words dropped while loading) are skipped.
    '''
    act_path = os.path.join(ami_folder, "dialogueActs", dialog_name + "dialog-act.xml")

    with open(act_path) as actfile:
        dact = ""
        for line in actfile:
            if "<nite:pointer" in line:  # act definition
                dact = line.split("href=\"da-types.xml#id(")[1].split(")")[0]
            elif "<nite:child" in line:  # word list for this act
                bounds = line.split("#")[1].split("..")
                start_id = bounds[0].split("(")[1].split(")")[0].split("words")[1]
                try:
                    stop_id = bounds[1].split("(")[1].split(")")[0].split("words")[1]
                except IndexError:
                    # No '..' part: the act covers a single word.
                    stop_id = start_id

                for n in range(int(start_id), int(stop_id) + 1):
                    try:
                        dialogs[dialog_name]["words" + str(n)].append(dact)
                    except KeyError:
                        continue
    
def create_csv(dialogs):
    '''
    Turn the loaded words of one meeting into a list of rows
    (speaker, start_time, sentence, dialogue_act).

    Words are buffered per dialog until the dialogue act or the speaker
    changes, or the previous word was '.'. The buffered text is then split
    into sentences with the spaCy pipeline `nlp`, and every sentence that
    contains at least one ASCII letter becomes a row. The start time of a row
    is the start time stored on the last buffered word. Word entries that do
    not hold exactly four values (speaker, start time, word, dialogue act)
    are ignored.
    '''
    csv_corpus = []

    def flush(speaker, start_time, text, dialogue_act):
        # Split the buffered text into sentences and keep those with letters.
        for sent in nlp(text.strip()).sents:
            if re.search('[a-zA-Z]+', str(sent)):
                csv_corpus.append((speaker, start_time, str(sent), dialogue_act))

    for d in dialogs:
        sentence = ""
        currentDA = ""
        sentStartTime = 0
        endOfSent = ''
        currSpeaker = ''
        for word_id in dialogs[d]:
            try:
                speaker, starttime, word, DA = dialogs[d][word_id]
            except ValueError:
                continue
            boundary = DA != currentDA or speaker != currSpeaker or endOfSent == '.'
            if boundary and sentence != "":
                # Attribute the buffered text to the speaker who said it, not
                # to the speaker of the word that triggered the flush.
                flush(currSpeaker, sentStartTime, sentence, currentDA)
                sentence = ""
            endOfSent = word
            sentStartTime = starttime
            currentDA = DA
            currSpeaker = speaker
            sentence = sentence + " " + (word.strip())
        # Without this final flush the last segment of every dialog was dropped.
        if sentence != "":
            flush(currSpeaker, sentStartTime, sentence, currentDA)
    return csv_corpus

In [4]:
# Run the dialogue-act extraction defined above.
load_csv()

Number of DA annotated meetings: 139


### Extract all decision transcripts in format: Speaker    Start_Time    Word    Decision (0 - False / 1 - True)

In [5]:
def load_decisions():
    """Write one word-level decision file per decision-annotated meeting.

    The ids of all words referenced by the files in "decision/manual" are
    collected first. Then, for every decision file, all word files whose name
    starts with the meeting id (the part of the decision file name before the
    first '.') are tokenised by load_words(), and the labelled tokens are
    written by save_dec_file().

    Prints the number of decision files that were processed.
    """
    decision_files = os.listdir(os.path.join(ami_folder, "decision", "manual"))

    collected_ids = []
    load_decisions_words(decision_files, collected_ids)
    # Only membership is tested afterwards, once per token, so a set lookup
    # replaces a linear scan of the whole id list.
    decisions_words = frozenset(collected_ids)

    # Listed once instead of once per meeting; sorted so the order of tokens
    # with equal start times is reproducible.
    all_words_files = sorted(os.listdir(os.path.join(ami_folder, "words")))

    for decision_file in decision_files:
        words = []
        filename = decision_file.split('.')[0]
        for words_file in all_words_files:
            if words_file.startswith(filename):
                load_words(words_file, words, decisions_words)
        save_dec_file(decisions, filename, words)

    print("Number of Decision annotated meetings: {}".format(len(decision_files)))

def save_dec_file(decisions, filename, words):
    """Write ``words`` to ``decisions + filename + '.decision.txt'``.

    Rows are ordered by the numeric value of their second field and written
    one per line, with the fields of a row joined by single spaces.

    Args:
        decisions: Output directory prefix, concatenated as-is with ``filename``.
        filename: Base name of the output file, without the extension.
        words: Iterable of rows; each row is a sequence of strings.
    """
    target = decisions + filename + '.decision.txt'
    with open(target, mode='w+') as out:
        ordered_rows = list(words)
        ordered_rows.sort(key=lambda row: float(row[1]))
        out.writelines(' '.join(row) + '\n' for row in ordered_rows)
        
# Filler words that are dropped from the decision transcripts.
_DEC_DISFLUENCIES = frozenset([
    "um", "uhm", "mm", "mmm", "ehm", "mmh", "uhmm", "ah", "hmm", "hm-hmm",
    "mm-hmm", "uh", "blah", "bah", "ye",
    "Um", "Uhm", "Mm", "Mmm", "Ehm", "Mmh", "Uhmm", "Ah", "Hmm", "Hm-hmm",
    "Mm-hmm", "Uh", "Blah", "Bah", "Ye",
])

# Substring rewrites, applied strictly in this order (later rules see the
# output of earlier ones, so the order is part of the behaviour).
_DEC_REPLACEMENTS = (
    ("'m", " am"),
    ("'ve", " have"),
    ("'s", " is"),
    ("'S", "Is"),
    ("'re", " are"),
    ("'ll", " will"),
    ("'d", " would"),
    ("n't", " not"),
    ("n'", " not"),
    ("'em", "them"),
    ("'tis", "it is"),
    ("it'", "it is"),
    ("'Kay", "Okay"),
    ("'kay", "okay"),
    ("'cause", "because"),
    ("'Cause", "Because"),
    ("'cau", "cause"),
    ("'Cau", "Cause"),
    ("'til", "until"),
    ("y'all", "you all"),
    ("ya'", "you"),
    ("'bout", "about"),
    ("'Bout", "About"),
    ("'Fraid", "Afraid"),
    ("'fraid", "afraid"),
    ("'round", "around"),
    ("gonna", "going to"),
    ("wanna", "want to"),
    ("kinda", "kind of"),
    ("_", ""),
    ("..", "."),
)


def _clean_decision_word(word_value):
    """Expand contractions and remove '_', '..' and '-' noise from one word."""
    # The "an'" -> "and" rule used to sit after the "n'" -> " not" rule and
    # therefore never fired ("an'" became "a not"). It is applied here as an
    # exact-token match so that no other token changes.
    if word_value == "an'":
        return "and"
    for old, new in _DEC_REPLACEMENTS:
        word_value = word_value.replace(old, new)
    # A hyphen becomes a space in words longer than three characters and is
    # simply removed from shorter ones.
    if len(word_value) > 3:
        return word_value.replace("-", " ")
    return word_value.replace("-", "")


def load_words(file, words, decisions_words):
    """Append the labelled tokens of one word file to ``words``.

    Only lines containing both "<w" and "starttime" are used. Every word that
    is not a disfluency is cleaned, tokenised with the spaCy pipeline `nlp`,
    and each token is appended as (speaker, starttime, token, label). The
    label is '1' when the word's full id attribute is in ``decisions_words``
    and '0' otherwise.

    NOTE(review): this definition replaces the earlier two-argument
    load_words(dialogs, dialog_name) defined above in this file.

    Args:
        file: Name of a file inside the corpus "words" folder.
        words: List that receives the token tuples (modified in place).
        decisions_words: Container of word ids that belong to a decision;
            only membership (``in``) is used.
    """
    with open(os.path.join(ami_folder, "words", file)) as wfile:
        for line in wfile:
            if "<w" not in line or 'starttime' not in line:
                continue

            after_id = line.split("id=\"")[1]
            key = after_id.split("\"")[0]
            speaker = after_id.split('.')[1]
            word_value = line.split(">")[1].split("<")[0]
            word_value = word_value.strip().replace("&#39;", "'")
            starttime = line.split("starttime=\"")[1].split("\"")[0]

            if word_value in _DEC_DISFLUENCIES:
                continue

            word_value = _clean_decision_word(word_value)
            # The label depends only on the word id, so it is computed once
            # per word rather than once per token.
            label = '1' if key in decisions_words else '0'
            for token in nlp(word_value):
                words.append((speaker, starttime, str(token), label))
                                                      
def load_decisions_words(decision_files, decisions_words):
    """Collect the ids of all words referenced by the given decision files.

    Every "<nite:child" line names a word file (the href text up to
    "words.xml") and either one word id or a 'first..last' range. For each
    covered index the string '<href prefix>words<index>' is appended to
    ``decisions_words``.

    Args:
        decision_files: File names inside the corpus "decision/manual" folder.
        decisions_words: List that receives the word ids (modified in place).
    """
    for decision_name in decision_files:
        decision_path = os.path.join(ami_folder, "decision", "manual", decision_name)
        with open(decision_path) as actfile:
            for line in actfile:
                if "<nite:child" not in line:  # not a word list
                    continue

                file_prefix = line.split("href=\"")[1].split("words.xml")[0]
                bounds = line.split("#")[1].split("..")
                start_id = bounds[0].split("(")[1].split(")")[0].split("words")[1]
                try:
                    stop_id = bounds[1].split("(")[1].split(")")[0].split("words")[1]
                except IndexError:
                    # No '..' part: the reference covers a single word.
                    stop_id = start_id

                decisions_words.extend(
                    file_prefix + "words" + str(n)
                    for n in range(int(start_id), int(stop_id) + 1)
                )

In [6]:
# Run the decision extraction defined above.
load_decisions()

Number of Decision annotated meetings: 47


### Split the data files created above into Train and Test sets

In [7]:
# Destination folders for the train/test split and the two source folders.
# os.path.join uses the separator of the current platform instead of a
# hard-coded backslash.
decTrain = os.path.join(AMI_dir, 'Dec_Train')
decTest = os.path.join(AMI_dir, 'Dec_Test')
daTrain = os.path.join(AMI_dir, 'DA_Train')
daTest = os.path.join(AMI_dir, 'DA_Test')

srcDec = os.path.join(AMI_dir, 'DecisionFiles')
srcDa = os.path.join(AMI_dir, 'DialogActFiles')

# ensure_dir() creates the directory part of the path it is given, so each
# destination is passed with a trailing separator to create the folder itself.
ensure_dir(os.path.join(decTrain, ''))
ensure_dir(os.path.join(decTest, ''))
ensure_dir(os.path.join(daTrain, ''))
ensure_dir(os.path.join(daTest, ''))

In [8]:
# os.listdir() returns names in arbitrary order, so the listings are sorted to
# make the train/test membership reproducible between runs and machines.
# The slice bounds are fixed: the first 40 decision files go to training and
# the next 7 to testing; the first 124 dialogue-act files go to training and
# the next 15 to testing. Files beyond those bounds are not copied.
files = sorted(os.listdir(srcDec))
DecTrain = files[:40]
DecTest = files[40:47]

files = sorted(os.listdir(srcDa))
DaTrain = files[:124]
DaTest = files[124:139]

for names, src, dst in (
    (DecTrain, srcDec, decTrain),
    (DecTest, srcDec, decTest),
    (DaTrain, srcDa, daTrain),
    (DaTest, srcDa, daTest),
):
    for i in names:
        shutil.copy(os.path.join(src, i), dst)