# ELAN Cleaner
##### Reads the annotations from all ELAN files in a folder and applies some user-defined cleaning rules

In [1]:
!pip install pympi-ling



In [1]:
import os
import re
import csv
from pympi import Eaf
import pandas as pd
import unicodedata, itertools, sys
import xml.etree.ElementTree as ET


# Sydney Speaks data (the active configuration)
# Folder that handle_file() and find_in_file() read the original .eaf files from
input_folder = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\SydneySpeaks\\Elan_orig"
# Folder that handle_file() writes the cleaned .eaf files to
output_folder = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\SydneySpeaks\\Elan_clean"
# Not referenced by the cells shown in this notebook
statsFile = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\wordCount_SydneySpeaks.csv"
# JSON file that receives the sorted list of characters collected in letterDict
cleanerFile = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\cleaner_SydneySpeaks.json"
# The string literal below only parks alternative configurations: as a bare
# string expression it assigns nothing. Move a group of assignments out of the
# string to switch to another data set.
"""
# Accented Australian English data
input_folder = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\Accented_English\\Elan_orig"
output_folder = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\Accented_English\\Elan_clean"
statsFile = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\wordCount_Accented_English.csv"
cleanerFile = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\cleaner_Accented_English.json"

# test data
input_folder = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\test_ELAN"
output_folder = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\test_ELAN_clean"
statsFile = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\wordCount_SydneySpeaks.csv"
cleanerFile = "C:\\Users\\barth\\Documents\\LDACA\\data_cleaning\\data\\cleaner_test.json"
"""

# Maps each character to [occurrence count, " | "-separated example words];
# filled in by countLetters()
letterDict = {}


def clean_speaker(speaker):
    """Return the speaker tag of a tier without the "LaBB-CAT_" marker.

    Every occurrence of "LaBB-CAT_" is removed first; leading and trailing
    whitespace is stripped from the result afterwards.
    """
    return speaker.replace("LaBB-CAT_", "").strip()


def clean_text(text):
    """Return the content of one annotation with unwanted characters replaced.

    Surrounding whitespace is stripped first; the substitutions below are
    then applied one after the other, in the order listed.
    """
    # (unwanted sequence, replacement)
    substitutions = (
        ("‚Äô", "'"),   # looks like a mis-decoded right single quotation mark
        ("…", "..."),   # ellipsis character -> three dots
        ("’", "'"),     # right single quotation mark -> apostrophe
        ("‘", "'"),     # left single quotation mark -> apostrophe
        ("µ", ""),      # dropped entirely
    )

    cleaned = text.strip()
    for unwanted, replacement in substitutions:
        cleaned = cleaned.replace(unwanted, replacement)

    return cleaned


def countLetters(text):
    """Record every character of *text* in the module-level ``letterDict``.

    The text is split on whitespace. For each character of each word the
    entry ``letterDict[char]`` (a two-item list) is updated: item 0 is the
    running occurrence count, item 1 a string of example words in which each
    word is preceded by " | ", e.g.

        {'ɨ': [3, ' | [hə̝ɨps] | ((ˈmæl.bɨn)) | ((ˈmæəl.bɨn))']}

    A word is added as an example only while the example string is shorter
    than 80 characters and does not already contain the word as a substring.
    Empty or ``None`` text is ignored.
    """
    if not text:
        return

    for token in text.split():
        for char in token:
            entry = letterDict.setdefault(char, [0, ""])
            entry[0] += 1

            examples = entry[1]
            if len(examples) < 80 and token not in examples:
                entry[1] = examples + " | " + token

                        
                        
def handle_file(file):
    """Clean the annotations of one ELAN file and write the result.

    *file* is joined to the module-level ``input_folder`` to locate the XML
    document and to ``output_folder`` to decide where the cleaned copy goes.
    Only tiers whose TIER_ID starts with "INT:" or "PNT:" are processed. In
    those tiers every annotation value longer than one character has its
    whitespace collapsed, is passed through clean_text(), is counted by
    countLetters() and is stored back into the document.
    """
    source_path = os.path.join(input_folder, file)
    target_path = os.path.join(output_folder, file)

    print(source_path)

    tree = ET.parse(source_path)

    for tier in tree.getroot().findall(".//TIER"):
        # only the INT: and PNT: tiers are cleaned
        if not tier.attrib["TIER_ID"].startswith(("INT:", "PNT:")):
            continue

        for value_node in tier.findall(".//ANNOTATION_VALUE"):
            raw = value_node.text

            # skip missing values and values of at most one character
            if not raw or len(raw) <= 1:
                continue

            # collapse runs of whitespace, then remove unwanted characters
            cleaned = clean_text(" ".join(raw.split()))

            countLetters(cleaned)
            value_node.text = cleaned

    # Tier names are currently left untouched. To clean them as well:
    # for tier in tree.getroot().findall(".//TIER"):
    #     tier.attrib["TIER_ID"] = clean_speaker(tier.attrib["TIER_ID"])

    tree.write(target_path, encoding="utf-8", xml_declaration=True)
    
  
# Clean every ELAN (.eaf) file found below a folder.
# Despite its name, this function neither reads nor writes CSV files.
def read_and_save_csv(input_folder, output_folder):
    """Run handle_file() on every ``.eaf`` file below *input_folder*.

    Parameters:
        input_folder: folder that is searched, including its sub-folders.
        output_folder: folder for the cleaned files; created when missing.

    Files in sub-folders are handed to handle_file() with their path
    relative to *input_folder*, and the matching sub-folder is created below
    *output_folder* first. Passing only the bare file name (as before) made
    handle_file() look for nested files directly in the top-level folder.

    NOTE: handle_file() resolves the path it is given against the
    module-level ``input_folder`` / ``output_folder``, so this function
    should be called with those same values.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for root, _dirs, files in os.walk(input_folder):
        for file in files:
            if not file.endswith('.eaf'):
                continue

            # for files directly in input_folder this is just the file name
            rel_path = os.path.relpath(os.path.join(root, file), input_folder)

            # make sure the target sub-folder exists before the file is written
            target_dir = os.path.dirname(os.path.join(output_folder, rel_path))
            os.makedirs(target_dir, exist_ok=True)

            handle_file(rel_path)

            
# Clean every .eaf file of the configured input folder and write the results
# to the configured output folder
read_and_save_csv(input_folder, output_folder)

print (" +++ DONE +++")

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_032_Camila.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_063_Daisy.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_065_Bernice.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_073_Marjorie.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_075_Agnes.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_079_Gladys.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_087_Grace.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_102_Evelyn.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_104_Mable.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_117_Audrey.eaf
C:\Users\barth\Documents\LDACA\

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_029_SimonSSDS.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_030_George.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_804_Ben.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_822_Jack.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_824_Andrew.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_837_Damasus.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_846_Carlo.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_847_John.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_848_Franco.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_857_PatrickSSDS.eaf
C:\Users\barth\Documents\LDAC

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_104_Rhys.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_106_Larry.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_107_Reid.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_115_Nate.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_122_Nicholas.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_124_Craig.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_CYF_001_Sally.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_CYF_002_Karmen.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_CYF_003_Caroline.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_CYF_004_Charlotte.eaf
C:\Users\barth\Documents\LDACA\d

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYF_151_Roseta.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_068_Pietro.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_078_Pablo.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_079_Emilio.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_114_Samuel.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_129_Manuel.eaf
C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_137_Stefano.eaf
 +++ DONE +++


### Take a look at the content and save the cleaner file

In [2]:
import json

# All characters collected in letterDict, ordered case-insensitively
sortednames = sorted(letterDict, key=str.lower)

# One line per character: character, occurrence count, example words
for each in sortednames:
    print(f"{each}\t{letterDict[each][0]}\t{letterDict[each][1]}")

# Save the sorted characters as a JSON list (non-ASCII characters are escaped)
with open(cleanerFile, "w") as outfile:
    json.dump(sortednames, outfile, indent=4, ensure_ascii=True)
    

!	38	 | !business | !both | !that | !similar | !China's | !study | !wants | !to | !for
$	1	 | $80
'	3779	 | you've | that's | what's | That's | you're | it's | You're | It's | they're | I'm
(	58	 | (unclear) | (han) | (Unclear) | (Chenzhou) | (a) | (b) | (unclear)... | (parents
)	54	 | (unclear) | (han) | (Unclear) | (Chenzhou) | (a) | (b) | (unclear)... | often)...
+	2	 | MA15+
,	330	 | like, | Wow, | Mm, | Oh, | Yeah, | school, | discount, | off, | oh, | So, | Chinese,
-	411	 | - | half-half | old-fashioned | twenty-one | O- | C- | can-do | Self-confidence
.	3375	 | you... | very... | ...easily | to... | ...and | experienced... | don't... | of...
0	137	 | 20 | 10 | 1990 | 100. | 2015 | '90s | '80s | 300 | 200 | 2006 | 0:) | 00:40:46]
1	96	 | 10 | 18 | 1990 | 15 | 21 | 100. | 2015 | MA15+ | 12 | 12hour
2	93	 | 20 | 21 | 2015 | 200 | 2006 | 12 | 12hour | 26 | 27 | 28 | 00:42:49) | 00:47:42)
3	6	 | 300
4	6	 | 00:40:46] | 00:42:49) | 00:47:42)
5	78	 | 15 | 2015 | MA15+ | 50 | 25
6	5	 | 2

## Search in files

In [2]:
def find_in_file(file):
    """Print every annotation of one ELAN file that contains "¿".

    *file* is joined to the module-level ``input_folder``. The resulting
    path is printed (preceded by a blank line) before the file is parsed.
    Only tiers whose TIER_ID starts with "INT:" or "PNT:" are searched.
    """
    source_path = os.path.join(input_folder, file)
    print("\n" + source_path)

    document_root = ET.parse(source_path).getroot()

    for tier in document_root.findall(".//TIER"):
        # only the INT: and PNT: tiers are searched
        if not tier.attrib["TIER_ID"].startswith(("INT:", "PNT:")):
            continue

        for value_node in tier.findall(".//ANNOTATION_VALUE"):
            content = value_node.text

            # alternative search, for values ending in a single hyphen:
            # if content and (content.endswith("-") or content.endswith("- ")) and not content.endswith("--"):
            if content and "¿" in content:
                print(content)

# Search every .eaf file below the input folder.
# os.walk() also descends into sub-folders, while find_in_file() joins its
# argument to input_folder. Passing the path relative to input_folder (just
# the file name for top-level files) lets nested files be found as well.
for root, dirs, files in os.walk(input_folder):
    for file in files:
        if file.endswith('.eaf'):
            find_in_file(os.path.relpath(os.path.join(root, file), input_folder))
                


C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_032_Camila.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_063_Daisy.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_065_Bernice.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_073_Marjorie.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_075_Agnes.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_079_Gladys.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_087_Grace.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_102_Evelyn.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_104_Mable.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\Bcnt_AEF_117_Audrey.eaf

C:\Users\barth\Docum


C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_030_George.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_804_Ben.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_822_Jack.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_824_Andrew.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_837_Damasus.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_846_Carlo.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_847_John.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_848_Franco.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_857_PatrickSSDS.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SSDS_GTM_858_AmarSSDS.eaf

C:\Users\barth\Docu


C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_104_Rhys.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_106_Larry.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_107_Reid.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_115_Nate.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_122_Nicholas.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_AYM_124_Craig.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_CYF_001_Sally.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_CYF_002_Karmen.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_CYF_003_Caroline.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_CYF_004_Charlotte.eaf

C:\Users\barth\Docume


C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYF_151_Roseta.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_068_Pietro.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_078_Pablo.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_079_Emilio.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_114_Samuel.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_129_Manuel.eaf

C:\Users\barth\Documents\LDACA\data_cleaning\data\SydneySpeaks\Elan_orig\SydS_IYM_137_Stefano.eaf


In [None]:
    """
    

        # for rows that will be deleted because they only contain "." etc. at the end of the file
        deleteList = []

        csv_reader = csv.reader(input_csv_file)

        # skip the header
        next(csv_reader)

        # Process the data as needed
        processed_data = [["Transcript","IU_number","start_time","end_time","speaker", "IU"]]
        
        for row in csv_reader:

            # clean the speaker
            row[5] = clean_speaker(row[5])

            # clean the text
            row[6] = clean_text(row[6])

            # remove all rows which only have a dot . in the UI column
            if row[6] != ".":

                rowCount += 1
                row[2] = str(rowCount)
                processed_data.append(row[1:7])


            # find the tailing cassette and period info
            if ("CASSETTE CHANGE" in row[6]) or (row[6] == "."):
                if rowCount not in deleteList:
                    deleteList.append(rowCount) 
            else:
                # reset the list if there is a row which actually has content
                deleteList = []


    # remove all tailing Cassette change and period info            
    for each in processed_data[1:]:
        if int(each[1]) in deleteList:
            processed_data.remove(each)

            
    with open(output_file_path, mode='w', newline='', encoding='utf-8') as output_csv_file:

        csv_writer = csv.writer(output_csv_file)

        for row in processed_data:
            #print(row)
            csv_writer.writerow(row)
            
            pattern = "\w*- "
            pattern2 = " -\w*"
            if re.match(pattern, row[5]):
                print (row)
            #elif re.match(pattern2, row[5]):
            #    print (row)

"""