## Anonymise ELAN files

- This script searches all capitalised words in ELAN files and creates a replacement dictionary 
- The replacement dictionary is edited manually
- The edited dictionary is used to replace the capitalised words in the files

## install missing modules and import dependencies

In [2]:
# install modules
!pip install simplejson 
!pip install beautifulsoup4
!pip install fileupload

Collecting fileupload
  Downloading fileupload-0.1.5-py2.py3-none-any.whl (6.2 kB)
Installing collected packages: fileupload
Successfully installed fileupload-0.1.5


In [None]:
import os
import re
import glob
import json
import datetime
import traitlets
import simplejson
from bs4 import BeautifulSoup as bs
import xml.etree.ElementTree as ET

from ipywidgets import widgets
from IPython.display import display
from tkinter import Tk, filedialog


class SelectFilesButton(widgets.Button):
    """A button widget that opens a tkinter file-selection dialog when clicked.

    The paths chosen in the dialog are stored in the ``files`` trait (a list),
    which other notebook cells can read after the user has made a selection.
    """

    def __init__(self):
        super().__init__()
        # Add the ``files`` trait that will hold the selected paths.
        self.add_traits(files=traitlets.traitlets.List())
        # Create the button.
        self.description = "Select Files"
        self.icon = "square-o"
        self.style.button_color = "orange"
        # Set on click behavior.
        self.on_click(self.select_files)

    @staticmethod
    def select_files(b):
        """Open a tkinter file dialog and store the selection on the button.

        Parameters
        ----------
        b : obj:
            An instance of ipywidgets.widgets.Button; its ``files`` trait is
            set to the list of selected paths.
        """
        # Create Tk root
        root = Tk()
        try:
            # Hide the main window
            root.withdraw()
            # Raise the root to the top of all windows.
            root.call('wm', 'attributes', '.', '-topmost', True)
            selected = filedialog.askopenfilename(multiple=True)
        finally:
            # Always release the hidden root window; otherwise every click
            # leaves another Tk instance behind.
            root.destroy()

        # A cancelled dialog yields an empty value (empty tuple or empty
        # string depending on the platform). Leave the button and any earlier
        # selection untouched in that case.
        if not selected:
            return

        # The selected paths are stored in b.files.
        b.files = list(selected)

        b.description = "Files Selected"
        b.icon = "check-square-o"
        b.style.button_color = "lightgreen"

print("+++ Ready +++")

In [4]:
# Boilerplate
import io
import fileupload


def upload():
    """Create an upload-files button that prints the file name and file size.

    Returns
    -------
    fileupload.FileUploadWidget
        The widget, with an observer on its ``data`` trait that reports each
        uploaded file.
    """

    _upload_widget = fileupload.FileUploadWidget()

    def _cb(change):
        # Measure the raw uploaded bytes. Counting decoded characters gives
        # the wrong size for non-ASCII text and fails on non-UTF-8 files.
        owner = change['owner']
        size_kb = len(owner.data) / 2 ** 10
        print('Uploaded `{}` ({:.2f} kB)'.format(owner.filename, size_kb))

    _upload_widget.observe(_cb, names='data')

    return _upload_widget

up = upload()

up

FileUploadWidget(label='Browse', _dom_classes=('widget_item', 'btn-group'))

## Select files and replacement dictionary, if there is an existing one

In [None]:
my_button = SelectFilesButton()
my_button # This will display the button in the context of the Notebook

In [None]:
# The selection made with the button above is available as my_button.files.
# Keep only the ELAN files (.eaf, matched case-insensitively).
files = my_button.files
files = [fi for fi in files if fi.lower().endswith(".eaf")]
print (files)

# Stop with a clear message instead of an IndexError further down.
if not files:
    raise ValueError("No .eaf files selected - use the 'Select Files' button first.")

# out_path is where the anonymised files and the replacement dictionary will be stored:
# the folder of the first selected file, with a trailing path separator.
out_path = os.path.join(os.path.dirname(files[0]), "")

In [None]:
# Load a previously saved replacement dictionary, if one is configured.

# Path to an existing replacement dictionary, or False to start a new one.
existing_dict = False

# Leave the line below commented out to create a new replacement dictionary;
# uncomment it (and adjust the path) to continue with an existing one.
#existing_dict = r"C:\Users\barth\Documents\LDACA\jupyter_notebooks\Anonymisation_ELAN\replacements_2023_4_26_1125.json"

# Starts empty and is replaced by the stored dictionary when one is given.
nameDict = dict()
if existing_dict:
    with open(existing_dict, mode='r', encoding="utf-8") as dict_file:
        nameDict = json.loads(dict_file.read())
    print("existing_dict loaded: " + existing_dict)

print("+++ DONE +++")

## Find the capitalised words in the ELAN files

In [None]:
# Every lowercase word collected so far.
lowerList = set()
# Module-level placeholder for a tier name.
tier = ""


def add_to_lower_list(string):
    """Add every run of lowercase ASCII letters in ``string`` to ``lowerList``.

    The first character of ``string`` is skipped before searching.
    """
    lowerList.update(re.findall('([a-z]+)', string[1:]))

    
def distribute(annos, function):
    """Call ``function`` on every child of every element in ``annos``.

    Each element is expected to expose its children as ``.contents``;
    elements without children contribute no calls.
    """
    for element in annos:
        for child in element.contents:
            function(child)

                
def find_context(string, word):
    """Return the stretch of ``string`` around the first occurrence of ``word``.

    The window runs from 50 characters before the position where ``word``
    starts to 50 characters after that position, clipped to the string.
    When ``word`` does not occur, ``str.find`` yields -1 and the first
    49 characters of ``string`` are returned.
    """
    position = string.find(word)
    window_start = position - 50
    if window_start < 0:
        window_start = 0
    window_end = position + 50
    return string[window_start:window_end]


def pick_this_word(string, word):
    """Record ``word`` as a replacement candidate unless it is a known lowercase word.

    Parameters
    ----------
    string :
        The annotation text node in which ``word`` was found. Its
        great-great-grandparent element must carry a ``tier_id`` attribute.
    word : str
        The capitalised word or phrase to register.

    The occurrence is stored in the module-level ``nameDict`` under
    ``nameDict[word]["replace_at"]`` as ``[context, "Yes", basename, tier]``,
    where ``basename`` is the module-level name of the file being processed.
    """
    tier = string.parent.parent.parent.parent["tier_id"]

    # Words that also occur in lowercase are not treated as names.
    if word.lower() in lowerList:
        return

    context = find_context(string, word)

    entry = nameDict.setdefault(word, {"replace_as": word, "replace_at": []})
    occurrences = entry["replace_at"]

    # Each stored occurrence is a list, so compare field by field. Testing
    # ``context not in occurrences`` is always true and appends duplicates.
    # The "Yes"/"No" flag is ignored so that an occurrence the user has
    # already switched to "No" is not added again as "Yes".
    already_recorded = any(
        [occurrence[0], *occurrence[2:4]] == [context, basename, tier]
        for occurrence in occurrences
    )
    if not already_recorded:
        occurrences.append([context, "Yes", basename, tier])
              
                
def add_to_name_dict(string):
    """Pass every capitalised and every ALL-CAPS word of ``string`` to ``pick_this_word``.

    Parameters
    ----------
    string :
        The annotation text node to scan; it is handed on unchanged to
        ``pick_this_word`` together with each word found.
    """
    text = string[:]

    # find all words with a capitalised first letter
    for word in re.findall(r'\b[A-Z][a-z]+\b', text):
        pick_this_word(string, word)

    # find all ALLCAPS words
    # The whole text is searched so that a word at the very start is not
    # truncated ("USA ..." used to be recorded as "SA"). A one-letter word
    # at position 0 (e.g. a leading "I" or "A") is still skipped, as before.
    for match in re.finditer(r'\b[A-Z]+\b', text):
        word = match.group()
        if match.start() == 0 and len(word) == 1:
            continue
        pick_this_word(string, word)

def multi_words(string):
    """Pass every run of two or more consecutive capitalised words to ``pick_this_word``.

    A run is a capitalised word followed by one or more further capitalised
    words, each separated by a single whitespace character
    (for example "John Smith").
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    pattern = r'([A-Z][a-z]+(?=\s[A-Z])(?:\s[A-Z][a-z]+)+)'
    for phrase in re.findall(pattern, string[:]):
        pick_this_word(string, phrase)

        
def filter_redundence(nameDict):
    """Remove occurrences of a word that are already covered by a longer entry.

    When a key is one of the whitespace-separated tokens of another key
    (for example "John" and "John Smith"), every occurrence of the shorter
    key whose context contains the longer key is dropped from the shorter
    key's "replace_at" list. Keys left without any occurrence are deleted.

    Parameters
    ----------
    nameDict : dict
        Maps each word to ``{"replace_as": ..., "replace_at": [...]}``, where
        every "replace_at" item has the context string at index 0.
        The dictionary is modified in place.
    """
    for key, entry in nameDict.items():
        # Longer keys that contain this key as a whole token.
        covering_terms = [
            term for term in nameDict if term != key and key in term.split()
        ]
        if not covering_terms:
            continue

        # Build the filtered list in one go. Deleting items while iterating
        # over the same list skips the element after each deletion.
        entry["replace_at"][:] = [
            occurrence
            for occurrence in entry["replace_at"]
            if not any(term in occurrence[0] for term in covering_terms)
        ]

    # Drop the keys that no longer have any occurrence.
    emptied = [key for key, entry in nameDict.items() if not entry["replace_at"]]
    for key in emptied:
        del nameDict[key]


"""def replace_quotes(string):
    replacements = {0x2013: '--',
                0x2014: '---',
                0x201c: '"',
                0x201d: '"',
                0x2018: "'",
                0x2019: "'",
                0x2026: '...',
                0xa0: ' '}

    text = sys.stdin.read()
    for r in replacements:
        text = text.replace(chr(r), replacements[r])"""
    
        
    
        
        
# First pass: parse every file and collect the lowercase words of ALL files.
# Doing this before any name is picked makes the result independent of the
# order in which the files were selected.
parsed_files = []
for file in files:
    with open(file, "r", encoding="utf-8") as inf:
        # The annotation elements are looked up by their lower-cased tag name.
        soup = bs(inf, "lxml")

    # find_all is the current name of the deprecated findAll alias.
    annos = soup.find_all("annotation_value")

    distribute(annos, add_to_lower_list)

    parsed_files.append((os.path.basename(file), annos))

# Second pass: pick the capitalised words file by file.
# ``basename`` is read by pick_this_word to record the source file.
for basename, annos in parsed_files:

    distribute(annos, add_to_name_dict)

    distribute(annos, multi_words)

    filter_redundence(nameDict)

print (" +++ DONE +++")

## Create new replacement dictionary

In [None]:
# out_path is the folder where the replacement dictionary will be stored.
# NOTE(review): the hard-coded, machine-specific folder below replaces any
# out_path set earlier in the notebook - adjust it before running elsewhere.


#dir_path = "C:\\Users\\barth\\Documents\\MFA\\my_corpus\\"
#outFile = "C:\\Users\\barth\\Documents\\MFA\\my_corpus\\name_dictionary_per_item.json"
out_path = "C:\\Users\\barth\\Documents\\LDACA\\jupyter_notebooks\\Anonymisation_ELAN\\"

# Read the clock once so that all parts of the timestamp belong together.
now = datetime.datetime.now()

YEAR        = now.year     # the current year
MONTH       = now.month    # the current month
DATE        = now.day      # the current day
HOUR        = now.hour     # the current hour
MINUTE      = now.minute   # the current minute
SECONDS     = now.second   # the current second


# Hour and minute are zero-padded. Without padding 1:15 and 11:05 both give
# "115", so a later run could overwrite an already edited dictionary.
outFile = out_path + "replacements_{}_{}_{}_{:02d}{:02d}.json".format(
    YEAR, MONTH, DATE, HOUR, MINUTE
)


with open(outFile, "w", encoding="utf-8") as outf:
    outf.write(simplejson.dumps(nameDict, indent=4, sort_keys=True, ensure_ascii=False))

print (outFile)
print("+++ DONE +++")

## Edit the replacement dictionary
Manual step: Open the dictionary in a text editor (e.g. Notepad) and make the changes you want:

- "replace_as": what the word should be changed into
- "replace_at": the instances where the word is found, each stored as [context, "Yes", file name, tier]. Change "Yes" to "No" to exclude the instance

## Load the edited dictionary and automatically change the ELAN files

In [None]:
# Read the manually edited replacement dictionary back from disk.
with open(outFile, mode="r", encoding="utf-8") as inf:
    replacementDict = json.loads(inf.read())
print(" +++ DONE: The replacement Dictionary got loaded +++")

In [None]:
import os
from xml.etree import ElementTree as et

def _apply_replacements(text, replacement_dict):
    """Return ``text`` with every approved dictionary key replaced.

    A key is approved for this text when it occurs in the text and at least
    one of its "replace_at" entries is flagged "Yes" with a context string
    that also occurs in the text.
    """
    approved = {}
    for key, entry in replacement_dict.items():
        # An empty key would match at every position; ignore it.
        if not key or key not in text:
            continue
        if any(changer[0] in text and changer[1] == "Yes"
               for changer in entry["replace_at"]):
            approved[key] = entry["replace_as"]

    if not approved:
        return text

    # All changes are executed together in a single pass, longest key first.
    # This way "John Smith" is replaced as a whole before "John" can match,
    # and text that has just been inserted is never replaced again.
    longest_first = sorted(approved, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in longest_first))
    return pattern.sub(lambda match: approved[match.group(0)], text)


for file in files:
    print (file)

    path = os.path.dirname(os.path.abspath(file))
    filename = os.path.basename(file)
    # splitext removes only the final extension, so dots inside the file name
    # are kept ("rec.2020.eaf" -> "rec.2020_anon.eaf").
    newFilename = os.path.splitext(filename)[0] + "_anon.eaf"
    print (newFilename)
    print (path)
    # Separate name so that outFile keeps pointing at the replacement
    # dictionary and the loading cell can be run again.
    anonFile = os.path.join(path, newFilename)

    # Passing the path lets the XML parser handle the file's encoding itself.
    tree = ET.parse(file)
    root = tree.getroot()

    for each in root.iter('ANNOTATION_VALUE'):
        if each.text:
            each.text = _apply_replacements(each.text, replacementDict)

    # Write UTF-8 with an XML declaration. The default (us-ascii, no
    # declaration) turns every non-ASCII character into a character reference.
    tree.write(anonFile, encoding="utf-8", xml_declaration=True)

The files should now be stored in the same folder as the originals