## ELAN slicer
- this notebook uses ELAN .eaf files to slice the audio into small junks which can be handled fast by CPU base automatic alignment in audiotorch
- the wav file gets sliced into small snippets
- a json file with the small transcript for each wav snippet is also created 


In [2]:
!pip install pydub

Collecting pydub
  Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [3]:
import os
import sys
import json
from pydub import AudioSegment
import xml.etree.ElementTree as ET

!pip install pympi-ling
!{sys.executable} -m pip install pydub



# Automatically set up required folders 
- input  # all source ELAN and wav files go in here for processing. The files stay unchanged
- media_snippets # will hold all the wav snippets for processing
- output # the final result ELAN files will be in here

In [4]:
folder_path = 'C:\\Users\\barth\\gits\\pytorch_wav2vec\\test_data\\'

def create_folder(base_folder, folderName):
    new_folder = os.path.join(base_folder, folderName)

    if not os.path.exists(new_folder):
        os.makedirs(new_folder)
        print(f"Folder '" + folderName + f"' created inside '{base_folder}'.")
    else:
        print("The '" + folderName + f"' folder already exists inside '{base_folder}'.")

# create the basic folder structure withing the folder_path
create_folder(folder_path, "input")
create_folder(folder_path, "media_snippets")
create_folder(folder_path, "output")


# set the path variables
inputPath = folder_path + "input\\"
mediaPath = folder_path + "media_snippets\\"
outputPath = folder_path + "output\\"


The 'input' folder already exists inside 'C:\Users\barth\gits\pytorch_wav2vec\test_data\'.
The 'media_snippets' folder already exists inside 'C:\Users\barth\gits\pytorch_wav2vec\test_data\'.
The 'output' folder already exists inside 'C:\Users\barth\gits\pytorch_wav2vec\test_data\'.


## create json file with objects for each annotation

In [6]:

# Function to parse TIME_ORDER elements
def parse_time_order(root):
    time_slots_dict = {}
    for time_slot_elem in root.iter('TIME_SLOT'):
        
        time_slot_id = time_slot_elem.attrib.get('TIME_SLOT_ID')
        time_value = time_slot_elem.attrib.get('TIME_VALUE')

        if time_slot_id is not None and time_value is not None:
            time_slots_dict[time_slot_id] = time_value
            
    return time_slots_dict


# Function to parse annotation elements
def parse_annotations(root, time_slots_dict):
    
    data = {}

    # 'TIER' is the parent tag of 'ANNOTATION' elements
    for tier_elem in root.iter('TIER'):

        for annotation_elem in tier_elem.iter('ANNOTATION'):
            
            for alignable_annotation_elem in annotation_elem.iter('ALIGNABLE_ANNOTATION'):
       
                anno = {"tiername": tier_elem.attrib["TIER_ID"], 
                        "annoID": alignable_annotation_elem.attrib['ANNOTATION_ID'], 
                        "timeSlotRef1": alignable_annotation_elem.attrib['TIME_SLOT_REF1'], 
                        "timeSlotRef2": alignable_annotation_elem.attrib['TIME_SLOT_REF2'], 
                        "timeStamp1": int(time_slots_dict.get(alignable_annotation_elem.attrib['TIME_SLOT_REF1'])), 
                        "timeStamp2": int(time_slots_dict.get(alignable_annotation_elem.attrib['TIME_SLOT_REF2'])), 
                        "text": alignable_annotation_elem.find('ANNOTATION_VALUE').text,
                        "words": [],
                        "phonemes": []
                       }
                
                data[anno["tiername"] + "_" + anno["annoID"]] = anno 
    return data


def parse_xml(xml_file_path):
    
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # get all timeslots into a dictionary
    time_slots_dict = parse_time_order(root)

    # get all annotations
    data = parse_annotations(root, time_slots_dict)

    json_file_path = os.path.splitext(eaf_file_path)[0] + '.json'

    
    folder_path = os.path.dirname(xml_file_path)
    file_name = os.path.basename(xml_file_path)
    json_file_path = mediaPath + file_name[:-4] + ".json"
    
    with open(json_file_path, 'w') as json_file:
        json.dump(data, json_file, indent=2)



# Process each .eaf file in the folder
for filename in os.listdir(folder_path + "/input/"):
    
    if filename.endswith('.eaf'):
        eaf_file_path = os.path.join(folder_path + "/input/", filename)
        print(eaf_file_path)
        parse_xml(eaf_file_path)

print ("+++ DONE +++")

C:\Users\barth\gits\pytorch_wav2vec\test_data\/input/Cathy_Samun_Wiliang_Buai_and_Tabudape_Chorus_20130412.eaf
C:\Users\barth\gits\pytorch_wav2vec\test_data\/input/Kadagoi_Lovinea_Rapalau_Life_Story_20130422.eaf
+++ DONE +++


In [7]:
def slice_wav_file(input_wav_path, output_wav_path, start_time, end_time):
    # Load the entire audio file
    audio = AudioSegment.from_wav(input_wav_path)

    # Slice the audio based on start_time and end_time
    sliced_audio = audio[start_time:end_time]

    # Export the sliced audio to a new file
    sliced_audio.export(output_wav_path, format="wav")

def read_json_file(filename):
    try:
        with open(filename, 'r') as json_file:
            data = json.load(json_file)
            return data
    except Exception as e:
        print(f"Error reading JSON file: {e}")
        return None

for filename in os.listdir(mediaPath):
    
    counter  = 0
    if filename.endswith('.json'):
        
        newFolder = folder_path + "media_snippets\\"
        
        create_folder(newFolder, filename[:-5])
        
        json_data = read_json_file(mediaPath + filename)
        
        input_wav_path = inputPath + filename[:-5] + ".wav"
        #print (audio_file)
        for k, v in json_data.items():
            print (v)
            

            snippetname = k
            snippetname = snippetname.replace(":", "_").replace(" ", "_")
            output_wav_path = newFolder + filename[:-5] + "\\" + snippetname + ".wav"
            start = v["timeStamp1"]
            end = v["timeStamp2"]
            print (output_wav_path, start, end)
        
            slice_wav_file(input_wav_path, output_wav_path, start, end)

print ("+++ DONE +++")


Folder 'Cathy_Samun_Wiliang_Buai_and_Tabudape_Chorus_20130412' created inside 'C:\Users\barth\gits\pytorch_wav2vec\test_data\media_snippets\'.
{'tiername': 'Cathy Samun Wiliang', 'annoID': 'a1', 'timeSlotRef1': 'ts1', 'timeSlotRef2': 'ts2', 'timeStamp1': 4730, 'timeStamp2': 7480, 'text': 'Yangau Cathy Samun ai', 'words': [], 'phonemes': []}
C:\Users\barth\gits\pytorch_wav2vec\test_data\media_snippets\Cathy_Samun_Wiliang_Buai_and_Tabudape_Chorus_20130412\Cathy_Samun_Wiliang_a1.wav 4730 7480
{'tiername': 'Cathy Samun Wiliang', 'annoID': 'a2', 'timeSlotRef1': 'ts3', 'timeSlotRef2': 'ts4', 'timeStamp1': 8860, 'timeStamp2': 10780, 'text': 'Ngahau mam yangan Daidai ai', 'words': [], 'phonemes': []}
C:\Users\barth\gits\pytorch_wav2vec\test_data\media_snippets\Cathy_Samun_Wiliang_Buai_and_Tabudape_Chorus_20130412\Cathy_Samun_Wiliang_a2.wav 8860 10780
{'tiername': 'Cathy Samun Wiliang', 'annoID': 'a3', 'timeSlotRef1': 'ts5', 'timeSlotRef2': 'ts6', 'timeStamp1': 11821, 'timeStamp2': 14853, 'text

{'tiername': 'Cathy Samun Wiliang', 'annoID': 'a31', 'timeSlotRef1': 'ts49', 'timeSlotRef2': 'ts50', 'timeStamp1': 116579, 'timeStamp2': 122575, 'text': 'Patatan malalte nagekanen mariu han matan honami  ngambi ngammalenggokai', 'words': [], 'phonemes': []}
C:\Users\barth\gits\pytorch_wav2vec\test_data\media_snippets\Cathy_Samun_Wiliang_Buai_and_Tabudape_Chorus_20130412\Cathy_Samun_Wiliang_a31.wav 116579 122575
{'tiername': 'Cathy Samun Wiliang', 'annoID': 'a33', 'timeSlotRef1': 'ts51', 'timeSlotRef2': 'ts52', 'timeStamp1': 124050, 'timeStamp2': 130867, 'text': 'Do gamoni ngau abate wap ngahau ab mainangan mariu han matan honami milo ngamaninggo', 'words': [], 'phonemes': []}
C:\Users\barth\gits\pytorch_wav2vec\test_data\media_snippets\Cathy_Samun_Wiliang_Buai_and_Tabudape_Chorus_20130412\Cathy_Samun_Wiliang_a33.wav 124050 130867
{'tiername': 'Cathy Samun Wiliang', 'annoID': 'a35', 'timeSlotRef1': 'ts53', 'timeSlotRef2': 'ts54', 'timeStamp1': 131085, 'timeStamp2': 133745, 'text': 'Mari

{'tiername': 'Kadagoi Lovinea Rapalau', 'annoID': 'a11', 'timeSlotRef1': 'ts21', 'timeSlotRef2': 'ts22', 'timeStamp1': 39088, 'timeStamp2': 42278, 'text': 'Malal main ngammeninge haun malal turan ngamawe ngammadonge', 'words': [], 'phonemes': []}
C:\Users\barth\gits\pytorch_wav2vec\test_data\media_snippets\Kadagoi_Lovinea_Rapalau_Life_Story_20130422\Kadagoi_Lovinea_Rapalau_a11.wav 39088 42278
{'tiername': 'Kadagoi Lovinea Rapalau', 'annoID': 'a12', 'timeSlotRef1': 'ts23', 'timeSlotRef2': 'ts24', 'timeStamp1': 42308, 'timeStamp2': 43088, 'text': 'Aben haun', 'words': [], 'phonemes': []}
C:\Users\barth\gits\pytorch_wav2vec\test_data\media_snippets\Kadagoi_Lovinea_Rapalau_Life_Story_20130422\Kadagoi_Lovinea_Rapalau_a12.wav 42308 43088
{'tiername': 'Kadagoi Lovinea Rapalau', 'annoID': 'a13', 'timeSlotRef1': 'ts25', 'timeSlotRef2': 'ts26', 'timeStamp1': 43668, 'timeStamp2': 44625, 'text': 'Ngam madonge', 'words': [], 'phonemes': []}
C:\Users\barth\gits\pytorch_wav2vec\test_data\media_snippe