In [7]:
import numpy as np
import librosa as lr
import matplotlib.pyplot as plt
import pandas as pd
import soundfile as sf

import os
import shutil

In [2]:
# Input directories for the train / dev / test splits of the raw dataset.
# The paths are relative, so they resolve against the notebook's working directory.
# The functions in this notebook list each of these directories and treat every
# entry as a "<id>_..." folder holding that recording's files.
TRAIN_DIR = "../../raw-dataset/data_splits/train"
DEV_DIR = "../../raw-dataset/data_splits/dev"
TEST_DIR = "../../raw-dataset/data_splits/test"

# Cleaning Transcripts

Some transcripts contain anomalies at the end, where a row's start time is less than the previous row's end time. The code below drops those rows and saves the cleaned transcript to a new file (`<id>_Transcript_Clean.csv`). Since those anomalies are at the end, we can safely drop them.


In [3]:
def clean_transcripts(dir_name):
    """Write a cleaned copy of every transcript found under ``dir_name``.

    Each sub-folder of ``dir_name`` is expected to be named ``<id>_...`` and to
    contain ``<id>_Transcript.csv`` with ``Start_Time`` and ``End_Time`` columns.
    A row is treated as an anomaly, and dropped, when its start time is earlier
    than the end time of the row directly above it in the original file. The
    result is saved next to the original as ``<id>_Transcript_Clean.csv``; the
    original file is never modified, so the function can be re-run safely.

    Parameters
    ----------
    dir_name : str
        Directory containing one sub-folder per transcript. Entries that are
        not directories are ignored.

    Raises
    ------
    FileNotFoundError
        If a sub-folder does not contain the expected transcript file.
    KeyError
        If a transcript lacks the ``Start_Time`` or ``End_Time`` column.
    """
    for folder in sorted(os.listdir(dir_name)):
        folder_path = os.path.join(dir_name, folder)
        # Stray files (e.g. ".DS_Store") are not transcript folders; skipping
        # them avoids a crash when building the transcript path below.
        if not os.path.isdir(folder_path):
            continue

        print(f"Cleaning {folder}")

        # The folder name starts with the id that also prefixes its files.
        session_id = folder.split("_")[0]
        transcript = pd.read_csv(
            os.path.join(folder_path, f"{session_id}_Transcript.csv")
        )

        # shift(1) pairs each row with the End_Time of the row above it. The
        # first row is compared against NaN, which is never "less than", so it
        # is always kept.
        overlaps = transcript["Start_Time"] < transcript["End_Time"].shift(1)
        for i in transcript.index[overlaps]:
            print(f"Anamoly found at index {i}")

        # Build a new frame instead of dropping rows in place one by one.
        cleaned = transcript[~overlaps]
        cleaned.to_csv(
            os.path.join(folder_path, f"{session_id}_Transcript_Clean.csv"),
            index=False,
        )
        print("-----")

# Extract all Transcripts

In [8]:
def copy_clean_transcripts(dir_name, DEST_DIR):
    """Collect every cleaned transcript under ``dir_name`` into ``DEST_DIR``.

    For each sub-folder ``<id>_...`` of ``dir_name`` the file
    ``<id>_Transcript_Clean.csv`` is copied to ``DEST_DIR/<id>_Transcript.csv``
    (note that the ``_Clean`` suffix is dropped in the destination name).

    Parameters
    ----------
    dir_name : str
        Directory containing one sub-folder per transcript. Entries that are
        not directories are ignored.
    DEST_DIR : str
        Output directory; created (including parents) if it does not exist.

    Raises
    ------
    FileNotFoundError
        If a sub-folder has no ``<id>_Transcript_Clean.csv`` file.
    """
    # exist_ok replaces the separate exists() check, which could race with
    # another process creating the directory in between.
    os.makedirs(DEST_DIR, exist_ok=True)

    for folder in sorted(os.listdir(dir_name)):
        folder_path = os.path.join(dir_name, folder)
        # Only sub-folders hold transcripts; skip stray files.
        if not os.path.isdir(folder_path):
            continue

        print(f"Copying {folder}")

        # The folder name starts with the id that also prefixes its files.
        session_id = folder.split("_")[0]
        shutil.copyfile(
            os.path.join(folder_path, f"{session_id}_Transcript_Clean.csv"),
            os.path.join(DEST_DIR, f"{session_id}_Transcript.csv"),
        )

In [None]:
DEST_DIR = "../../raw-dataset/all_transcripts/test"

# copy_clean_transcripts reads the "<id>_Transcript_Clean.csv" files that
# clean_transcripts writes, so the cleaning step has to run first; otherwise a
# fresh "Restart & Run All" fails here with FileNotFoundError. Running the
# cleaning more than once is harmless: it always rebuilds the clean file from
# the untouched original transcript.
clean_transcripts(TEST_DIR)
copy_clean_transcripts(TEST_DIR, DEST_DIR)

In [None]:
# Writes "<id>_Transcript_Clean.csv" beside every transcript in the test split.
# NOTE(review): copy_clean_transcripts and extract_audio_from_timestamps both
# read those cleaned files, so this call needs to have run before either of them.
clean_transcripts(TEST_DIR)

# Extracting Audio Based on Transcripts

In [4]:
# Output folders that receive the audio extracted for each split.
TRAIN_DESTINATION_DIR = "../../extracted_audio/train"
DEV_DESTINATION_DIR = "../../extracted_audio/dev"
TEST_DESTINATION_DIR = "../../extracted_audio/test"

# Every output folder, in train / dev / test order.
dirs = [
    TRAIN_DESTINATION_DIR,
    DEV_DESTINATION_DIR,
    TEST_DESTINATION_DIR,
]

In [5]:
def extract_audio_from_timestamps(src_dir, dest_dir):
    """Keep only the transcribed segments of each recording and save them joined.

    For every sub-folder ``<id>_...`` of ``src_dir`` this loads
    ``<id>_AUDIO.wav`` and ``<id>_Transcript_Clean.csv``, slices the audio at
    each row's ``Start_Time`` / ``End_Time``, concatenates the slices in
    transcript order and writes the result to ``dest_dir/<id>_AUDIO.wav``.

    Parameters
    ----------
    src_dir : str
        Directory containing one sub-folder per recording. Entries that are
        not directories are ignored.
    dest_dir : str
        Output directory; created (including parents) if it does not exist.

    Raises
    ------
    FileNotFoundError
        If a sub-folder lacks the cleaned transcript or the audio file.
    KeyError
        If a transcript lacks the ``Start_Time`` or ``End_Time`` column.
    """
    # Do not rely on another cell having created the output folder.
    os.makedirs(dest_dir, exist_ok=True)

    for folder in sorted(os.listdir(src_dir)):
        folder_path = os.path.join(src_dir, folder)
        # Only sub-folders hold recordings; skip stray files.
        if not os.path.isdir(folder_path):
            continue

        print(f"Extracting audio for {folder}")

        # The folder name starts with the id that also prefixes its files.
        session_id = folder.split("_")[0]
        transcript = pd.read_csv(
            os.path.join(folder_path, f"{session_id}_Transcript_Clean.csv")
        )
        # NOTE(review): load() is called without sr/mono arguments, so librosa's
        # defaults decide the sample rate and channel layout of `y` - confirm
        # that this is intended for the downstream consumers.
        y, sr = lr.load(os.path.join(folder_path, f"{session_id}_AUDIO.wav"))

        # Timestamps are multiplied by the sample rate to get sample offsets,
        # so they are presumably expressed in seconds.
        segments = [
            y[int(start_s * sr):int(end_s * sr)]
            for start_s, end_s in zip(
                transcript["Start_Time"].values, transcript["End_Time"].values
            )
        ]

        # Join whole slices with NumPy. Extending a Python list with the slice
        # contents creates one Python object per audio sample, which is very
        # slow and memory hungry for long recordings.
        # An empty transcript yields an empty array of the audio's own dtype.
        audio = np.concatenate(segments) if segments else y[:0]

        sf.write(os.path.join(dest_dir, f"{session_id}_AUDIO.wav"), audio, sr)
        
# Create every output folder up front; exist_ok makes this safe to re-run.
# The loop variable is deliberately not called `dir`: at notebook top level it
# would become a global and shadow the builtin dir() for the rest of the session.
for destination in dirs:
    os.makedirs(destination, exist_ok=True)

In [None]:
extract_audio_from_timestamps(TEST_DIR, TEST_DESTINATION_DIR)