In [3]:
import numpy as np
import librosa as lr
import matplotlib.pyplot as plt
import pandas as pd
from scipy.io import wavfile

import os
import shutil

In [2]:
# Source folders of the raw dataset, one per data split (paths are relative).
# The functions below list each folder and treat every entry as a
# per-recording sub-folder named `<id>_...` (e.g. `600_P`).
TRAIN_DIR = "../../raw-dataset/data_splits/train"
DEV_DIR = "../../raw-dataset/data_splits/dev"
TEST_DIR = "../../raw-dataset/data_splits/test"

# Cleaning Transcripts

Some transcripts contain anomalies at the end, where a row's start time is less than the previous row's end time. The code below drops those rows and saves the cleaned transcripts to a new file. Since those anomalies occur at the end, we can safely drop them.


In [7]:
def clean_transcripts(dir_name):
    """Drop overlapping rows from every transcript found under ``dir_name``.

    Each sub-folder ``<id>_*`` of ``dir_name`` must contain
    ``<id>_Transcript.csv`` with ``Start_Time`` and ``End_Time`` columns.
    A row is treated as an anomaly when its ``Start_Time`` is smaller than
    the ``End_Time`` of the row directly above it in the original file.
    The remaining rows are written to ``<id>_Transcript_Clean.csv`` in the
    same sub-folder; the original file is left untouched.

    Args:
        dir_name: Folder holding one sub-folder per recording. Entries that
            are not directories (stray files) are skipped.

    Raises:
        FileNotFoundError: If a sub-folder has no ``<id>_Transcript.csv``.
        KeyError: If a transcript lacks ``Start_Time`` or ``End_Time``.
    """
    for folder in sorted(os.listdir(dir_name)):
        folder_path = os.path.join(dir_name, folder)
        # Stray files (e.g. .DS_Store) are not recordings; skip them
        # instead of failing while building the transcript path.
        if not os.path.isdir(folder_path):
            continue

        print(f"Cleaning {folder}")

        # Folder names look like "<id>_<suffix>"; the id prefixes the file names.
        session_id = folder.split("_")[0]

        transcript = pd.read_csv(
            os.path.join(folder_path, f"{session_id}_Transcript.csv")
        )

        # shift(1) lines every row up with its predecessor's End_Time. The
        # first row is compared against NaN, which is never "less than", so
        # it is always kept.
        overlaps = transcript["Start_Time"] < transcript["End_Time"].shift(1)

        for i in transcript.index[overlaps]:
            print(f"Anamoly found at index {i}")

        # Build a new frame rather than dropping rows in place one by one.
        cleaned = transcript.loc[~overlaps]
        cleaned.to_csv(
            os.path.join(folder_path, f"{session_id}_Transcript_Clean.csv"),
            index=False,
        )
        print("-----")
        print("-----")

# Extract all Transcripts

In [8]:
def copy_clean_transcripts(dir_name, DEST_DIR):
    """Collect every cleaned transcript under ``dir_name`` into one folder.

    For each sub-folder ``<id>_*`` of ``dir_name`` the file
    ``<id>_Transcript_Clean.csv`` is copied to
    ``DEST_DIR/<id>_Transcript.csv`` (note the ``_Clean`` suffix is dropped
    in the destination name).

    Args:
        dir_name: Folder holding one sub-folder per recording. Entries that
            are not directories (stray files) are skipped.
        DEST_DIR: Folder receiving the copies; created if missing.

    Raises:
        FileNotFoundError: If a sub-folder has no ``<id>_Transcript_Clean.csv``.
    """
    # exist_ok avoids the separate exists() check and makes re-runs harmless.
    os.makedirs(DEST_DIR, exist_ok=True)

    for folder in sorted(os.listdir(dir_name)):
        folder_path = os.path.join(dir_name, folder)
        # Stray files (e.g. .DS_Store) are not recordings; skip them.
        if not os.path.isdir(folder_path):
            continue

        print(f"Copying {folder}")

        # Folder names look like "<id>_<suffix>"; the id prefixes the file names.
        session_id = folder.split("_")[0]

        shutil.copyfile(
            os.path.join(folder_path, f"{session_id}_Transcript_Clean.csv"),
            os.path.join(DEST_DIR, f"{session_id}_Transcript.csv"),
        )

In [11]:
# Folder that receives the collected test-split transcripts.
DEST_DIR = "../../raw-dataset/all_transcripts/test"

# copy_clean_transcripts reads the `<id>_Transcript_Clean.csv` files, so
# clean_transcripts(TEST_DIR) must already have been run for this split.
copy_clean_transcripts(TEST_DIR, DEST_DIR)

Copying 600_P
Copying 602_P
Copying 604_P
Copying 605_P
Copying 606_P
Copying 607_P
Copying 609_P
Copying 615_P
Copying 618_P
Copying 619_P
Copying 620_P
Copying 622_P
Copying 623_P
Copying 624_P
Copying 625_P
Copying 626_P
Copying 629_P
Copying 631_P
Copying 634_P
Copying 635_P
Copying 636_P
Copying 637_P
Copying 638_P
Copying 640_P
Copying 649_P
Copying 650_P
Copying 651_P
Copying 652_P
Copying 655_P
Copying 656_P
Copying 658_P
Copying 659_P
Copying 661_P
Copying 663_P
Copying 664_P
Copying 666_P
Copying 669_P
Copying 676_P
Copying 679_P
Copying 682_P
Copying 683_P
Copying 688_P
Copying 689_P
Copying 691_P
Copying 693_P
Copying 696_P
Copying 699_P
Copying 705_P
Copying 708_P
Copying 709_P
Copying 710_P
Copying 712_P
Copying 715_P
Copying 716_P
Copying 717_P
Copying 718_P


In [10]:
# Clean the test split: writes `<id>_Transcript_Clean.csv` next to each
# original transcript and prints the index of every dropped row.
clean_transcripts(TEST_DIR)

Cleaning 600_P
-----
Cleaning 602_P
Anamoly found at index 54
-----
Cleaning 604_P
-----
Cleaning 605_P
-----
Cleaning 606_P
-----
Cleaning 607_P
-----
Cleaning 609_P
-----
Cleaning 615_P
-----
Cleaning 618_P
-----
Cleaning 619_P
-----
Cleaning 620_P
-----
Cleaning 622_P
-----
Cleaning 623_P
-----
Cleaning 624_P
-----
Cleaning 625_P
-----
Cleaning 626_P
-----
Cleaning 629_P
-----
Cleaning 631_P
Anamoly found at index 56
-----
Cleaning 634_P
-----
Cleaning 635_P
-----
Cleaning 636_P
-----
Cleaning 637_P
-----
Cleaning 638_P
-----
Cleaning 640_P
-----
Cleaning 649_P
-----
Cleaning 650_P
-----
Cleaning 651_P
-----
Cleaning 652_P
-----
Cleaning 655_P
-----
Cleaning 656_P
-----
Cleaning 658_P
-----
Cleaning 659_P
-----
Cleaning 661_P
Anamoly found at index 167
-----
Cleaning 663_P
-----
Cleaning 664_P
Anamoly found at index 115
-----
Cleaning 666_P
-----
Cleaning 669_P
-----
Cleaning 676_P
Anamoly found at index 65
-----
Cleaning 679_P
Anamoly found at index 153
Anamoly found at index 154
-

# Extracting Audio Based on Transcripts

In [13]:
# Output folders for the extracted audio, one per data split (relative paths).
TRAIN_DESTINATION_DIR = "../../extracted_audio/train"
DEV_DESTINATION_DIR = "../../extracted_audio/dev"
TEST_DESTINATION_DIR= "../../extracted_audio/test"

# Gathered in a list so all output folders can be created in one loop.
dirs = [TRAIN_DESTINATION_DIR, DEV_DESTINATION_DIR, TEST_DESTINATION_DIR]

In [14]:
def extract_audio_from_timestamps(src_dir, dest_dir):
    """Write, per recording, a WAV holding only the transcript-covered audio.

    For each sub-folder ``<id>_*`` of ``src_dir`` this reads
    ``<id>_Transcript_Clean.csv`` and ``<id>_AUDIO.wav``, slices the audio
    between every row's ``Start_Time`` and ``End_Time`` (multiplied by the
    sample rate and truncated to whole samples), joins the slices back to
    back in transcript order and writes them to ``dest_dir/<id>_AUDIO.wav``.

    Args:
        src_dir: Folder holding one sub-folder per recording. Entries that
            are not directories (stray files) are skipped.
        dest_dir: Folder receiving the extracted WAV files; created if
            missing.

    Raises:
        FileNotFoundError: If a sub-folder lacks the cleaned transcript
            (raised by ``pd.read_csv``).
        KeyError: If a transcript lacks ``Start_Time`` or ``End_Time``.
    """
    # Make the function usable on its own, without a separate setup step.
    os.makedirs(dest_dir, exist_ok=True)

    for folder in sorted(os.listdir(src_dir)):
        folder_path = os.path.join(src_dir, folder)
        # Stray files (e.g. .DS_Store) are not recordings; skip them.
        if not os.path.isdir(folder_path):
            continue

        print(f"Extracting audio for {folder}")

        # Folder names look like "<id>_<suffix>"; the id prefixes the file names.
        session_id = folder.split("_")[0]

        transcript = pd.read_csv(
            os.path.join(folder_path, f"{session_id}_Transcript_Clean.csv")
        )
        # NOTE(review): load() is called with its defaults, so `sr` is the
        # rate librosa returns (its documented default resamples to 22050 Hz
        # mono), not necessarily the file's native rate - confirm this is
        # intended.
        y, sr = lr.load(os.path.join(folder_path, f"{session_id}_AUDIO.wav"))

        # Keep each segment as a numpy slice; turning every sample into a
        # Python list element is extremely slow and memory-hungry for audio.
        segments = [
            y[int(start * sr):int(end * sr)]
            for start, end in zip(
                transcript["Start_Time"].values, transcript["End_Time"].values
            )
        ]

        if segments:
            audio_chunks = np.concatenate(segments)
        else:
            # A transcript without rows yields an empty clip of the same dtype.
            audio_chunks = np.zeros(0, dtype=y.dtype)

        wavfile.write(
            os.path.join(dest_dir, f"{session_id}_AUDIO.wav"), sr, audio_chunks
        )
        
# Create every output folder up front; exist_ok makes re-running the cell harmless.
# The loop variable is deliberately not named `dir`, which would rebind the
# built-in dir() at notebook scope for every later cell.
for destination_dir in dirs:
    os.makedirs(destination_dir, exist_ok=True)

In [22]:
# For the test split, write one WAV per recording folder containing only the
# audio inside the cleaned transcript's time ranges. Needs the
# `<id>_Transcript_Clean.csv` files produced by clean_transcripts(TEST_DIR).
extract_audio_from_timestamps(TEST_DIR, TEST_DESTINATION_DIR)

Extracting audio for 600_P
Extracting audio for 602_P
Extracting audio for 604_P
Extracting audio for 605_P
Extracting audio for 606_P
Extracting audio for 607_P
Extracting audio for 609_P
Extracting audio for 615_P
Extracting audio for 618_P
Extracting audio for 619_P
Extracting audio for 620_P
Extracting audio for 622_P
Extracting audio for 623_P
Extracting audio for 624_P
Extracting audio for 625_P
Extracting audio for 626_P
Extracting audio for 629_P
Extracting audio for 631_P
Extracting audio for 634_P
Extracting audio for 635_P
Extracting audio for 636_P
Extracting audio for 637_P
Extracting audio for 638_P
Extracting audio for 640_P
Extracting audio for 649_P
Extracting audio for 650_P
Extracting audio for 651_P
Extracting audio for 652_P
Extracting audio for 655_P
Extracting audio for 656_P
Extracting audio for 658_P
Extracting audio for 659_P
Extracting audio for 661_P
Extracting audio for 663_P
Extracting audio for 664_P
Extracting audio for 666_P
Extracting audio for 669_P
E