In [25]:
import numpy as np
import librosa as lr
import matplotlib.pyplot as plt
import pandas as pd
from scipy.io import wavfile

import os

In [51]:
# Source directories of the train and dev data splits.
# Both are relative paths, so they resolve against the kernel's working directory.
TRAIN_DIR = "../../raw-dataset/data_splits/train"
DEV_DIR = "../../raw-dataset/data_splits/dev"

# Cleaning Transcripts

Some transcripts contain anomalies at the end, where a row's start time is less than the previous row's end time. The code below drops those rows and saves the cleaned transcript to a new file. Since those anomalies are at the end, we can safely drop them.


In [52]:
def clean_transcripts(dir_name):
    """Write a cleaned copy of every transcript found under ``dir_name``.

    Each sub-folder named ``<prefix>_<suffix>`` must contain
    ``<prefix>_Transcript.csv`` with ``Start_Time`` and ``End_Time`` columns.
    A row is an anomaly when its start time is earlier than the end time of
    the most recent row that was *kept*. Comparing against the last kept row
    (rather than the immediately preceding one) ensures that a whole run of
    out-of-order rows is removed, not just the first row of the run.

    The cleaned table is saved as ``<prefix>_Transcript_Clean.csv`` in the
    same folder; the original CSV is not modified.

    Args:
        dir_name: Directory containing one sub-folder per recording.
    """
    for folder in sorted(os.listdir(dir_name)):
        folder_path = os.path.join(dir_name, folder)
        # Stray files sitting next to the recording folders are not transcripts.
        if not os.path.isdir(folder_path):
            continue

        print(f"Cleaning {folder}")

        # Files inside the folder are named after the part before the first "_".
        file_prefix = folder.split("_")[0]
        transcript = pd.read_csv(
            os.path.join(folder_path, f"{file_prefix}_Transcript.csv")
        )

        start_time = transcript["Start_Time"].values
        end_time = transcript["End_Time"].values

        anomalies = []
        last_kept = 0  # the first row has no predecessor, so it is always kept
        for i in range(1, len(start_time)):
            if start_time[i] < end_time[last_kept]:
                print(f"Anamoly found at index {i}")
                anomalies.append(i)
            else:
                last_kept = i

        # Drop by position so the result does not depend on the index labels.
        cleaned = transcript.drop(transcript.index[anomalies])

        # save the cleaned transcript
        cleaned.to_csv(
            os.path.join(folder_path, f"{file_prefix}_Transcript_Clean.csv"),
            index=False,
        )
        print("-----")

In [None]:
# Clean the dev-split transcripts; a *_Transcript_Clean.csv is written next to each original.
clean_transcripts(DEV_DIR)

# Extracting Audio Based on Transcripts

In [56]:
# Output directories that receive the extracted audio for each split
# (relative paths, resolved against the kernel's working directory).
TRAIN_DESTINATION_DIR = "../../raw-dataset/extracted_audio/train"
DEV_DESTINATION_DIR = "../../raw-dataset/extracted_audio/dev"

In [54]:
def extract_audio_from_timestamps(src_dir, dest_dir):
    """Keep only the transcribed segments of each recording and save them.

    For every sub-folder ``<prefix>_<suffix>`` of ``src_dir`` this reads
    ``<prefix>_Transcript_Clean.csv`` and ``<prefix>_AUDIO.wav``, slices the
    audio between each row's ``Start_Time`` and ``End_Time`` (multiplied by
    the sample rate and truncated to an integer sample index), concatenates
    the slices in transcript order and writes the result to
    ``dest_dir/<prefix>_AUDIO.wav``.

    Args:
        src_dir: Directory containing one sub-folder per recording.
        dest_dir: Directory that receives the extracted ``.wav`` files. It is
            created if it does not exist yet.
    """
    os.makedirs(dest_dir, exist_ok=True)

    for folder in sorted(os.listdir(src_dir)):
        folder_path = os.path.join(src_dir, folder)
        # Stray files sitting next to the recording folders hold no audio.
        if not os.path.isdir(folder_path):
            continue

        print(f"Extracting audio for {folder}")

        # Files inside the folder are named after the part before the first "_".
        file_prefix = folder.split("_")[0]
        transcript = pd.read_csv(
            os.path.join(folder_path, f"{file_prefix}_Transcript_Clean.csv")
        )
        # NOTE(review): loaded with librosa's default arguments, so the sample
        # rate / channel layout are whatever those defaults produce - confirm
        # this is intended rather than the file's native rate.
        y, sr = lr.load(os.path.join(folder_path, f"{file_prefix}_AUDIO.wav"))

        start_time = transcript["Start_Time"].values
        end_time = transcript["End_Time"].values

        # Slice as array views and join once, instead of boxing every sample
        # into a Python list.
        segments = [
            y[int(start * sr):int(end * sr)]
            for start, end in zip(start_time, end_time)
        ]
        # np.concatenate rejects an empty list; fall back to an empty slice of
        # the same dtype when the transcript has no rows.
        audio_chunks = np.concatenate(segments) if segments else y[:0]

        # save the audio chunks
        wavfile.write(os.path.join(dest_dir, f"{file_prefix}_AUDIO.wav"), sr, audio_chunks)

In [57]:
# Extract the transcribed audio segments of the dev split into DEV_DESTINATION_DIR.
extract_audio_from_timestamps(DEV_DIR, DEV_DESTINATION_DIR)

Extracting audio for 300_P
Extracting audio for 301_P
Extracting audio for 306_P
Extracting audio for 317_P
Extracting audio for 320_P
Extracting audio for 321_P
Extracting audio for 331_P
Extracting audio for 334_P
Extracting audio for 336_P
Extracting audio for 343_P
Extracting audio for 344_P
Extracting audio for 347_P
Extracting audio for 350_P
Extracting audio for 365_P
Extracting audio for 371_P
Extracting audio for 373_P
Extracting audio for 374_P
Extracting audio for 381_P
Extracting audio for 382_P
Extracting audio for 388_P
Extracting audio for 393_P
Extracting audio for 401_P
Extracting audio for 402_P
Extracting audio for 408_P
Extracting audio for 412_P
Extracting audio for 415_P
Extracting audio for 423_P
Extracting audio for 425_P
Extracting audio for 431_P
Extracting audio for 433_P
Extracting audio for 435_P
Extracting audio for 437_P
Extracting audio for 441_P
Extracting audio for 442_P
Extracting audio for 448_P
Extracting audio for 451_P
Extracting audio for 454_P
E