In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import librosa as lr
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import json
import glob
import random
from IPython.display import display, Audio
from sklearn.model_selection import train_test_split

In [None]:
# Show the installed TensorFlow version in the cell output.
print(tf.__version__)

In [None]:
# Rebuild the Keras tokenizer from the JSON payload stored in ../util/tokenizer.txt.
with open('../util/tokenizer.txt','r') as tokenizer_file:
    tokenizer_json = json.load(tokenizer_file)
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_json)

# Text-side settings.
VOCAB_SIZE = len(tokenizer.word_index)
TEXT_PADDING = 8

# Audio-side settings.
SAMPLING_RATE = 8000
WINDOW_SIZE = 1000 # in ms
SLIDE_STRIDE = 300 # in ms
NUM_COEFFS = 16

# Module-level accumulator that create_data() appends its examples to.
data = []

In [None]:
def get_transcript_paths(audio_paths):
    """Map each audio file path to the path of its conversation transcript.

    An audio path is expected to look like
    ``../data/<folder>/audio/<conversation_id>/<file>.wav``; the matching
    transcript is ``../data/<folder>/conversations/<conversation_id>.json``.

    Args:
        audio_paths: iterable of audio path strings using ``/`` separators.

    Returns:
        A list of transcript path strings, in the same order as ``audio_paths``.

    Raises:
        ValueError: if a path does not start with ``../data/`` or has fewer
            than three components after that prefix.
    """
    data_root = '../data/'
    transcript_paths = []
    for audio_path in audio_paths:
        # NOTE: str.strip('../data/') removes any of the characters
        # '.', '/', 'd', 'a', 't' from both ends rather than a prefix, which
        # corrupts folder names starting with those letters. Slice the prefix
        # off explicitly instead.
        if not audio_path.startswith(data_root):
            raise ValueError('audio path does not start with ' + data_root + ': ' + audio_path)
        parts = audio_path[len(data_root):].split('/')
        # parts == [<folder>, 'audio', <conversation_id>, <file>.wav]
        if len(parts) < 3:
            raise ValueError('unexpected audio path layout: ' + audio_path)
        folder_num = parts[0]
        conversation_id = parts[2]
        transcript_paths.append(data_root + folder_num + '/conversations/' + conversation_id + '.json')
    return transcript_paths

In [None]:
# Collect every .wav recording matching ../data/*/audio/*/*.wav.
audio_paths = glob.glob('../data/*/audio/*/*.wav')
# glob returns an empty list when nothing matches; indexing it unguarded
# would raise IndexError, so report the empty case explicitly instead.
if audio_paths:
    print(audio_paths[0])
else:
    print("No audio files matched ../data/*/audio/*/*.wav")
print(len(audio_paths))
# Derive the transcript path that belongs to each audio path (same order).
transcript_paths = get_transcript_paths(audio_paths)

In [None]:
def play_audio(clip,sr):
    """Show an inline audio player for ``clip`` using sample rate ``sr``."""
    player = Audio(clip, rate=sr)
    display(player)
    
def audio_to_mfcc(samples, n_mfcc=NUM_COEFFS, hop_length=512,n_fft=2048):
    """Compute MFCC features for an audio clip.

    Args:
        samples: audio samples passed to librosa as the signal ``y``.
        n_mfcc: number of coefficients to compute (defaults to NUM_COEFFS as
            it was when this function was defined).
        hop_length: hop length forwarded to librosa.
        n_fft: FFT window size forwarded to librosa.

    Returns:
        The transpose of the array returned by ``librosa.feature.mfcc``.
        The driver cell filters on shape ``(frames, NUM_COEFFS)``, so rows are
        presumably frames and columns coefficients.
    """
    # Pass the signal by keyword: recent librosa releases accept `y` as a
    # keyword-only argument, and the keyword form also works on older ones.
    mfcc = lr.feature.mfcc(
        y=samples,
        sr=SAMPLING_RATE,
        n_mfcc=n_mfcc,
        hop_length=hop_length,
        n_fft=n_fft,
    )
    return mfcc.T

def compute_windows(segment,agent_start_ms):
    """Slide a WINDOW_SIZE-ms window over a segment and encode its words.

    The first window ends at ``segment['start_ms'] + WINDOW_SIZE``. Each
    later window ends SLIDE_STRIDE ms after the previous one, clamped to
    ``agent_start_ms``, and the last window ends at or after that time.

    Args:
        segment: dict with ``start_ms``, ``transcript``, ``word_offsets_ms``
            and ``word_durations_ms`` entries.
        agent_start_ms: time (ms) at which the following segment starts.

    Returns:
        A list of ``(window_end_ms, padded_token_ids)`` tuples; an empty list
        when the segment has no word offsets.
    """
    word_offsets = segment['word_offsets_ms']
    num_words = len(word_offsets)
    if num_words == 0:
        return []

    tokens = segment['transcript'].split()
    segment_start_ms = segment['start_ms']
    word_durations = segment['word_durations_ms']

    windows = []
    first_word = 0       # first word that has not ended before the window start
    past_last_word = 0   # one past the last word starting before the window end
    window_end_ms = segment_start_ms + WINDOW_SIZE
    while True:
        window_start_ms = window_end_ms - WINDOW_SIZE
        # Drop words that finished before the window begins.
        while (first_word < num_words
               and segment_start_ms + word_offsets[first_word] + word_durations[first_word] < window_start_ms):
            first_word += 1
        # Take in words that start before the window ends.
        while (past_last_word < num_words
               and segment_start_ms + word_offsets[past_last_word] < window_end_ms):
            past_last_word += 1

        window_text = ' '.join(tokens[first_word:past_last_word])
        token_ids = tokenizer.texts_to_sequences([window_text])
        padded_ids = tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=TEXT_PADDING)
        windows.append((window_end_ms, padded_ids[0]))

        # Stop once a window reaching agent_start_ms has been emitted.
        if window_end_ms >= agent_start_ms:
            break
        window_end_ms = min(agent_start_ms, window_end_ms + SLIDE_STRIDE)
    return windows

def get_channel(audio, segments):
    """Pick the audio channel (0 or 1) with the larger norm during a segment.

    Only the first entry of ``segments`` is inspected: its first element is
    read as a dict with ``start_ms`` and ``duration_ms``.

    Args:
        audio: indexable whose items 0 and 1 are sliceable sample sequences.
        segments: sequence of tuples whose first element is a segment dict.

    Returns:
        The index of the channel with the larger L2 norm over the segment's
        time span, or -1 when ``segments`` is empty.
    """
    if len(segments) == 0:
        return -1

    reference = segments[0][0]
    begin_ms = reference['start_ms']
    finish_ms = reference['start_ms'] + reference['duration_ms']
    first_sample = lr.core.time_to_samples(begin_ms/1000, sr=SAMPLING_RATE)
    last_sample = lr.core.time_to_samples(finish_ms/1000, sr=SAMPLING_RATE)

    channel_norms = np.array(
        [np.linalg.norm(audio[channel][first_sample:last_sample]) for channel in (0, 1)]
    )
    return np.argmax(channel_norms)
    
def get_clip(audio,window_end_ms):
    """Return the WINDOW_SIZE-ms slice of ``audio`` that ends at ``window_end_ms``.

    Millisecond bounds are converted to sample indices at SAMPLING_RATE.
    """
    bounds_ms = (window_end_ms - WINDOW_SIZE, window_end_ms)
    first_sample, last_sample = (
        lr.core.time_to_samples(ms/1000, sr=SAMPLING_RATE) for ms in bounds_ms
    )
    return audio[first_sample:last_sample]

def get_training_segments(segments):
    """Split consecutive segment pairs into positive and negative examples.

    A segment is considered only if it lasts at least WINDOW_SIZE ms, its
    ``caller_role`` is not 'AG', and it is not the last segment. It becomes:

    * negative when the next segment's ``caller_role`` is 'CL';
    * positive when the next segment is not 'CL' and starts strictly after
      the current one ends (the next segment is then skipped either way).

    Args:
        segments: list of segment dicts; the caller passes them ordered by
            ``start_ms``.

    Returns:
        ``(positive, negative)``: two lists of
        ``(segment, next_segment, index_of_segment)`` tuples.
    """
    positive, negative = [], []
    last_index = len(segments) - 1
    i = 0
    while i <= last_index:
        current = segments[i]
        if current['duration_ms'] < WINDOW_SIZE or current['caller_role'] == 'AG' or i == last_index:
            i += 1
            continue

        following = segments[i+1]
        if following['caller_role'] == 'CL':
            negative.append((current, following, i))
            i += 1
            continue

        # Gap between the end of this segment and the start of the next one.
        gap_ms = following['start_ms'] - (current['start_ms'] + current['duration_ms'])
        if gap_ms > 0:
            positive.append((current, following, i))
        i += 2
    return positive, negative

def get_label(window_end_ms, agent_start_ms, tolerance_ms=100):
    """Label a window by whether it ends close enough to ``agent_start_ms``.

    Args:
        window_end_ms: end time of the window, in ms.
        agent_start_ms: start time of the following segment, in ms.
        tolerance_ms: how many ms before ``agent_start_ms`` a window may end
            and still be labelled 1. Defaults to 100, the value previously
            hard-coded here.

    Returns:
        1 if ``window_end_ms >= agent_start_ms - tolerance_ms``, else 0.
    """
    return int(window_end_ms >= agent_start_ms - tolerance_ms)

In [None]:
def create_data():
    """Build training examples for every audio/transcript pair.

    Iterates over the module-level ``audio_paths`` and ``transcript_paths``
    in parallel. For each file it loads the audio at SAMPLING_RATE without
    down-mixing, splits the transcript's segments (ordered by ``start_ms``)
    into positive and negative pairs, and appends one
    ``[mfcc, encoded_transcript, label, clip_address]`` entry per window to
    the module-level ``data`` list.

    Windows from positive pairs are labelled with ``get_label``; windows from
    negative pairs are always labelled 0. If anything fails for a file, the
    error is printed and processing continues with the next file; entries
    appended before the failure are kept.
    """
    for count, (audio_path, transcript_path) in enumerate(zip(audio_paths, transcript_paths), start=1):
        print("Processed ", count , " files")
        try:
            with open(transcript_path,'r') as conv_json, open(audio_path,'rb') as conv_audio:
                audio, _ = lr.load(conv_audio,sr=SAMPLING_RATE, mono=False)
                transcript = json.loads(conv_json.read())
            ordered_segments = sorted(transcript['segments'],key = lambda x:x['start_ms'])
            pos_segments, neg_segments = get_training_segments(ordered_segments)
            # Prefer a positive pair for channel detection; fall back to a negative one.
            caller_channel = get_channel(audio,pos_segments) if len(pos_segments) > 0 else get_channel(audio, neg_segments)

            # Positive pairs first, then negative pairs; only the labelling differs.
            for is_positive, pairs in ((True, pos_segments), (False, neg_segments)):
                for caller_segment, next_segment, segment_index in pairs:
                    start_ms = caller_segment['start_ms']
                    next_start_ms = next_segment['start_ms']
                    windows = compute_windows(caller_segment, next_start_ms)
                    for window_end_ms, encoded_transcript in windows:
                        clip = get_clip(audio[caller_channel],window_end_ms)
                        mfcc = audio_to_mfcc(clip)
                        label = get_label(window_end_ms, next_start_ms) if is_positive else 0
                        # Identifies the window: file, segment index, offset of the window end within the segment.
                        clip_address = audio_path + '_' + str(segment_index) + '_' + str(window_end_ms-start_ms)
                        data.append([mfcc,encoded_transcript,label,clip_address])
        except Exception as err:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still stop the run, and say which
            # file failed and why.
            print("problem loading a file", audio_path, repr(err))
            continue

In [None]:
# Each configuration is (window size in ms, transcript padding length,
# expected number of MFCC frames per window).
for w,t,s in [(4000,32,63)]:
    # create_data() and its helpers read these module-level names at call
    # time, so they are rebound for every configuration.
    data = []
    WINDOW_SIZE = w
    TEXT_PADDING = t
    create_data()

    # Keep only windows whose MFCC matrix has the expected shape.
    data = list(filter(lambda x: x[0].shape==(s,NUM_COEFFS) ,data))
    pos_data = list(filter(lambda x: x[2] == 1,data))
    neg_data = list(filter(lambda x: x[2] == 0, data))
    # Down-sample negatives to at most twice the number of positives.
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of negatives available.
    neg_data = random.sample(neg_data, min(len(neg_data), 2*len(pos_data)))
    data_updated = pos_data + neg_data
    random.shuffle(data_updated)

    # Split the shuffled examples into parallel arrays.
    X_mfccs = np.array([example[0] for example in data_updated])
    print(X_mfccs.shape)
    X_transcripts = np.array([example[1] for example in data_updated])
    print(X_transcripts.shape)
    Y = np.array([example[2] for example in data_updated])
    print(Y.shape)
    Clips = np.array([example[3] for example in data_updated])

    # Name the outputs after the window size so that several configurations
    # do not overwrite each other (identical to the old names for w == 4000).
    suffix = '_error_' + str(w) + '.npy'
    np.save('../util/X_mfccs' + suffix,X_mfccs)
    np.save('../util/X_clips' + suffix,Clips)
    np.save('../util/X_transcripts' + suffix,X_transcripts)
    np.save('../util/labels' + suffix,Y)
    #Save the Dataset
#     mfcc_name = '../util/X_mfccs_' + str(w) + '.npy'
#     clips_name = '../util/clips_' + str(w) + '.npy'
#     transcript_name = '../util/X_transcripts_' + str(w) + '.npy'
#     label_name = '../util/labels_' + str(w) + '.npy'
#     np.save(mfcc_name,X_mfccs)
#     np.save(clips_name,Clips)
#     np.save(transcript_name,X_transcripts)
#     np.save(label_name,Y)

In [None]:
# CREATE TRAIN DEV TEST SPLITS
# X_train_mfcc, X_test_mfcc, X_train_transcript, X_test_transcript, y_train, y_test = train_test_split(X_mfccs,X_transcripts,Y,test_size=0.2)
# print(X_train_mfcc.shape)
# print(X_train_transcript.shape)
# print(y_train.shape)

In [None]:
#NORMALIZE
# mu = np.mean(X_train_mfcc,axis=0)
# std_dev = np.std(X_train_mfcc,axis=0)
# np.save('mu.npy',mu)
# np.save('std_dev.npy',std_dev)
# X_train_mfcc -= mu
# X_test_mfcc -= mu
# X_train_mfcc /= std_dev
# X_test_mfcc /= std_dev
# print(X_train_mfcc.shape)
# X_train_mfcc = X_train_mfcc[...,np.newaxis]
# X_test_mfcc = X_test_mfcc[...,np.newaxis]