## Imports

In [2]:
import numpy as np
import math
import matplotlib.pyplot as plt
from torch import nn
import torch.nn.functional as F


# For audio preprocessing
import torchaudio
import IPython
import torch
from librosa.display import specshow

# For sentence embeddings
from fse import CSplitIndexedList
from gensim.models import KeyedVectors
from fse.models import SIF

In [3]:
!mkdir tmp
!mkdir data

# Download fast text word embeddings
!wget -O tmp/wordvectors.zip 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'
!unzip tmp/wordvectors.zip -d tmp/

## Dataset

In [3]:
# Each split is stored under ./data and downloaded on first use.
train_data = torchaudio.datasets.LIBRISPEECH(
    'data', url="train-clean-100", folder_in_archive='LibriSpeech', download=True
)
dev_data = torchaudio.datasets.LIBRISPEECH(
    'data', url="dev-clean", folder_in_archive='LibriSpeech', download=True
)
test_data = torchaudio.datasets.LIBRISPEECH(
    'data', url="test-clean", folder_in_archive='LibriSpeech', download=True
)

In [4]:
# Check a sample data point in the dev set

i = 5 # Random index

# Fetch the item once; fields 0, 1 and 2 are the waveform tensor, the sample
# rate and the transcript respectively.
sample = dev_data[i]
waveform, sample_rate, utterance = sample[0], sample[1], sample[2]

print('Utterance - ', utterance.lower())
print('Sample rate - ', sample_rate)
print('Audio:')
IPython.display.Audio(waveform.numpy(), rate=sample_rate)

Utterance -  continue in this manner till the border is completed arranging the sippets a pale and a dark one alternately
Sample rate -  16000
Audio:


## Preprocessing audio

In [12]:
# One FFT window covers 20 milliseconds of 16 kHz audio (320 samples).
window_ms, sample_rate_hz = 20, 16000
fft_length = int(window_ms * sample_rate_hz / 1000)

spectrogram_preprocess = torchaudio.transforms.Spectrogram(
    n_fft=fft_length,
    normalized=True,
)

In [29]:
i = 5 # Random index

# Drop the leading size-1 axis so the spectrogram becomes a 2-D numpy array.
waveform_i = dev_data[i][0]
sample_spec = spectrogram_preprocess(waveform_i).squeeze(0).numpy()

# Optional visualisation, left disabled:
# plt.figure(figsize=(12, 8))
# specshow(sample_spec, sr=1600 ,x_axis = 'ms', y_axis='hz',hop_length=int(0.001 * 10 * 16000))
# plt.colorbar(format='%+2.0f dB')
# plt.title('Linear-frequency power spectrogram')

sample_spec.shape

(161, 741)

## Sentence embeddings

In [30]:
# Use the transcripts of the first 50 dev utterances as the training corpus
data = [dev_data[idx][2] for idx in range(50)]

In [31]:
# Using crawl-300d-2M pretrained fasttext word embeddings

def split_func(string):
    """Lower-case `string` and split it on whitespace into a list of tokens."""
    lowered = string.lower()
    tokens = lowered.split()
    return tokens

# Pretrained word vectors unpacked into tmp/ by the download cell.
ft = KeyedVectors.load_word2vec_format('tmp/crawl-300d-2M.vec')

# Wrap the raw transcripts so each one is tokenised with split_func.
inp = CSplitIndexedList(data, custom_split=split_func)

# Fit the SIF sentence-embedding model on the wrapped transcripts.
model = SIF(ft, components=10)
model.train(inp)

(50, 907)

In [32]:
class search:
    """Nearest-sentence lookup on top of a trained sentence-embedding model.

    Args:
        model: trained model exposing `sv.similar_by_sentence`.
        raw_data: indexed sentence container exposing `.items`; its entries
            are what the results refer back to.
    """

    def __init__(self, model, raw_data):
        self.model = model
        self.s = raw_data

    def get_similar(self, query, k = 10):
        """Return up to `k` entries most similar to `query`.

        Args:
            query: free-text query; it is tokenised with a plain
                whitespace split (no lower-casing is applied here).
            k: maximum number of results to return.

        Returns:
            Whatever `similar_by_sentence` yields, truncated to `k` items.
        """
        # Use the model handed to the constructor. The previous version read
        # the notebook-global `model`, which later cells rebind to unrelated
        # models, silently breaking this lookup.
        hits = self.model.sv.similar_by_sentence(
            query.split(), model=self.model, indexable=self.s.items, topn=k)
        # topn asks the library for k hits (it otherwise caps at its own
        # default); the slice keeps the "at most k" contract regardless.
        return hits[:k]
    
se = search(model=model, raw_data=inp)

In [33]:
se.get_similar('flowers in jamaica', k=3)

[('IN JAMAICA IT FLOWERS ABOUT AUGUST OR SEPTEMBER FADING ABOUT THE END OF THE YEAR',
  21,
  0.4317067563533783),
 ('ILLUSTRATION GINGER', 8, 0.22672462463378906),
 ('MODE CUT UP THE ONION AND CARROT INTO SMALL RINGS AND PUT THEM INTO A STEWPAN WITH THE HERBS MUSHROOMS BAY LEAF CLOVES AND MACE ADD THE BUTTER AND SIMMER THE WHOLE VERY GENTLY OVER A SLOW FIRE UNTIL THE ONION IS QUITE TENDER',
  32,
  0.2258298099040985)]

### BERT Language Model to Extract Sentence Embeddings

In [None]:
%%bash
# Clean up and install libraries.
# (%%bash has to be the first line of the cell, so this comment lives below it.)
pip install torch torchvision
# Quoted: an unquoted > makes the shell redirect pip's output to a file named "=1.2.0".
pip install 'torch>=1.2.0'
pip install torchaudio
pip install transformers
pip install faiss
pip install faiss-gpu
# -f: do not fail when the files are absent (e.g. first run on a fresh machine).
rm -f /content/dev-clean.tar.gz
rm -f /content/test-clean.tar.gz
rm -rf /content/LibriSpeech/
rm -f dev-clean.txt
rm -f test-clean.txt
wget http://www.openslr.org/resources/12/dev-clean.tar.gz
tar -xzvf dev-clean.tar.gz
wget http://www.openslr.org/resources/12/test-clean.tar.gz
tar -xzvf test-clean.tar.gz

In [None]:
import datetime
import os
import random
import textwrap
import time
import numpy as np
import pandas as pd
import faiss
import torch
from transformers import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
class lm_embedding():
  """Turn transcripts into BERT sentence vectors and query them with FAISS.

  Args:
    datapath: root directory walked by `list_of_filenames`.
  """

  def __init__(self, datapath):
    self.datapath = datapath

  def check_cuda(self):
    """Return the CUDA device when one is available, otherwise the CPU device."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

  def list_of_filenames(self):
    """Collect the `.txt` files below `self.datapath`, grouped by split.

    Returns:
      `(test_files, dev_files)`: paths containing 'test-clean' and paths
      containing 'dev-clean' respectively. Other files are ignored.
    """
    test_files = []
    dev_files = []
    for root, _dirs, files in os.walk(self.datapath):
      for file in files:
        if not file.endswith(".txt"):
          continue
        fpath = os.path.join(root, file)
        if 'test-clean' in fpath:
          test_files.append(fpath)
        elif 'dev-clean' in fpath:
          dev_files.append(fpath)
    return test_files, dev_files

  def text_files_merge(self, list_filenames, fname):
    """Concatenate transcript files into `fname`, dropping the leading id.

    Each input line is split at its first space and only the remainder is
    written. Lines without a space (blank lines, a bare id) have nothing
    to write and are skipped instead of raising IndexError.

    Args:
      list_filenames: paths of the files to merge, in output order.
      fname: path of the merged file; it is overwritten.
    """
    with open(fname, 'w') as outfile:
      for name in list_filenames:
        with open(name) as infile:
          for line in infile:
            parts = line.split(" ", 1)
            if len(parts) == 2:
              outfile.write(parts[1])

  def format_time(self, elapsed):
    """Format a duration in seconds as `h:mm:ss`, rounded to whole seconds."""
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

  def text_to_embedding(self, tokenizer, model, in_text, device):
    """Embed one text as the first-token vector of hidden-state layer 12.

    Args:
      tokenizer: object whose `encode` returns a sequence of token ids.
      model: callable returning a sequence whose element 2 holds the
        per-layer hidden states (for BertModel this needs
        `output_hidden_states=True`). It is switched to eval mode.
      in_text: the text to embed.
      device: torch device the inputs are moved to.

    Returns:
      1-D numpy array taken from layer 12, batch item 0, token 0.
    """
    MAX_LEN = 64
    token_ids = tokenizer.encode(
        in_text, add_special_tokens=True, max_length=MAX_LEN)
    # Truncate / zero-pad at the end to exactly MAX_LEN ids. Done inline so
    # this method does not depend on Keras' pad_sequences.
    token_ids = list(token_ids)[:MAX_LEN]
    token_ids = token_ids + [0] * (MAX_LEN - len(token_ids))

    input_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    input_ids = input_ids.to(device)
    # Positions holding the padding id 0 are masked out.
    attn_mask = (input_ids > 0).long()

    model.eval()
    with torch.no_grad():
      outputs = model(
          input_ids=input_ids, token_type_ids=None,attention_mask=attn_mask)

    # Index instead of unpacking three values: this reads the same element
    # from a plain tuple and from output objects that support integer
    # indexing but do not unpack into their values.
    encoded_layers = outputs[2]

    layer_i=12
    batch_i=0
    token_i=0

    vec = encoded_layers[layer_i][batch_i][token_i]
    return vec.detach().cpu().numpy()

  def bert_embeddings(self, dataset, tokenizer, model, device):
    """Embed every row's `transcript` and return the vectors as a list.

    Progress (elapsed time and estimated remaining time) is printed every
    1000 rows.

    Args:
      dataset: DataFrame with a `transcript` column.
      tokenizer, model, device: forwarded to `text_to_embedding`.

    Returns:
      List with one numpy vector per row, in row order.
    """
    t0 = time.time()
    embeddings = []
    num_transcripts = len(dataset.transcript)
    print("Generating Sentence embeddings for all {:,} transcripts...".format(
        num_transcripts))
    for row_num, (_index, row) in enumerate(dataset.iterrows()):
      if row_num and row_num % 1000 == 0:
        elapsed_sec = time.time() - t0
        sec_per_row = elapsed_sec / row_num
        remaining = self.format_time(sec_per_row * (num_transcripts - row_num))
        print(' comment {:<7,} of {:<7}.  Elapsed: {:}. Remaining: {:}'.format(
            row_num,num_transcripts,self.format_time(elapsed_sec),remaining))
      embeddings.append(
          self.text_to_embedding(tokenizer, model, row.transcript, device))
    return embeddings

  def _build_index(self, vecs, dim):
    """Create a flat L2 index of width `dim`, add `vecs`, and return it."""
    cpu_index = faiss.IndexFlatL2(dim)
    # Builds without GPU support may not expose get_num_gpus at all.
    available = getattr(faiss, "get_num_gpus", lambda: 0)()
    n_gpu = min(1, available)
    print("Number of available GPU: {} using {}".format(available, n_gpu))
    if n_gpu > 0:
      co = faiss.GpuMultipleClonerOptions()
      co.shard = True
      index = faiss.index_cpu_to_all_gpus(cpu_index, co=co, ngpu=n_gpu)
    else:
      # No GPU: search on the CPU index instead of failing on the GPU calls.
      index = cpu_index
    print("Adding dataset to index")
    t0 = time.time()
    index.add(vecs)
    elapsed = time.time() - t0
    print('Building Index took %.2f seconds' %(elapsed))
    return index

  def _print_neighbours(self, dataset, distances, indices, wrapper):
    """Print rank, distance and wrapped transcript for each search hit."""
    for rank in range(indices.shape[1]):
      text = dataset.iloc[indices[0, rank]].transcript
      print('Transcript #{:,}'.format(rank))
      # Interpolate the value; it used to be passed as a second print
      # argument, so the raw "%2.f" directive was printed verbatim.
      print('L2 distance: %.2f' % distances[0, rank])
      print(wrapper.fill('"'+text+'"'))

  def similar_transcripts(self, dataset, vecs, idx_tran, ksimilar):
    """Print the `ksimilar` transcripts closest to row `idx_tran`.

    Args:
      dataset: DataFrame with a `transcript` column, aligned with `vecs`.
      vecs: 2-D array of embeddings, one row per transcript.
      idx_tran: row index of the query transcript.
      ksimilar: number of neighbours to print.
    """
    wrapper = textwrap.TextWrapper(width=80)
    index = self._build_index(vecs, vecs.shape[1])

    print("===Input transcript===")
    print(wrapper.fill(dataset.iloc[idx_tran].transcript))

    # reshape(1, -1) follows the embedding width instead of assuming 768.
    D, I = index.search(vecs[idx_tran].reshape(1, -1), k=ksimilar)
    print(D)
    print(I)
    self._print_neighbours(dataset, D, I, wrapper)

  def single_similar_transcripts(self, vecs, sembd, dataset, ksimilar):
    """Print the `ksimilar` transcripts closest to one query embedding.

    Args:
      vecs: 2-D array of embeddings, one row per transcript.
      sembd: 1-D query embedding with the same width as the rows of `vecs`.
      dataset: DataFrame with a `transcript` column, aligned with `vecs`.
      ksimilar: number of neighbours to print.
    """
    wrapper = textwrap.TextWrapper(width=80)
    index = self._build_index(vecs, sembd.shape[0])

    print("===Input transcript===")
    # The query is a free-standing embedding, so there is no transcript to echo.

    D, I = index.search(sembd.reshape(1, -1), k=ksimilar)
    self._print_neighbours(dataset, D, I, wrapper)

In [None]:
lm_emd = lm_embedding(datapath='/content/LibriSpeech')
test_files, dev_files = lm_emd.list_of_filenames()

# Merge the per-chapter transcript files, then read back the very same paths.
# The merged files used to be written relative to the working directory but
# read from /content/, which only matched when the two happened to coincide.
test_txt = 'test-clean.txt'
dev_txt = 'dev-clean.txt'
lm_emd.text_files_merge(list_filenames=test_files, fname=test_txt)
lm_emd.text_files_merge(list_filenames=dev_files, fname=dev_txt)
# A regex delimiter is only supported by pandas' python parser engine; naming
# it explicitly avoids the fallback warning.
dev_dataset=pd.read_csv(
    dev_txt, delimiter="\t+", header=None, names=["transcript"], engine="python")
test_dataset=pd.read_csv(
    test_txt, delimiter="\t+", header=None, names=["transcript"], engine="python")

# output_hidden_states=True is required: text_to_embedding reads the
# per-layer hidden states from the model output.
model = BertModel.from_pretrained(
    'bert-base-uncased', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)
device = lm_emd.check_cuda()
model.to(device)

# To check similar results on custom input search query
in_text = 'Slowly help Came to all of us'
tembd = lm_emd.text_to_embedding(tokenizer, model, in_text, device)
dev_embeddings = lm_emd.bert_embeddings(dev_dataset, tokenizer, model, device)
test_embeddings = lm_emd.bert_embeddings(test_dataset, tokenizer, model, device)
dev_vecs = np.stack(dev_embeddings)
test_vecs = np.stack(test_embeddings)
lm_emd.single_similar_transcripts(
    vecs=dev_vecs, sembd=tembd, dataset=dev_dataset, ksimilar=5)

output_dir = './content/'
# Create output directory if needed (os.mkdirs does not exist; makedirs does)
os.makedirs(output_dir, exist_ok=True)

# Use numpy to write out the matrix of embeddings
# Saving Embeddings: each split goes to its own file. The test file used to
# be written from the dev embeddings.
np.save(os.path.join(output_dir, 'dev_embeddings.npy'), dev_vecs)
np.save(os.path.join(output_dir, 'test_embeddings.npy'), test_vecs)

# idx_tran is the row index of the query transcript within the given dataset
lm_emd.similar_transcripts(dev_dataset, dev_vecs, idx_tran=51, ksimilar=5)
lm_emd.similar_transcripts(test_dataset, test_vecs, idx_tran=10, ksimilar=5)

### Output

```
 Generating Sentence embeddings for all 2,703 transcripts...
 comment 1,000   of 2703   .  Elapsed: 0:00:18. Remaining: 0:00:31
 comment 2,000   of 2703   .  Elapsed: 0:00:36. Remaining: 0:00:13
 Generating Sentence embeddings for all 2,620 transcripts...
 comment 1,000   of 2620   .  Elapsed: 0:00:18. Remaining: 0:00:30
 comment 2,000   of 2620   .  Elapsed: 0:00:36. Remaining: 0:00:11
 Number of available GPU: 1 using 1
 Adding dataset to index
 Building Index took 0.00 seconds
 ===Input transcript===
 Transcript #0
 L2 distance: %2.f 9.326187
 "GRADUALLY RELIEF CAME TO ALL OF US"
 Transcript #1
 L2 distance: %2.f 30.17334
 "TWO DAYS AFTERWARDS OUR NOCTURNAL ORGY BEGAN TO BE TALKED OF"
 Transcript #2
 L2 distance: %2.f 30.535217
 "DEAR GOOD GOD HELP US NOW SHE PRAYED"
 Transcript #3
 L2 distance: %2.f 30.543
 "MEANWHILE TWO CHILDREN CAME TO BLESS THE HAPPY UNION OF NEWSOME AND HIS LOVELY INDIAN WIFE"
 Transcript #4
 L2 distance: %2.f 32.50032
 "WE REMAINED SEVERAL MONTHS BUT SOON WE WERE ON THE TRAMP AGAIN"

 Number of available GPU: 1 using 1
 Adding dataset to index
 Building Index took 0.00 seconds
 ===Input transcript===
 AFTER MY DEATH A PINK MARBLE STATUE OF ME WILL BE SET UP IN THE GRAND COURT WITH
 THE STATUES OF THE OTHER KINGS AND QUEENS WHO HAVE RULED THIS LAND AND ALL THE
 PINKIES IN AGES TO COME WILL THEN HONOR ME AS HAVING BEEN A JUST AND UPRIGHT
 QUEEN THAT IS MY REWARD
 Transcript #0
 L2 distance: %2.f 3.0517578e-05
 "AFTER MY DEATH A PINK MARBLE STATUE OF ME WILL BE SET UP IN THE GRAND COURT
 WITH THE STATUES OF THE OTHER KINGS AND QUEENS WHO HAVE RULED THIS LAND AND ALL
 THE PINKIES IN AGES TO COME WILL THEN HONOR ME AS HAVING BEEN A JUST AND UPRIGHT
 QUEEN THAT IS MY REWARD"
 Transcript #1
 L2 distance: %2.f 42.522858
 "I THREW MYSELF AT HIS FEET TO ASSURE HIM OF MY GRATITUDE AND EMBRACED HIM
 CALLING HIM MY FATHER"
 Transcript #2
 L2 distance: %2.f 44.142593
 "A MISFORTUNE OF BIRTH PLACED ME HERE AND I CANNOT ESCAPE MY FATE"
 Transcript #3
 L2 distance: %2.f 44.837875
 "TO BE WELCOMED INTO THE CELESTIAL RANKS OF THE HEROIC TO RISE TO THE IMMORTAL
 GODS TO THE INEFFABLE POWERS ONWARD UPWARD EVER THROUGH AGES AND THROUGH
 ETERNITIES TILL I FIND MY HOME AT LAST AND VANISH IN THE GLORY OF THE NAMELESS
 AND THE ABSOLUTE ONE"
 Transcript #4
 L2 distance: %2.f 45.176437
 "IF I LIVED AS LUXURIOUSLY AS MY PEOPLE DO AND HAD SERVANTS AND COSTLY GOWNS THE
 GOOD PINKIES WOULD SAY THAT THEIR QUEEN HAD MORE THAN THEY THEMSELVES AND IT
 WOULD BE TRUE"

 Number of available GPU: 1 using 1
 Adding dataset to index
 Building Index took 0.00 seconds
 ===Input transcript===
 I AM NOT DEPRECIATING IT WHEN I SAY THAT IN THESE TIMES IT IS NOT RARE
 Transcript #0
 L2 distance: %2.f 3.0517578e-05
 "I AM NOT DEPRECIATING IT WHEN I SAY THAT IN THESE TIMES IT IS NOT RARE"
 Transcript #1
 L2 distance: %2.f 24.789795
 "BUT I DO NOT THINK SUCH AN INFERENCE IS WARRANTED"
 Transcript #2
 L2 distance: %2.f 24.889893
 "IT IS HARDLY NECESSARY TO SAY MORE OF THEM HERE"
 Transcript #3
 L2 distance: %2.f 27.090485
 "AS I SPOKE I MADE HIM A GRACIOUS BOW AND I THINK I SHOWED HIM BY MY MODE OF
 ADDRESS THAT I DID NOT BEAR ANY GRUDGE AS TO MY INDIVIDUAL SELF"
 Transcript #4
 L2 distance: %2.f 27.540665
 "I REMAINED THERE ALONE FOR MANY HOURS BUT I MUST ACKNOWLEDGE THAT BEFORE I LEFT
 THE CHAMBERS I HAD GRADUALLY BROUGHT MYSELF TO LOOK AT THE MATTER IN ANOTHER
 LIGHT"
 ```

## Model architecture

In [145]:
class AudioToSentenceEmb(nn.Module):
    """Bidirectional GRU encoder mapping a feature sequence to one vector.

    Args:
        input_dim: number of features per time step.
        hidden_dim: GRU hidden size, also the width of the two hidden
            fully connected layers.
        output_dim: size of the returned embedding.
        num_layers: number of stacked GRU layers.
        audio_preprocessing: transform stored as `self.preprocess` for
            callers to apply to raw audio; `forward` does not call it.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, audio_preprocessing):

        super(AudioToSentenceEmb, self).__init__()

        self.preprocess = audio_preprocessing

        # nn.GRU applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer, so disable it there.
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers=num_layers,
                          batch_first=True,
                          dropout=0.5 if num_layers > 1 else 0.0,
                          bidirectional=True)

        self.fc1 = nn.Linear(hidden_dim, hidden_dim)

        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        self.final = nn.Linear(hidden_dim, output_dim)

    def forward(self, inp, hidden = None):
        """Encode a batch of sequences.

        Args:
            inp: tensor laid out (batch, time, input_dim), since the GRU is
                batch_first.
            hidden: optional initial GRU hidden state.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        _, hidden_states = self.gru(inp, hidden)

        # The last two entries are the final states of the two directions of
        # the top layer; they are summed into a single (batch, hidden_dim)
        # summary.
        final_state = hidden_states[-2] + hidden_states[-1]

        # Non-linearities between the dense layers: without them the three
        # Linear layers collapse into a single linear map.
        x = torch.relu(self.fc1(final_state))
        x = torch.relu(self.fc2(x))
        return self.final(x)

In [146]:
# The model's per-frame input width is n_fft // 2 + 1 (161 for fft_length = 320).
inp_dim = fft_length // 2 + 1
out_dim = 300
hid_dim = 512

model = AudioToSentenceEmb(
    inp_dim,
    hid_dim,
    out_dim,
    num_layers=2,
    audio_preprocessing=spectrogram_preprocess,
)

In [152]:
# 5 Random samples

# Waveforms (field 0) of five arbitrarily chosen dev utterances.
s1, s2, s3, s4, s5 = (dev_data[k][0] for k in (1, 15, 43, 67, 32))
l = [s1, s2, s3, s4, s5]

# Run each waveform through the preprocessing transform stored on the model.
x = [model.preprocess(s) for s in l]

In [153]:
# Checking if the forward function is working properly

for s in x:
    # Use the loop variable: the previous `a` was never defined in this
    # notebook, so the cell only ran with leftover kernel state.
    # The GRU is batch_first and expects features last, so swap the last two
    # axes of each preprocessed sample before the forward pass.
    print(model(s.permute(0,2,1)).shape)

torch.Size([1, 300])
torch.Size([1, 300])
torch.Size([1, 300])
torch.Size([1, 300])
torch.Size([1, 300])
