## CV Hotword Similarity 5b

In [2]:
from InstructorEmbedding import INSTRUCTOR

model = INSTRUCTOR('hkunlp/instructor-large')

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [3]:
import pandas as pd
import os

cv_dev_metadata = pd.read_csv(os.path.join('..', 'asr-train', 'cv-valid-dev.csv'))
cv_dev_metadata['finetuned_text'] = cv_dev_metadata['finetuned_text'].astype(str)

cv_dev_metadata.head(5)

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,generated_text,finetuned_text
0,cv-valid-dev/sample-000000.mp3,be careful with your prognostications said the...,1,0,,,,,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...
1,cv-valid-dev/sample-000001.mp3,then why should they be surprised when they se...,2,0,,,,,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...
2,cv-valid-dev/sample-000002.mp3,a young arab also loaded down with baggage ent...,2,0,,,,,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...
3,cv-valid-dev/sample-000003.mp3,i thought that everything i owned would be des...,3,0,,,,,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED,I THOUGHT THAT EVERYTHING I OWNED WOULD BE DES...
4,cv-valid-dev/sample-000004.mp3,he moved about invisible but everyone could he...,1,0,fourties,female,england,,HE MOVED ABOUT INVISIBLE BUT EVERY ONE COULD H...,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HE...


Encode the hotword and text to embeddings, and use cosine similarity to generate a similarity score between phrase and sentence

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

hotword_list = ["DESTROY", "BE CAREFUL", "STRANGER"]

sentences_a = [['Represent the sentence to match: ', s] for s in cv_dev_metadata["finetuned_text"]]
sentences_b = [['Represent the phrase to find: ', hotword] for hotword in hotword_list]
embeddings_a = model.encode(sentences_a)
embeddings_b = model.encode(sentences_b)
similarities = cosine_similarity(embeddings_a,embeddings_b)

print(similarities)
print(similarities[0])

[[0.7540092  0.8992473  0.8762126 ]
 [0.7582269  0.8101653  0.8060143 ]
 [0.768912   0.7897223  0.7998812 ]
 ...
 [0.7517662  0.7955159  0.7752742 ]
 [0.7405091  0.7702166  0.7748226 ]
 [0.7662016  0.78116965 0.7726906 ]]
[0.7540092 0.8992473 0.8762126]


From our dataset, we know from cv-hotword-5a that there are some samples which have phrases exactly matching the hotwords. These samples can be labelled as true in similarity, and we can use these labelled samples to provide a good estimate on the minimum similarity score for a particular sample to be considered similar.

In [5]:
min_similarity_score = {}

for idx, hotword in enumerate(hotword_list):
    cv_dev_exactmatch_generated = cv_dev_metadata[cv_dev_metadata['finetuned_text'].str.contains(hotword, na=False)]
    similarity_scores = []
    for row_idx, row in enumerate(cv_dev_exactmatch_generated.index):
        similarity_scores.append(similarities[row][idx])
    
    min_similarity_score[hotword] = min(similarity_scores)

print(min_similarity_score)

{'DESTROY': 0.8544488, 'BE CAREFUL': 0.8992473, 'STRANGER': 0.82132894}


We iterate through all the similarity scores and find all entries that are equal to or greater than the similarity score of each exact match sample.

In [6]:
from IPython.display import display, HTML

boolean_list = []

for similarity in similarities:
    boolean_list.append(any([hotword_detect >= min_similarity_score[hotword_list[idx]] for idx, hotword_detect in enumerate(similarity)]))

cv_dev_metadata["similarity"] = boolean_list

print("Number of similar entries: {}".format(len(cv_dev_metadata[cv_dev_metadata["similarity"] == True])))
display(HTML(cv_dev_metadata[cv_dev_metadata["similarity"] == True][["generated_text", "similarity"]].head(10).to_html()))

Number of similar entries: 101


Unnamed: 0,generated_text,similarity
0,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE STRANGER,True
3,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED,True
27,HE HAD SEEN THIS SHOOTING STAR AND WAS PERSUADED THAT A METEORITE LAY SOMEWHERE NEAR BY,True
89,THE STRANGER SEEMED SATISFIED IT THE ANSWER,True
102,THEY WERE IN AN IMMENSE SETTING SURROUNDED BY THOUSANDS OF PEOPLE SPEAKING A STRANGE LANGUAGE,True
161,THE MAN LOOKED AT THE ANGEL IN SURPRISE,True
202,HOW STRANGE AFRICA IS THOUGHT THE BOY,True
203,HE HAD SEEN THE SHOOTING STAR AND WAS PERSUADED THAT A METEORITE LAY SOMEWHERE NEAR BY,True
231,HE FELT UNEASY AT THE MAN'S PRESENCE,True
261,HE DIDN'T KNOW THE MAN YET BUT HIS PRACTISED EYE WOULD RECOGNIZE HIM WHEN HE APPEARED,True


In [7]:
cv_dev_metadata.to_csv("cv-valid-dev.csv", index=False)