# Imports

In [2]:
import json
import os
import logging

from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt 
from transformers import T5Tokenizer
import numpy as np
import seaborn as sns

# Functions

In [3]:
def plot_scores_histogram(data, dataset=None, bins=10, figsize=(10,8)):
    """Draw a histogram of confidence scores and return the figure.

    Args:
        data: Sequence of numeric scores to bin.
        dataset: Optional dataset name appended to the title. When omitted,
            the title carries no dataset suffix.
        bins: Number of histogram bins (forwarded to ``Axes.hist``).
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        The created matplotlib figure. It is left open, so the caller is
        responsible for saving and closing it.
    """
    fig, ax = plt.subplots(1, figsize=figsize)
    ax.hist(data, bins=bins, edgecolor='black')
    ax.set_xlabel('Value')
    ax.set_ylabel('Counts')
    # Build the title so that a missing dataset name does not render "for None".
    title = 'Confidence Score Histogram'
    if dataset is not None:
        title = f'{title} for {dataset}'
    ax.set_title(title)
    return fig

def check_if_word_in_vocab(words, vocab, tokenizer):
    """Tokenize each word and flag every produced token by vocab membership.

    Args:
        words: Iterable of strings; each one is passed to ``tokenizer.tokenize``.
        vocab: Mapping whose keys are the known tokens.
        tokenizer: Object exposing ``tokenize(str) -> list[str]``.

    Returns:
        A dict with two parallel lists: ``'tokens'`` holds all tokens in the
        order they were produced, and ``'in_vocab'`` holds 1 when the
        lower-cased token is a key of ``vocab`` and 0 otherwise.
    """
    # Flatten the per-word token lists into one sequence, preserving order.
    pieces = [piece for word in words for piece in tokenizer.tokenize(word)]
    # Membership is tested on the lower-cased token, exactly one flag per token.
    flags = [int(piece.lower() in vocab.keys()) for piece in pieces]
    return {'tokens': pieces, 'in_vocab': flags}

In [4]:
class EasyData:
    """Load a JSON dataset and flatten it into parallel word-level lists.

    The JSON file is expected to hold a list of datapoints. Each datapoint is
    read through its ``'truth'`` entry (a space-separated string) and its
    ``'asr'`` entry (an iterable of ``(word, score, right)`` triples).

    Attributes populated on construction:
        path: The path the data was loaded from.
        data_list: The decoded JSON content.
        num_samples: ``len(data_list)``.
        all_gt_words: Lower-cased words from every ``'truth'`` string.
        all_words: Lower-cased first element of every ``'asr'`` triple.
        all_scores: Second element of every ``'asr'`` triple.
        all_rights: Third element of every ``'asr'`` triple.
    """

    def __init__(self, data_path: str):
        """Load ``data_path`` and immediately enroll its content into the lists."""
        self.path = data_path
        self.data_list = self._load_json(data_path)
        self.num_samples = len(self.data_list)

        self.all_gt_words = []
        self.all_words = []
        self.all_scores = []
        self.all_rights = []

        self._enroll_data()

    def _load_json(self, path: str) -> list:
        """Read and decode the JSON file at ``path``.

        Raises:
            AssertionError: If the decoded document is ``None`` (JSON ``null``).
        """
        # JSON text is UTF-8 by specification; do not rely on the platform
        # default encoding, which differs between operating systems.
        with open(path, encoding='utf-8') as fp:
            data = json.load(fp)
        # Use the local argument so the message works even if self.path is unset.
        assert data is not None, f"Somethings went wrong when trying to load: {path}"
        return data

    def _enroll_data(self):
        """Append every datapoint's words, scores and flags to the flat lists."""
        print('enrolling all data into lists: all_gt_words, all_words, all_scores, all_rights)')
        for datapoint in tqdm(self.data_list):
            # Splitting on a single space mirrors the stored transcript format;
            # consecutive spaces would therefore yield empty-string words.
            self.all_gt_words.extend(word.lower() for word in datapoint['truth'].split(' '))
            for word, score, right in datapoint['asr']:
                self.all_words.append(word.lower())
                self.all_scores.append(score)
                self.all_rights.append(right)

# Tokenizer

## Load Tokenizer

In [5]:
# Tokenizer built from the "t5-small" pretrained checkpoint; used below both to
# split words into tokens and to obtain the vocabulary they are checked against.
t5tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


## Get Tokenizer's vocab

In [6]:
# Vocabulary mapping of the tokenizer; check_if_word_in_vocab only consults its keys.
vocab = t5tokenizer.get_vocab()

# Explore Data

In [7]:
# Human-readable dataset name -> JSON path (relative to the notebook directory).
# The names are used verbatim in figure titles, so keep their casing consistent.
datasets_dict = {
    # Default
    'Default Train Clean': '../data/default/train_clean.json',
    'Default Train Other': '../data/default/train_other.json',
    'Default Dev Clean': '../data/default/dev_clean.json',
    'Default Dev Other': '../data/default/dev_other.json',
    'Default Test Clean': '../data/default/test_clean.json',
    'Default Test Other': '../data/default/test_other.json',
    # Video
    'Video Train Clean': '../data/video/train_clean.json',
    'Video Train Other': '../data/video/train_other.json',
    'Video Dev Clean': '../data/video/dev_clean.json',
    'Video Dev Other': '../data/video/dev_other.json',
    'Video Test Clean': '../data/video/test_clean.json',
    'Video Test Other': '../data/video/test_other.json',
}

In [8]:
# Root directory under which the per-dataset figures are written.
output_path = '../data/data_exploring'

# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(output_path, exist_ok=True)

In [9]:
# For each dataset: save a confidence-score histogram and a bar chart comparing
# how many tokens of the ASR words vs. the ground-truth words are in the T5 vocab.
for ds_name, ds_path in datasets_dict.items():

    # Make the output dir. The slicing turns '../data/<group>/<name>.json'
    # into '<group>/<name>/' (text after the last 'data', extension and leading
    # slash dropped), so it relies on the paths having exactly that form.
    curr_fold_name =  ds_path.split('data')[-1].split('.')[0][1:] +'/'
    curr_save_path = os.path.join(output_path, curr_fold_name)
    os.makedirs(curr_save_path, exist_ok=True)

    # Enroll all words
    data_explorer = EasyData(data_path=ds_path)

    # Confidence histogram: save and close the returned figure explicitly
    # instead of relying on pyplot's implicit "current figure".
    scores_fig = plot_scores_histogram(data_explorer.all_scores, dataset=ds_name, bins=20)
    scores_fig.savefig(os.path.join(curr_save_path, 'confidence_histogram.png'))
    plt.close(scores_fig)

    # Token counts per in_vocab flag for the ASR (input) words.
    # value_counts() names its result differently across pandas versions
    # (pandas >= 2.0 yields columns ['in_vocab', 'count'] after reset_index()),
    # so the column names are pinned explicitly: 'index' holds the 0/1 flag and
    # 'in_vocab' holds the number of tokens carrying that flag.
    X_words = check_if_word_in_vocab(data_explorer.all_words, vocab=vocab, tokenizer=t5tokenizer)
    X_df = pd.DataFrame.from_dict(X_words)
    X_df_count = X_df['in_vocab'].value_counts().rename_axis('index').reset_index(name='in_vocab')
    X_df_count['HUE'] = 'Input Words'

    # Same counts for the ground-truth (label) words.
    Y_words = check_if_word_in_vocab(data_explorer.all_gt_words, vocab=vocab, tokenizer=t5tokenizer)
    Y_df = pd.DataFrame.from_dict(Y_words)
    Y_df_count = Y_df['in_vocab'].value_counts().rename_axis('index').reset_index(name='in_vocab')
    Y_df_count['HUE'] = 'Labels'

    # ignore_index avoids duplicated row labels in the combined frame.
    concat = pd.concat([X_df_count, Y_df_count], ignore_index=True)
    mapping = {0: 'Not in Vocab', 1: 'In Vocab'}
    concat['index'] = concat['index'].map(mapping)

    vocab_fig, ax = plt.subplots()
    sns.barplot(data=concat, x="index", y="in_vocab", hue="HUE", ax=ax)
    ax.set_title(f'Words in T5 Vocab for {ds_name}')
    vocab_fig.savefig(os.path.join(curr_save_path, 'words_in_vocab.png'))
    plt.close(vocab_fig)
    # NOTE(review): this unconditional break stops the loop after the first
    # dataset, so only one entry of datasets_dict is ever processed. It looks
    # like a debugging leftover -- remove it to generate figures for all datasets.
    break

enrolling all data into lists: all_gt_words, all_words, all_scores, all_rights)


100%|██████████| 103895/103895 [00:01<00:00, 103646.06it/s]


In [10]:
# Scratch check of the output-folder naming used inside the loop above.
# NOTE(review): depends on `ds_path` left over from the last loop iteration,
# so the displayed value changes with whichever dataset the loop ended on.
curr_fold_name =  ds_path.split('data')[-1].split('.')[0][1:] +'/'
os.path.join(output_path, curr_fold_name)

'../data/data_exploring/default/train_clean/'