# This notebook process on CLEANED and Aligned csv DF files.

# Import / Setup

In [1]:
!pip install --quiet transformers jiwer torch evaluate --use-deprecated=legacy-resolver
!pip install --quiet datasets >=2.6.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio
from scipy.signal import resample
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration
import evaluate

wer  = evaluate.load('wer')

# load tokenizer, processor, feature_extractor
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")
new_tokens = ['<->', '<HS>', '<SUS>', '<SUI>', '<UNK>', '<FPN>', '<SUQ>', '<CS>'] # Check if the new tokens are already in the tokenizer's vocabulary
tokenizer.add_tokens(new_tokens)

vocabs = tokenizer.get_vocab()
conflicts = [token for token in new_tokens if token in vocabs]
if conflicts:
    print(f"Conflicting tokens: {conflicts}")
else:
    print("No conflicts with the existing vocabulary.")
for token in new_tokens:
    token_id = tokenizer.encode(token, add_special_tokens=False)
    decoded_token = tokenizer.decode(token_id)
    print(f"Token: {token} - Token ID: {token_id} - Decoded Token: {decoded_token}")

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# update preproessor
processor = WhisperProcessor.from_pretrained("openai/whisper-small", task="transcribe")
processor.tokenizer = tokenizer  # Update the processor's tokenizer
print("Feature extractor, tokenizer, and processor initialized successfully.")

# define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
model.config.forced_decoder_ids = None # remove conflict with transcribe


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Conflicting tokens: ['<->', '<HS>', '<SUS>', '<SUI>', '<UNK>', '<FPN>', '<SUQ>', '<CS>']
Token: <-> - Token ID: [51865] - Decoded Token: <->
Token: <HS> - Token ID: [51866] - Decoded Token: <HS>
Token: <SUS> - Token ID: [51867] - Decoded Token: <SUS>
Token: <SUI> - Token ID: [51868] - Decoded Token: <SUI>
Token: <UNK> - Token ID: [51869] - Decoded Token: <UNK>
Token: <FPN> - Token ID: [51870] - Decoded Token: <FPN>
Token: <SUQ> - Token ID: [51871] - Decoded Token: <SUQ>
Token: <CS> - Token ID: [51872] - Decoded Token: <CS>


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Feature extractor, tokenizer, and processor initialized successfully.


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

#Load Dataset

In [4]:
!pip install --quiet datasets >=2.6.1
import pandas as pd
from datasets import Dataset, Audio
from google.colab import drive
import os
import json

drive.mount('/content/drive')
drive_path = '/content/drive/MyDrive/Whisper_ASR'

def check_length(dur_dict, max_dur=30, error=20):
  over_dict = {}
  over_ids = []
  mismatched = {}
  gap = []
  # if it is not overtime: not relevant to my task
  mismatched_ids = []
  ids = []
  for k, x in dur_dict.items():
    dur = x.get('duration')[1]
    length = x.get('audio_length')
    gap = dur - length
    if length > max_dur:
      over_dict[k] = x
      over_ids.append(k)
      if gap > error: # more than original
        mismatched[k] = x
        ids.append(k)
  # return ids, mismatched, over_ids, over_dict
  return over_ids, over_dict

def csv_to_ds(csv_path, filter_ids):
    ref = pd.read_csv(csv_path)

    if filter_ids:
        fil_ref = ref[~ref['file_id'].isin(filter_ids)]

    print(len(ref),'->', len(fil_ref))
    ref = fil_ref
    dataset = {
        'audio_id': ref['file_id'].tolist(),
        'audio': ref['path'].tolist(),
        'text': ref['clean_transcript'].tolist()

    }

    ds = Dataset.from_dict(dataset).cast_column("audio", Audio(sampling_rate=16000))
    return ref, ds

# load overtime_ids and filter in batch
#
file_path ='/content/drive/MyDrive/Whisper_ASR/audio_length_check_dict'
with open(file_path, 'r') as f:
    dur = json.load(f)
filter_ids, filter_dict = check_length(dur)

print('Total overtime segment:', len(filter_ids))

# load and convert to dataset + length check
csv_files = ['batch1_500.csv', 'batch2_2159.csv']

csv_path = os.path.join(drive_path, 'csv_dev', csv_files[0]) # change the index only when you need to load a different
print('valid set')

valid_dict, valid_ds = csv_to_ds(csv_path, filter_ids)
csv_path = os.path.join(drive_path, 'csv_dev', csv_files[1]) # change the index only when you need to load a different idx
print('train set')
train_dict, train_ds = csv_to_ds(csv_path, filter_ids)
# output: train_ds, valid_ds

Mounted at /content/drive
Total overtime segment: 590
valid set
500 -> 500
train set
2159 -> 2159


# Setup Training

In [5]:
class whisper_training_dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, max_len):#daatset is huggingface dataset object
        self.dataset = dataset
        self.max_len = max_len
        self.bos_token = model.config.decoder_start_token_id

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]

        # audio_data = down_sample_audio(item['audio']["array"], item['audio']["sampling_rate"])
        audio = item['audio']["array"]
        inputs= feature_extractor(raw_speech=audio,
                                  sampling_rate=16000,
                                  return_tensors='pt',
                                  return_attention_mask=True)
        input_feature = inputs.input_features
        feature_attention_mask = inputs.attention_mask

        # Process the transcription
        transcription = item["text"]

        # Create labels
        labels = tokenizer(transcription, padding="max_length", max_length=self.max_len, truncation=True, return_tensors="pt")
        labels = labels["input_ids"].masked_fill(labels['attention_mask'].ne(1), -100)
        labels = labels[0][1:]

        return {
            "input_features": input_feature,
            "feature_attention_mask": feature_attention_mask,
            "labels": labels
        }

In [6]:
def evaluation(model, ds):

    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    predictions=[]
    references=[]

    for sample in tqdm(ds, total=len(ds)):
        audio=sample['audio']['array']
        sample_rate=sample['audio']['sampling_rate']
        text=sample['text']

        inputs= feature_extractor(raw_speech=audio,
                                          sampling_rate=16000,
                                          return_tensors='pt',
                                  return_attention_mask=True)
        input_feature = inputs.input_features
        feature_attention_mask = inputs.attention_mask

        with torch.no_grad():
            op = model.generate(
              input_feature.to(device),
              attention_mask=feature_attention_mask.to(device),
              language='english',
              task='transcribe'
            )

        text_pred =  tokenizer.batch_decode(op,skip_special_tokens=True )[0]
        predictions.append(text_pred)
        references.append(text)

    WER = wer.compute(predictions=predictions, references=references) * 100

    return WER, predictions, references

In [7]:
# adjust valid_ds if changing dataset

dataset = whisper_training_dataset(dataset=train_ds, max_len=448)

train_dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=4,  # Adjust batch size as needed
    shuffle=True,  # Shuffle data during training
    drop_last=False
)

In [8]:
from IPython.display import clear_output

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyperparameters
optimizer=torch.optim.AdamW(model.parameters(), lr=1e-5)
# further implementation
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

max_epochs=3
running_wer=[]
loss = []
for epoch in tqdm(range(max_epochs)):
    print(epoch)
    l = []
    for batch in train_dataloader:

        model.train()  # Set model to training mode

        # pass info to model
        input_features = batch["input_features"].to(device).squeeze(1)  # Shape: [4, 80, 3000]
        feature_attention = batch["feature_attention_mask"].to(device).squeeze(1)  # Shape: [4, 3000]
        labels = batch["labels"].to(device)  # Shape: [4, 447]

        # Forward pass
        # error: dimension mismatch in CovV1D: process input features
        # Cov1D: batch_size, feature_dim, sequence_length
        outputs = model(input_features, feature_attention, labels=labels)  # Assuming your model takes these inputs
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        # avoid exploding gradients
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()  # Reset gradients
        print(loss.item())
        l.append(loss.item())

    wer, pred, ref = evaluation(model, valid_ds)
    running_wer.append(wer)
    loss.append(l)

    plt.plot(running_wer)
    clear_output(wait=True)
    plt.xlabel('epochs')
    plt.ylabel('wer (%)')
    plt.show()


  0%|          | 0/3 [00:00<?, ?it/s]

0


  0%|          | 0/3 [00:15<?, ?it/s]


KeyboardInterrupt: 

In [None]:
running_wer

[22.374877330716387, 19.88877984952568, 19.7252208047105]

In [None]:
torch.save(model.state_dict(), 'whispter_ft_train.pth')

## Run Eval

In [None]:
from google.colab import drive
import os
import json

drive.mount('/content/drive')
drive_path = '/content/drive/MyDrive/Whisper_ASR'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
model.load_state_dict(torch.load('whispter_ft_train.pth', weights_only=True))# loading the model
model.eval()

# change eval dataset
ds = valid_ds
model.to('cuda')

FileNotFoundError: [Errno 2] No such file or directory: 'whispter_ft_train.pth'

In [None]:
predictions=[]
references=[]

# 10 min for one
for sample in tqdm(ds, total=len(ds)):
    text = sample['text']
    audio_original = sample['audio']['array']

    inputs= feature_extractor(raw_speech=audio_original,
                                      sampling_rate=16000,
                                      return_tensors='pt',
                              return_attention_mask=True)
    input_feature = inputs.input_features
    feature_attention_mask = inputs.attention_mask

    with torch.no_grad():
        op = model.generate(
          input_feature.to(device),
          attention_mask=feature_attention_mask.to(device),
          language='english',
          task='transcribe'
        )

    text_pred =  tokenizer.batch_decode(op,skip_special_tokens=True )[0]
    predictions.append(text_pred)
    references.append(text)

  1%|          | 1/83 [00:00<00:49,  1.66it/s]



true : well to be honest i am not that i'm not a big fan of <HS> environment <SUS> <HS> but however i am still care about <HS> the <HS> the environment and trees and animals <SUS> 
pred :  Well to be honest i am not that i'm not a big fan of environment but however i'm still care about the the the environment and trees and animals




  2%|▏         | 2/83 [00:01<00:50,  1.59it/s]



true : are plenty of things we really enjoy doing <HS> during our free time <SUI> so let's say for example last week we have played football game with my friends <SUS> <HS> the other one we played online ga<-> <SUI> 
pred :  There are plenty of things we really enjoy doing during our free time. So let's say for example last week we have played football game with my friends the other one we played online game




  4%|▎         | 3/83 [00:01<00:46,  1.73it/s]



true : yes <SUS> i suppose to be simila<-> similar to my sister because <HS> she is <HS> near my age and we have some very related topics like we both enjoy playing games and computers <SUS> 
pred :  Yes i suppose to be similar to my sister because she is near my age and we have some very related topics like we both enjoy playing games and computers




  5%|▍         | 4/83 [00:02<00:47,  1.67it/s]



true : my whole life i've been living in damascus syria but in in the future i'm planning to get the nationality in canada and i'm i'm also planning to reside there for the rest of my life <SUS> 
pred :  My whole life i've been living in Damascus Syria but in in the future i'm planning to get a nationality in canada and i'm i'm also planning to reside there for the rest of my life




  6%|▌         | 5/83 [00:03<00:47,  1.64it/s]



true : there's no doubt that choosing a country is essential thing for students in order to decide in which <HS> university they want to study <SUS> so the first they should consider the safety of the country and the education equality <SUS> 
pred :  There is no doubt that choosing a country is essential thing for students in order to decide in which university they want to study. So first they should consider the safety of the country and the education equality.




  7%|▋         | 6/83 [00:03<00:50,  1.53it/s]



true : i think for sure it has many d<-> advantages like for example let's say that if you are studying in another language first of all you're going to learn simply the the culture of this language <SUS> in addition to that you'll understand the classm<-> <SUI> 
pred :  i think for sure it has many the advantages like for example let's say that if you're studying in another language first of all you're going to learn simply the the culture of this language in addition to that you will understand the class ma-




  8%|▊         | 7/83 [00:04<00:47,  1.59it/s]



true : there is no doubt that the students who want study abroad would face many difficulties <SUS> let's say for example first and foremost <HS> they are going to face <HS> the issue of doing food and cook <SUS> 
pred :  There is no doubt that the students who want study abroad would face many difficulties. Let's say for example first and foremost they're going to face the issue of doing food and cook.




 10%|▉         | 8/83 [00:04<00:47,  1.58it/s]



true : students has many other options in order to learn about the local country and the local culture <SUS> <HS> in my opinion they can go and participate in social events <SUS> and they can make some friends with local <SUI> 
pred :  Students have many other options in order to learn about the local country and the local culture <HS> in my opinion they can go and participate in social events and they can make some friends with local <SUI>




 11%|█         | 9/83 [00:05<00:45,  1.63it/s]



true : i reckon it's an essential thing because without it <HS> students cannot <HS> be independent so clearly obviously when you when a student study abroad he will depend on himself <SUS> and every<-> <SUI> 
pred :  i reckon it's an essential thing because without it students cannot be independent so clearly obviously when he when he when the student study abroad he will depend on himself and every




 12%|█▏        | 10/83 [00:06<00:45,  1.59it/s]



true : as i said before i am from a little village in north spain in the middle of nowhere and and i live in mountains so i'm very very interested in environment <SUS> and i think that global global heat is a <SUI> 
pred :  As i said before i'm from a little village in north spain in the middle of nowhere and and i live in mountains so i'm very very interested in environment and i think that global global heat is a




 13%|█▎        | 11/83 [00:06<00:43,  1.64it/s]



true : what we enjoy the most is playing video games <SUS> we love a lot of to play counter strike global offensive which is a video game a shooter game where teams of five <HS> fight <SUI> 
pred :  what win you the most is playing vital games we love a lot of to play counter strike global offensive which is a vital game a shooter game where teams of five fight in




 14%|█▍        | 12/83 [00:07<00:43,  1.63it/s]



true : i am most similar to my father <SUS> i have not only in in a physical way but in a intellectual way we have similar ways of thought and of think <SUS> and we have a lot of a <SUI> 
pred :  i'm most similar to my father i have not only in in a physical way but in a intellectual way we have similar ways of thought and of thing and we have a lot of a <SUI>




 16%|█▌        | 13/83 [00:08<00:48,  1.45it/s]



true : i would like to live in a city <SUS> i like most the cities i like the most are london paris <HS> and here in spain i would like to live in in madrid or in barcelona <SUS> i don't care <SUS> but yeah i would like to live i<-> <SUI> 
pred :  i would like to live in a city i like most the cities i like the most are london harris <HS> and here in spain i would like to live in in matrid or in barcelona i don't care but yeah i would like to live in <SUI>




 17%|█▋        | 14/83 [00:08<00:37,  1.83it/s]



true : honey is collected from bees <SUS> 
pred :  Honey is collected from base




 18%|█▊        | 15/83 [00:09<00:39,  1.73it/s]



true : i think that when a student is chosen a country to go to study there <HS> the first thing that they are thinking about is about the university quality <SUS> and at the same time at the same level univers<-> <SUI> 
pred :  i think that when a student is choosing a country to go to study there <HS> the first thing that they are thinking about is about the university quality and at the same time at the same level university <SUI>




 19%|█▉        | 16/83 [00:09<00:38,  1.72it/s]



true : i think that the best thing of studying in a second language is that you can put in in your resume <SUS> and at the same time it is a very valuable thing from that enterprises value <SUS> 
pred :  i think that the best thing of studying in a second language is that you can put in in your ratio and at the same time it is a very valuable thing from that enterprises value




 20%|██        | 17/83 [00:10<00:40,  1.61it/s]



true : i think that there are a lot of constraints when you're studying in another country <SUS> first of all you have to learn a new language and talk in a new language or in a second in your second language <SUS> and overall i think that <SUI> 
pred :  i think that there are a lot of constraints when you're studying in another country first of all you have to learn a new language and talk in a new language or in a second in your second language and overall i think that <SUI>




 22%|██▏       | 18/83 [00:11<00:40,  1.59it/s]



true : i think that students can learn <HS> from local culture <HS> going with their classmates and and learning from them asking them <SUS> and at the same time i think a good way is just taking a walk and and visi<-> <SUI> 
pred :  i think that our students can learn from local cure too going with their classmates and learning from them asking them and at the same time i think a good way is just taking a walk and and visit <SUI>




 23%|██▎       | 19/83 [00:11<00:42,  1.49it/s]



true : in my opinion studying abroad definitely helps you to be more independent because you have to do everything a<-> at your own <SUS> for example you have to to go to the bank you have to do the laundry and all those things you don't do when you live alone <SUS> 
pred :  In my opinion studying abroad definitely helps you to be more independent because you have to do everything at your own for example you have to go to the bank you have to do the laundry and all those things you don't do when you live alone




 24%|██▍       | 20/83 [00:12<00:38,  1.62it/s]



true : <FPN> city's a place without <HS> weather some work because any is is extreme <SUS> is too cold in winter and too hot in summer <SUS> 
pred :  What is cities place without weather somewhere because i mean it's it's extreme it's too cold in winter and too hot in summer




 25%|██▌       | 21/83 [00:12<00:36,  1.70it/s]



true : in holiday i like to enjoy my time because is a break <SUS> i like to travel go to the beach and s<-> sometimes i stay at home <HS> watching movies <SUS> 
pred :  On holiday i like to enjoy my time because it's a break i like to travel go to the beach and sometimes i stay at home and watching movies




 27%|██▋       | 22/83 [00:13<00:34,  1.75it/s]



true : <HS> i like spend time with all my family <HS> with my brothers with my sister with my fathers parents and also with my cousins <SUS> 
pred :  i like spend time with all my family <HS> with my brothers with my sister my father's parents and i talk with my cousins <SUS>




 28%|██▊       | 23/83 [00:13<00:34,  1.76it/s]



true : in the future i would like to work in a company called johnson and johnson <SUS> and i would like the position of the engineer senior in the area of the develop new products <SUS> 
pred :  In the future i would like to work in a company called Johnson & Johnson and i would like the position of the engineer senior in the area of the develop new products




 29%|██▉       | 24/83 [00:14<00:34,  1.71it/s]



true : technology offer us a lot of tools to improve our way to learn <SUS> one of them is that we can u<-> we can look for all the information using internet as we can look for examples in videos <SUS> 
pred :  Technology offers us a lot of tools to improve or a way to learn. One of them is that we can look for all the information using internet and also we can look for examples in videos.




 30%|███       | 25/83 [00:15<00:33,  1.72it/s]



true : face - to - face is the best way to learn because you can look your professor and you can concentrate best <SUS> in this way you have not distractors <SUS> 
pred : <SUS><SUS> face-to-face is the best way to learn because you can look at your professor and you can concentrate best in this way you have no disorders <SUS>




 31%|███▏      | 26/83 [00:15<00:32,  1.78it/s]



true : the group projects give us the opportunity to learn to work in teams to use all the abilities of one of each one of the members on in a team <SUS> 
pred :  The group projects give us the opportunity to learn to work in teams to use all the abilities of one of each one of the members in a team.




 33%|███▎      | 27/83 [00:16<00:31,  1.78it/s]



true : attending lectures by yourself in my opinion it's best because you can pause when you need <SUS> instead when you are attending le<-> lectures with other people you cannot do that <SUS> 
pred :  Attending lectures by yourself in my opinion is best because you can't pause when you need instead when you are attending lea<-> letters with other people you cannot do that




 34%|███▎      | 28/83 [00:16<00:32,  1.72it/s]



true : it's important to for a university or schools offer <UNK> ways to learn because all students are different and they learn in a different way <SUS> for example in my own experience i learn best when i am listening <SUS> 
pred :  It's important to write university or schools offer different ways to learn because all students are different and they learn in a different way for example in my own experience i learn best when i'm listening




 35%|███▍      | 29/83 [00:17<00:32,  1.68it/s]



true : the place where i live is is between two mo<-> mountains between <FPN> and <FPN> <SUS> is called <SUS> <SUS> is to one hour to barcelona <HS> to half an hour to the to the beach <SUS> it's a nice place <SUS> 
pred :  The place where i live is is between two mountains between montain and monnegra is called<SUS><UNK>




 36%|███▌      | 30/83 [00:18<00:30,  1.75it/s]



true : in holiday i enjoy staying with friends and going to the mountains sometimes to the beach <SUS> i enjoying in general the nature and without <HS> obligations without <HS> club <SUS> 
pred :  In holiday i enjoy staying with friends and going to the mountains sometimes to the beach enjoying in general the nature and without the obligations without the cloud




 37%|███▋      | 31/83 [00:18<00:27,  1.89it/s]



true : in my close family maybe with my mother <SUS> in my don<-> in my second close family maybe with my brother-in-law <SUS> 
pred :  In my close family maybe with my mother in my in my second close family maybe with my brother in law




 39%|███▊      | 32/83 [00:18<00:24,  2.04it/s]



true : i am not sure but one possibility of job is as a teacher <SUS> is the one of my favorite jobs now <SUS> 
pred :  i am not sure but one possibility of job is as a teacher the one of my favourite jobs now




 40%|███▉      | 33/83 [00:19<00:26,  1.89it/s]



true : i think that the technology can can improve the study because you can find <HS> faster <HS> in the information and at the same time you can stay in connecting with oth<-> with other bit of people <SUS> 
pred :  i think that the technology can can improve the study because you can find faster <HS> in the information and at the same time you can stay in connecting with other with other people <SUS>




 41%|████      | 34/83 [00:20<00:27,  1.76it/s]



true : me too i think the same <SUS> i think that the face - to - face you can you are more <HS> more interested at with the person with them <SUS> and you can <HS> interpretate the the non-verbal <SUI> 
pred :  Me too i think the same i think that face-to-face you can you are more more more interested with the person with the them and you can interpretate the the non-verbal




 42%|████▏     | 35/83 [00:20<00:26,  1.82it/s]



true : with group projects the people can improve develop <HS> the the past the patience the the quiet listening and thinking <HS> more deep and the communication more efficient <SUS> 
pred :  With group projects the people can improve the developer the past the past the patterns the be quiet listening and thinking more deep and the communication more efficient




 43%|████▎     | 36/83 [00:21<00:24,  1.92it/s]



true : in my case i think that it's better <HS> reading alone because i prefer the silent and <HS> the quietness and the calm <SUS> 
pred :  In my case i think that it's better to be alone because i prefer the silent and the quietness and the calm




 45%|████▍     | 37/83 [00:21<00:24,  1.91it/s]



true : it's important because not all the people have the possibility to to to have the all facilities <SUS> then we must to to give all the modern facilities th<-> to to to all the students to <SUI> 
pred :  It's important because not all the people have the possibility to have all the facilities then we must to give all the facilities to all the students to study




 46%|████▌     | 38/83 [00:22<00:25,  1.77it/s]



true : my favourite time is summer coz it's hot you can spend a lot of time outside <SUS> and you can you can walk in the street have taking some coffee <UNK> <SUS> and such a good season for me <SUS> 
pred :  My favourite time here is summer because it's hot you can spend a lot of time outside and you can you can work in the street taking some coffee in terrace and that's a good season for me




 47%|████▋     | 39/83 [00:23<00:27,  1.59it/s]



true : that really matters to me to to inform myself about what is happening in the world <SUS> i think that's really important to be to be aware of yeah what what what's happening around to me <SUS> so i really enjoy listening to radio in the morning <SUS> 
pred :  That really matters to me to to to inform myself about what is happening in the world i think that's really important to be to be aware of yeah what's what's what's happening around me so i really enjoy listening to the radio in the morning




 48%|████▊     | 40/83 [00:23<00:26,  1.63it/s]



true : <HS> what i preferred when i was a child was playing with little cars and imagine <HS> a kind of city with homes buildings schools <SUS> and <HS> i was i was a creative kid i think <SUS> 
pred :  What i preferred when i was a child was playing with little cars and imagine a kind of city with homes buildings schools and i was i was a creative kid i think




 49%|████▉     | 41/83 [00:24<00:27,  1.54it/s]



true : i really hope to to have the opportunity to live abroad coz i think it's an incredible human experience <SUS> for instance i'd like to to go to america on the west coast in a city like san francisco and just enjoying <SUS> 
pred :  I really hope to have the opportunity to live abroad because i think it's an incredible human experience for instance i'd like to to go to america on the west coast in a city like<SUS> san francisco and just enjoying <SUS>




 51%|█████     | 42/83 [00:25<00:26,  1.53it/s]



true : get the problem with very careful which other <SUI> there are able to understand their needs and their problem <SUS> they communicate a lot with their peer <SUS> and i think it they that that's a good leader to to very communicate <SUS> 
pred :  Thank you to person who is very careful which other they are able to understand their needs and their problem they communicate a lot with their peers and i think they they that's a good leader to to very communicate




 52%|█████▏    | 43/83 [00:25<00:26,  1.49it/s]



true : i think the the main issue <HS> that you have so much so much work to to do everything <SUS> and plus i think i i am deeply convince that having a small team can create <HS> anxiousty and problems <SUS> 
pred :  i think the the main issue is that you have so much so much work to do everything <SUS> and plus i think <UNK> i'm deeply convinced that having a small team can create some anxiety and problems <SUS>




 53%|█████▎    | 44/83 [00:26<00:26,  1.46it/s]



true : they they can be disagree but i think that's important and really relevant to to have disagree in a group coz you have to to explain your opinion and advocate for you for your ideas <SUS> and i think that's a good exercise to to <SUI> 
pred :  they they can be disagree but i think that's important and really relevant to to have disagree in a group because you have to to explain your opinion and advocate for you for your ideas and i think that's a good exercise to to <SUI>




 54%|█████▍    | 45/83 [00:27<00:26,  1.46it/s]



true : i th<-> i think the core of of a team is working together but doesn't matter if you if you work face-to-face or if you if you work at home for instance <SUS> <HS> you you you just have to really re<-> communicate <SUS> explain what you do <SUS> explain what you attempt to <SUI> 
pred :  I think the core of a team is working together but doesn't matter if you work face-to-face or if you work at home for instance you just have to really communicate explain what you do explain what you attend to




 55%|█████▌    | 46/83 [00:27<00:24,  1.50it/s]



true : i'm deeply convinced that you can do a better work if you share your opinion and your ideas with coworkers <SUS> but i'm not i i'm not sure that people working alone are bad at work <SUS> 
pred :  You play convinced that you can do a better work if you share your opinion and your ideas with co-workers but i'm not i'm not sure that people working alone are bad at work




 57%|█████▋    | 47/83 [00:28<00:23,  1.56it/s]



true : interested in learning about other countries <SUI> <HS> i enjoy because i enjoy travelling so the best way to to going travelling is knowing about the the other cultures <SUS> so it's the best way <SUS> 
pred :  interesting in learning about other countries <HS> enjoy because i enjoy travelling so the best way to to enjoy travelling is knowing about the the other cultures so it's the best way <SUS>




 58%|█████▊    | 48/83 [00:28<00:21,  1.61it/s]



true : i enjoy hiking reading studying and spending time with my family <SUS> and well my free time i try to enjoy a lot <SUS> so a lot of hobbies i have <SUS> and work out <SUS> 
pred :  i enjoy hiking reading studying spending time with my family and well my free time i try to to enjoy a lot so a lot of hobbies i have and work out <SUS>




 59%|█████▉    | 49/83 [00:29<00:20,  1.66it/s]



true : obviously the the best way to improve my english is travelling abroad where to countries where they speak english <SUS> but <HS> it's impossible to to travel all the whole year so studying <SUS> 
pred :  Obviously the the best way to improve my english is travelling abroad to countries with speak english but it's impossible to to travel all the world year so studying <SUS>




 60%|██████    | 50/83 [00:30<00:19,  1.66it/s]



true : i'd like to to paint and to draw <SUI> <HS> however it's difficult so i'm studying the history of art <HS> as i can understand the the the painting <SUS> 
pred :  i would like to to paint <HS> to draw <HS> however it's difficult so i'm studying a history of art <HS> as i can understand the the the paint <SUS>




 61%|██████▏   | 51/83 [00:30<00:18,  1.74it/s]



true : the the advice i would give is to to eat <HS> mostly fruit and fresh vegetables <HS> and <HS> give up <HS> fast food and fat <HS> food that <HS> can damage our <HS> our health <HS> <SUS> 
pred :  The advice i could give is to eat mostly fruits and fresh vegetables and give up fast food and fat food that can damage our health <SUS>




 63%|██████▎   | 52/83 [00:31<00:16,  1.83it/s]



true : my opinion <HS> drinking water is <HS> is the best way to to to drink because is the is <HS> the healthiest <HS> drink <HS> you can you can get <SUI> <HS> so this affects your your health <HS> <SUI> 
pred : <CS> drinking water is the best way to drink because it's the healthiest drink you can get <SUS> it affects your health <SUS>




 64%|██████▍   | 53/83 [00:31<00:17,  1.70it/s]



true : ercise<-> <HS> is very important for our health <SUI> <HS> every day maybe we can we can have <HS> enough free time <SUS> however <HS> three or four times a <HS> per week is very is essential to to fit <SUI> 
pred : <FPN><FPN> is very important for our health <SUS> every day maybe we can we can have <HS> enough free time <SUS> however <HS> three or four times a week is very is essential to to feed <SUI>




 65%|██████▌   | 54/83 [00:32<00:18,  1.54it/s]



true : i think is better to to sleep eight hour <SUS> however <HS> sometime is impossible to to do it because our <HS> busy style life <HS> don't <HS> d<-> doesn't allowed <HS> to to sleep eight hou<-> hours <SUS> however we should i<-> <SUI> 
pred :  i think it's better to to sleep eight hour however in some time it's impossible to to do it because our <HS> vc's don't live <HS> don't <HS> doesn't allow to to to sleep eight hours however we should eat <SUS>




 66%|██████▋   | 55/83 [00:33<00:17,  1.62it/s]



true : technology can have a positive or negatives <HS> effects <SUS> <HS> it depends on the use we we we use the the technology <HS> because <HS> could be very bad for our <HS> ou<-> <SUI> 
pred :  technology can have a positive or negative effects it depends on the use we we we use the technology <HS> because it could be very bad for our <HS> <SUI>




 67%|██████▋   | 56/83 [00:33<00:17,  1.53it/s]



true : i am very interested in learning about other countries <SUS> and personally i did <HS> study abroad in ireland and i met a lot of people from different countries <HS> especially from spain and poland and germany <SUS> so i am very interested in learning from the cultures <SUS> 
pred :  i am very interested in learning about other countries personally i did a study abroad in ireland and i met a lot of people from different countries especially from spain and <SUQ> and german so i am very interested in learning from the cultures <SUS>




 69%|██████▊   | 57/83 [00:34<00:16,  1.62it/s]



true : i enjoy spending my free time with my family watching movies or or series also by making exercise with them hiking or walking around the house <SUS> and yes <SUS> 
pred :  i enjoy spending my free time with my family watching movies or or series also by making exercise with them hiking or walking around the house <UNK> yes <SUS>




 70%|██████▉   | 58/83 [00:34<00:14,  1.71it/s]



true : the best way to improve my english is by watching movies or series also by reading books and nor<-> and normals so that i can learn my skills in english <SUS> 
pred :  The best way to improve my english is by watching movies or series also by reading books and on normals so that i can learn my skills in english




 71%|███████   | 59/83 [00:35<00:14,  1.66it/s]



true : i think i would learn to read more and read faster because i am not that fast at reading <SUS> so i would like to learn that to read more books and to learn more about the cultures of the world <SUS> 
pred :  i think i will learn to read more and read faster because i'm not a faster reading so i would like to learn that to read more books and to learn more about the cultures of the world <SUS>




 72%|███████▏  | 60/83 [00:36<00:14,  1.64it/s]



true : first of all they need to delete every candy they have in their house so that they can eat and start eating good food and to have healthy life like fruit and vegetables <SUS> and start doing a menu <SUS> 
pred :  First of all they need to delete every candy they have in their house so that they can eat and start eating good food and to have healthy life like fruits and vegetables and start doing a mini




 73%|███████▎  | 61/83 [00:36<00:13,  1.67it/s]



true : i consider that drinking water is a good form of starting a hal healthy lifestyle <SUS> drinking two litres per day is a good amount of water to the hydratated and to be happy in your day <SUS> 
pred :  i consider that drinking water is a good form of studying a healthy lifestyle drinking two litres per day is a good amount of water to be hydrated and to be happy in your day




 75%|███████▍  | 62/83 [00:37<00:13,  1.58it/s]



true : doing exercise is a good way of starting a healthy li<-> living <SUS> you can do exercise three or four times a week not in expensive like every day <SUS> but three or four hours a week walking around or hiking it's a good <SUI> 
pred :  doing exercise is a good way of studying a healthy living <SUS> you can do exercise three or four times a week not inexpensive like every day but three or four hours a week walking around or hiking it's a good <SUI>




 76%|███████▌  | 63/83 [00:38<00:12,  1.61it/s]



true : i consider that eight hours sleeping in the night is a good way of starting a healthy lifestyle <SUS> there are research <HS> papers or researcher <HS> that say that sleeping less will make you <HS> <UNK> <SUS> 
pred :  i consider that eight hours sleeping in the night is a good way of studying a healthy lifestyle there are research papers or researchers that say that sleeping less will make you a a sufferer




 77%|███████▋  | 64/83 [00:38<00:11,  1.60it/s]



true : i consider technology has impact in a bad form to the healthy lifestyle because many people are now watching their cellphones all the time instead of going outside and doing exercise <SUS> and that's bad because <SUI> 
pred :  i consider technology has impact in a bad form to the healthy lifestyle because many people are now watching their cell phones all the time instead of going outside and doing exercise and that's bad because yeah <SUI>




 78%|███████▊  | 65/83 [00:38<00:09,  1.94it/s]



true : well i am interested really <SUS> i'm into it <SUS> 
pred :  well i am interested really i'm into it




 80%|███████▉  | 66/83 [00:39<00:09,  1.82it/s]



true : well i don't have much free time but <HS> i draw sometimes walk around the city go sightseeing or just visit museums <SUS> and i adore watching films and movie <HS> films and series <SUS> 
pred :  Well i don't have much free time but <HS> i draw sometimes walk around the city go sightseeing or just visit museums and i adore watching films and movies or films and series <SUS>




 81%|████████  | 67/83 [00:39<00:08,  1.94it/s]



true : well i think <HS> about <HS> with diving into the context of english language <SUS> i mean watching <HS> serials and films really helps me improve my language <SUS> 
pred :  Well i think about with diving into the context of english language i mean watching series and films really helps me improve my language




 82%|████████▏ | 68/83 [00:40<00:07,  2.03it/s]



true : well i would like to take up sport classes <SUS> and i think <HS> like this discipline really would help me <SUS> 
pred :  Well i would like to take up sport classes and i think <HS> <HS> like this discipline really would help me




 83%|████████▎ | 69/83 [00:41<00:07,  1.86it/s]



true : i would recommend these people to listen to what their gut feelings <HS> tell them because <HS> they would know what food re<-> they really want and not their brain but their body wants to eat <SUS> 
pred :  i would recommend these people to listen to their gut feelings <HS> tell them because <HS> <HS> they would know what food really they really want and not their brain but their body wants to eat <SUS>




 84%|████████▍ | 70/83 [00:41<00:07,  1.84it/s]



true : it is essential to drink lots of water on daily basis because our body consists of water on eighty percent and <HS> it just would really <HS> start their metabolism to work better <SUS> 
pred :  It is essential to drink lots of water on daily basis because our body consists of water on eighty percent and it just would really start their metabolism to work better.




 86%|████████▌ | 71/83 [00:42<00:06,  1.74it/s]



true : it is major for people <HS> to do little exercises every day like after waking up and before they go to sleep i guess too <SUS> <HS> this will cause <HS> <HS> but <HS> little movings <HS> would help them to become <SUI> 
pred :  It is major for people to do little exercises every day like after waking up and before they go to sleep i guess to these workouts that have to be long but little movements would help them to be cal<->




 87%|████████▋ | 72/83 [00:42<00:06,  1.72it/s]



true : i agree with the opinion that sleeping around <HS> eight hours a day is a key habit because if you don't sleep enough you would <HS> be tired and <HS> won't be productive at all <SUS> 
pred :  i agree with the opinion that sleeping around eight hours a day is a key habit because if you don't sleep enough you would be tired and won't be productive at all <SUS>




 88%|████████▊ | 73/83 [00:43<00:05,  1.82it/s]



true : it's quite question<-> a questionable opinion because technologies truly helps <HS> help people to make their lives easier but <HS> but not always <SUS> they can harm health too i guess <SUS> 
pred :  It's quite a questionable opinion because technology truly helps people to make their lives easier but not always they can harm health too i guess




 89%|████████▉ | 74/83 [00:43<00:05,  1.78it/s]



true : where i live i have more pantries and we have the best food on my on my country spain <SUS> it's paella and it's a wonderful rice <SUS> another important point is the language spanish <SUS> 
pred :  Where i live i have more countries and we have the best food on my country spain it's paella and it's a wonderful rice a no important point is the language spanish




 90%|█████████ | 75/83 [00:44<00:04,  1.65it/s]



true : on holidays i enjoy going with my family to the beach or with my friend to play football handball or another sport because it's incredible and wonderful playing with your friends <SUS> and i also play video games with my brother <SUS> it's wonderful <SUS> 
pred :  On holidays i enjoy going with my family to the beach or with my friends to play football handball or another sport because it's an incredible wonderful playing with your friends and i also play video games with my brother it's wonderful <SUS>




 92%|█████████▏| 76/83 [00:45<00:04,  1.62it/s]



true : i personally think that i spend more time with my brother because my father is always working and this is terrible <SUS> because if they are working all day i can't see every day and this is a terrible problem <SUS> 
pred :  i personally think that i spend more time with my brother because my hair is always working and this is terrible because if they are working all day i can see every day and this is a hardware problem <SUS>




 93%|█████████▎| 77/83 [00:45<00:03,  1.62it/s]



true : in the future if i have the possibility i want to do nursing to help people or a policeman because they can help people <SUS> and it's wonderful and impressive to he<-> help people and gain money <SUS> 
pred :  In the future if i have the possibility i want to do nursing to help people or a policeman because they can help people and it's wonderful and impressive to help help people and gain money <SUS>




 94%|█████████▍| 78/83 [00:46<00:03,  1.64it/s]



true : for from my point of view students can't study more efficient with technology because they can use google by computer because they can find more information than in a library <SUS> and this essential for for a student <SUS> 
pred :  From my point of view students can't study more efficient with technology because they can use google by computer because they can find more information than in a library and this is essential for a student




 95%|█████████▌| 79/83 [00:47<00:02,  1.55it/s]



true : i personally think that study face-to-face is better than studying online because you are attending face-to-face more than in a computer because in a computer you you can stay playing with the mobile phone <SUS> and this is a problem <SUS> 
pred :  i personally think that study face-to-face is better than studying online because you are attending face-to-face more than a computer because in a computer you you can stay playing with the mobile phone and this is a problem <SUS>




 96%|█████████▋| 80/83 [00:47<00:01,  1.63it/s]



true : studying group projects there are <UNK> a<-> abilities or skills <SUS> and this is essential for the future because nowadays companies are working in groups <SUS> and this is essential for your future <SUS> 
pred :  Studying group projects develop your sustainable abilities or skills and this is essential for the future because nowadays companies are working in groups and this is essential for your future.




 98%|█████████▊| 81/83 [00:48<00:01,  1.60it/s]



true : in my opinion attending lectures is better than reading alone because attending lectures you have you have interactive <HS> speaking with a person or with a <UNK> <SUS> and this is better than reading alone that is bored <SUS> 
pred :  In my opinion attending lectures is better than reading alone because attending a lecture you have you have interactive speaking with a person or with a videoing and this is better than reading alone that is boring




 99%|█████████▉| 82/83 [00:49<00:00,  1.51it/s]



true : from my point of view is the most important to have learning facilities because at university if you not have a computer or another technology you cannot learn more than other students <SUS> and this is a difficult in others countries <SUS> and this is essenti<-> <SUI> 
pred :  From a point of view it's the most important to have learning facilities because at a university if you not have a computer or another technology you cannot learn more than other students and this is a difficult in other countries and this is essential




100%|██████████| 83/83 [00:49<00:00,  1.67it/s]



true : i am very interested in know about other countries <SUS> but i'm in love with barcelona and i'm in love with catalonia and spain too <SUS> 
pred :  i'm very interested in know about other countries but i'm in love with basalona and i'm in love with catalonia and spain too







# Run Baseline Pipeline

In [9]:
!pip install tqdm datasets transformers
import pandas as pd
from datasets import Dataset, Audio
from google.colab import drive
from tqdm import tqdm
import os
import json
import torch
from transformers import WhisperTokenizer, WhisperFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
def mount_drive():
    drive.mount('/content/drive')

def check_length(dur_dict, max_dur=30, error=20):
    over_ids = [k for k, x in dur_dict.items() if x.get('audio_length') > max_dur and (x.get('duration')[1] - x.get('audio_length')) > error]
    return over_ids

def csv_to_ds(csv_path, filter_ids):
    ref = pd.read_csv(csv_path)
    if filter_ids:
        ref = ref[~ref['file_id'].isin(filter_ids)]
    print(f"Filtered {len(ref)} records out of {len(filter_ids)}")
    dataset = {
        'audio_id': ref['file_id'].tolist(),
        'audio': ref['path'].tolist(),
        'transcript': ref['clean_transcript'].tolist()
    }
    ds = Dataset.from_dict(dataset).cast_column("audio", Audio(sampling_rate=16000))
    return ref, ds

def initialize_tokenizer_processor():
    feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small",task="transcribe")
    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")
    new_tokens = ['<->', '<HS>', '<SUS>', '<SUI>', '<UNK>', '<FPN>', '<SUQ>', '<CS>']
    tokenizer.add_tokens(new_tokens)
    processor = WhisperProcessor.from_pretrained("openai/whisper-small", task="transcribe")
    processor.tokenizer = tokenizer
    return tokenizer, processor

def initialize_model(tokenizer):
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
    model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
    model.config.forced_decoder_ids = None
    return model

def perform_inference(ds, model, tokenizer, feature_extractor):
    predictions = []
    references = []
    for sample in tqdm(ds, total=len(ds)):
        text = sample['transcript']
        audio_original = sample['audio']['array']
        inputs = feature_extractor(raw_speech=audio_original, sampling_rate=16000, return_tensors='pt', return_attention_mask=True)
        input_feature = inputs.input_features
        feature_attention_mask = inputs.attention_mask
        with torch.no_grad():
            op = model.generate(input_feature.to(device), attention_mask=feature_attention_mask.to(device), language='english', task='transcribe')
        text_pred = tokenizer.batch_decode(op, skip_special_tokens=True)[0]
        predictions.append(text_pred)
        references.append(text)
    return predictions, references

def run_baseline_performance(drive_path, csv_files, audio_length_check_dict_path, max_dur=30, error=20):
    # Mount Drive
    mount_drive()

    # Load and filter overtime_ids
    with open(audio_length_check_dict_path, 'r') as f:
        dur = json.load(f)
    filter_ids = check_length(dur, max_dur, error)
    print('Total overtime segment:', len(filter_ids))

    # Load and filter CSV files
    csv_path = os.path.join(drive_path, 'csv_dev', csv_files[0])
    print('Valid set')
    valid_dict, valid_ds = csv_to_ds(csv_path, filter_ids)

    # Initialize tokenizer, processor, and model
    tokenizer, processor = initialize_tokenizer_processor()
    feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = initialize_model(tokenizer)

    # Perform inference
    predictions, references = perform_inference(valid_ds, model, tokenizer, feature_extractor)

    return predictions, references

In [None]:
# example usages:
drive_path = '/content/drive/MyDrive/Whisper_ASR'

# Batch_size, dict.keys = 'file_id', 'path', 'clean_transcript'
# csv_files = ['batch1_100.csv', 'batch2_1000.csv', 'batch3_1000.csv', 'batch4_1149.csv']
csv_files = ['batch1_500.csv', 'batch2_2159.csv']

filter_audio_length_path = '/content/drive/MyDrive/Whisper_ASR/audio_length_check_dict'

predictions, references = run_baseline_performance(drive_path, csv_files, filter_audio_length_path)

# Optional: Print or save the results
# for pred, ref in zip(predictions, references):
#     print(f"Prediction: {pred}\nReference: {ref}\n")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total overtime segment: 24
Valid set
Filtered 500 records out of 24


  0%|          | 0/500 [00:00<?, ?it/s]You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
 12%|█▏        | 61/500 [02:09<13:39,  1.87s/it]

# Percentages of correct tags

In [None]:
from transformers import PreTrainedTokenizerFast

# Define your special tokens
special_tokens = ['<->', '<HS>', '<SUS>', '<SUI>', '<UNK>', '<FPN>', '<SUQ>', '<CS>']

# Initialize the tokenizer (using a pre-trained tokenizer as a base)
tokenizer = PreTrainedTokenizerFast.from_pretrained('bert-base-uncased')

# Add the special tokens to the tokenizer
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

def tokenize_s(sentence):
    # Tokenize the sentence
    tokens = tokenizer.tokenize(sentence)

    # Encode the sentence (convert tokens to IDs)
    encoded = tokenizer.encode(sentence)

    # Decode the IDs back to the sentence
    decoded = tokenizer.decode(encoded)

    return tokens

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
def tag_counts(ls):
    tokens = ['<->', '<HS>', '<SUS>', '<SUI>', '<UNK>', '<FPN>', '<SUQ>', '<CS>']

    # Initialize the dictionary with tokens as keys and counts as values
    counts = {token: 0 for token in tokens}

    # Tokenize each string in the list
    tokenized = [tokenize_s(s) for s in ls]

    # Update the counts based on the tokens found
    for tokens_list in tokenized:
        for token in tokens_list:
            if token in counts:
                counts[token] += 1

    return counts

In [None]:
pred_tags = tag_counts(pred_ls)
ref_tags = tag_counts(ref_ls)
print(pred_tags)
print(ref_tags)

{'<->': 0, '<HS>': 1, '<SUS>': 1, '<SUI>': 1, '<UNK>': 1, '<FPN>': 1, '<SUQ>': 2, '<CS>': 1}
