### Load Dataset

In [48]:
import pandas as pd

In [49]:
# Clip ids removed from the working set.
# NOTE(review): presumably clips whose audio could not be decoded - confirm.
exclude_id = [60580, 80644, 80662, 80670, 80685, 80716, 114392, 114403, 114409, 114410]

# Read the published sheet, normalise the column names, keep only the columns
# used downstream, then drop the excluded ids and every label outside the five
# event classes of interest.
df = (
    pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQMaaNO_0JU-A2gdSyJpF-WEjJGqWqZdIIp9g9gHGpTdJ3G8l6BvV1PvtmrB3nUTHxnDC_zbiAp3sJx/pub?gid=353977511&single=true&output=csv")
    .rename(columns={'Unnamed: 0': 'id', "_comments_ASR_unfiltered": "ASR"})
    .loc[:, ['id', 'label', 'captions', 'ASR']]
    .loc[lambda frame: ~frame['id'].isin(exclude_id)]
    .loc[lambda frame: frame['label'].isin(['Shots on target', 'Shots off target', 'Foul', 'Corner', 'Throw-in'])]
)
df.head()

Unnamed: 0,id,label,captions,ASR
0,36,Shots on target,Player hears the fans clapping his superb goal...,Chelsea has recovered again in the final third...
1,44,Corner,Corner kick. Player (Away-Team) is ready to se...,Tripper. Ready to strike. There goes Trippier'...
2,57,Shots off target,A cross by Player (Home-Team) from the side of...,There's Hazard. Hazard made a good move toward...
3,74,Throw-in,Player (Home-Team) takes a first-time shot fro...,Diego Costa anticipates. Jones can score. It e...
7,160,Shots on target,"Player (Home-Team) picks up a pass, lines up a...","He wants those races, even if it's towards the..."


In [50]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel, WhisperForAudioClassification
import torch
from tqdm import tqdm  # Import tqdm
import os
import torch
import torchaudio
import numpy as np
from torchaudio.transforms import Resample
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from tqdm import tqdm


from transformers import AutoFeatureExtractor, WhisperModel

# Pick the CUDA device when one is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").to(device)

# # column_type = "ASR"   #ASR, captions
# for  column_type in ["ASR", "captions"]:
#   captions = df[column_type].tolist()# max_len = 0

#   all_embeddings = torch.empty((0,model.config.hidden_size, )).cpu() #model.config.max_position_embeddings
#   batch_size = 256 

#   for i in tqdm(range(0, len(captions), batch_size), desc="Processing captions"):
#       batch_captions = captions[i:i+batch_size]
#       inputs = tokenizer(batch_captions, padding='max_length', truncation=True, return_tensors="pt")
      
#       inputs = {key: value.to(device) for key, value in inputs.items()}
      
#       with torch.no_grad():
#           outputs = model(**inputs)
      
#       all_embeddings = torch.cat((all_embeddings, outputs.pooler_output.cpu()), dim=0)

#   all_embeddings_cpu = all_embeddings
#   print(all_embeddings_cpu.shape)
#   torch.save((all_embeddings_cpu,df["id"].tolist()) , column_type+ ".pt")

In [51]:
# Both the feature extractor and the model come from the same pretrained
# Whisper checkpoint; naming it once keeps the two in sync.
WHISPER_CHECKPOINT = "openai/whisper-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)
model_audio = WhisperModel.from_pretrained(WHISPER_CHECKPOINT)

In [52]:
from torch.utils.data import DataLoader, Dataset

class AudioDataset(Dataset):
    """Map-style dataset that turns clip files into Whisper input features.

    Every item is loaded with ``torchaudio.load``, resampled to
    ``TARGET_SAMPLE_RATE``, averaged over dim 0 and passed through the
    module-level ``feature_extractor``.

    Args:
        folder_path: Directory that contains the clip files.
        file_paths: File names relative to ``folder_path``. The integer in
            front of the first underscore of each name is used as the clip id.
    """

    # Rate the audio is resampled to and reported to the feature extractor.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, folder_path, file_paths):
        self.folder_path = folder_path
        self.file_paths = file_paths

    def __len__(self):
        """Return the number of clip files in the dataset."""
        return len(self.file_paths)

    @staticmethod
    def _clip_id(file_name):
        """Return the integer preceding the first ``_`` of the file's base name.

        Raises:
            ValueError: If that prefix is not an integer literal.
        """
        # basename instead of splitting on "/" so the parsing does not depend
        # on the platform's path separator.
        return int(os.path.basename(file_name).split("_")[0])

    def __getitem__(self, idx):
        """Return ``(clip_id, input_features)`` for the file at position ``idx``.

        ``input_features`` is the feature extractor's ``input_features`` tensor
        with its leading dimension squeezed out.
        """
        # No ``global`` statements here: intermediates stay local, so loading
        # an item no longer overwrites module-level names or keeps the last
        # waveform alive.
        file_name = self.file_paths[idx]
        file_path = os.path.join(self.folder_path, file_name)

        waveform, sample_rate = torchaudio.load(file_path)
        # The resampler is built per item because the source rate is only
        # known after the file has been read.
        resampler = Resample(orig_freq=sample_rate, new_freq=self.TARGET_SAMPLE_RATE)
        # Average over dim 0 after resampling (same order as before).
        # NOTE(review): dim 0 is assumed to be the channel axis - confirm.
        mono_audio = resampler(waveform).mean(dim=0)

        # Hand the extractor a NumPy array (one of its documented input types)
        # rather than relying on implicit tensor-to-array conversion.
        features = feature_extractor(
            mono_audio.numpy(),
            sampling_rate=self.TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features
        return self._clip_id(file_name), features.squeeze(0)

def batch_process(folder_path, batch_size=64, _start_idx=(),
                  output_path='processed_audio_features.pt', device=None):
    """Extract mean-pooled Whisper encoder features for selected clips and save them.

    The ``.mp4`` files in ``folder_path`` whose ``"<id>_"`` prefix is listed in
    ``_start_idx`` are sorted by integer id, run through the encoder of the
    module-level ``model_audio`` in batches, averaged over dim 1 of the
    encoder's last hidden state, and written with ``torch.save`` as the tuple
    ``(features, ids)``.

    Args:
        folder_path: Directory containing the ``.mp4`` clips.
        batch_size: Number of clips per forward pass.
        _start_idx: Iterable of filename prefixes of the form ``"<id>_"``;
            only files with one of these prefixes are processed.
        output_path: Destination of the saved ``(features, ids)`` tuple.
        device: Optional device to move ``model_audio`` to before inference.
            When ``None`` the model stays where it is. Inputs are always moved
            to the model's device.

    Returns:
        None. The result is written to ``output_path`` (returning it would
        make the calling notebook cell display the whole tuple).
    """
    # A set makes the per-file prefix lookup O(1) instead of a list scan.
    wanted_prefixes = set(_start_idx)
    selected_files = [
        name for name in os.listdir(folder_path)
        if name.endswith('.mp4') and name.split('_')[0] + "_" in wanted_prefixes
    ]
    selected_files.sort(key=lambda name: int(name.split('_')[0]))

    dataset = AudioDataset(folder_path, selected_files)
    print(f"Dataset size: {len(dataset)}")
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=False)

    if device is not None:
        model_audio.to(device)
    # Feed inputs on whatever device the model lives on, so a model moved to
    # the GPU does not fail on CPU inputs (and vice versa).
    model_device = next(model_audio.parameters()).device
    # Only the encoder output is pooled, so call the encoder directly instead
    # of running the decoder on placeholder start tokens.
    encoder = model_audio.get_encoder()

    pooled_batches = []
    all_idx = []
    with torch.no_grad():
        for file_idxs, batch in tqdm(data_loader, desc="Processing batches"):
            hidden_states = encoder(batch.to(model_device)).last_hidden_state
            # Mean over dim 1 of the hidden states, then back to the CPU so
            # accumulated results do not occupy accelerator memory.
            pooled_batches.append(hidden_states.mean(dim=1).cpu())
            # Plain Python ints keep the saved file free of NumPy scalar objects.
            all_idx.extend(file_idxs.tolist())

    if pooled_batches:
        final_tensor = torch.cat(pooled_batches, dim=0)
    else:
        # Nothing matched: keep a well-formed, empty 2-D feature matrix.
        final_tensor = torch.empty((0, model_audio.config.d_model))
    print(final_tensor.shape)
    torch.save((final_tensor, all_idx), output_path)

In [54]:
# Filename prefixes ("<id>_") of every clip that is still present in df.
# Ids listed in exclude_id were already removed when df was built, so no
# further exclusion is needed here.
starting_indices_ = (df['id'].astype(str) + "_").tolist()
batch_process(
    '/home/sushant/D1/MyDataSets/SN_Chunks_1ECapASR_10k',
    batch_size=64,
    _start_idx=starting_indices_,
)

Dataset size: 7310


Processing batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 115/115 [2:03:38<00:00, 64.51s/it]


torch.Size([7310, 512])


In [36]:
# import subprocess
# import os

# def is_ffmpeg_callable(filename):
#     try:
#         waveform, sample_rate = torchaudio.load(filename)
#         return True
#     except TypeError:
#         return False

# directory ='/home/sushant/D1/MyDataSets/SN_Chunks_1ECapASR_10k/'
# non_callable_files = []


# video_files = [f for f in os.listdir(directory) if f.endswith('.mp4')]
# video_ids = [f.split('_')[0]+"_" for f in video_files]
# filtered_video_files = [f for f, idx in zip(video_files, video_ids) if idx in starting_indices]
# filtered_video_files.sort(key=lambda x: int(x.split('_')[0]))  
# for filepath in filtered_video_files:
#     filepath = directory+ filepath
#     if os.path.isfile(filepath):
#         if not is_ffmpeg_callable(filepath):
#             non_callable_files.append(filepath)

# print("Files not callable with ffmpeg get_src_stream_info:")
# for file in non_callable_files:
#     print(file)