### Data Loading

In [33]:
import pandas as pd 
data = pd.read_csv("train.csv")
data.shape

(91254, 9)

### Creating Vggish Embeddings

In [4]:
import os
import torch
import librosa
import numpy as np
from tqdm import tqdm
from torchvggish import vggish, vggish_input

In [5]:

audio_folder = "data/rawaudio"         
save_file = "data/audio_embeddings.npy"     
device = "cpu"
model = vggish()
model.to(device)
model.eval()       # must contain a column "path"
torch.save(model.state_dict(), "vggish_pretrained.pth")
print("Saved VGGish weights to vggish_pretrained.pth")

Saved VGGish weights to vggish_pretrained.pth


In [None]:

embeddings_list = []
files_list = []

for idx, path in enumerate(tqdm(paths)):
    if idx < 66675  : 
        continue 
    try:
        audio, sr = librosa.load(path, sr=16000, mono=True)
        examples = vggish_input.waveform_to_examples(audio, sr)

        # If no patches (empty case), create a zero vector
        if examples.shape[0] == 0:
            print("Empty or too short, replacing with zeros:", path)
            emb = np.zeros(128, dtype=np.float32)

        else:
            examples = torch.tensor(examples).float()

            with torch.no_grad():
                emb_out = model(examples)      # (N, 128)
                emb = emb_out.numpy()   # (128,)

        embeddings_list.append(emb)
        files_list.append(path)

    except Exception as e:
        print("Error processing", path, e)
        # Also store zero embedding for corrupted files
        embeddings_list.append(np.zeros(128, dtype=np.float32))
        files_list.append(path)
        continue

# Convert final embeddings list to numpy array
embeddings_array = np.vstack(embeddings_list)
print("Final embeddings shape:", embeddings_array.shape)


  examples = torch.tensor(examples).float()
 45%|████▍     | 40644/91254 [58:39<2:06:19,  6.68it/s]  

Empty or too short, replacing with zeros: data/rawaudio/Fq107k8_9vk_0.wav


 63%|██████▎   | 57302/91254 [1:54:35<1:26:15,  6.56it/s]

Empty or too short, replacing with zeros: data/rawaudio/zhopcEHglEI_0.wav


 73%|███████▎  | 66675/91254 [2:25:06<53:29,  7.66it/s]  


KeyboardInterrupt: 

In [3]:
len(embeddings_list)

NameError: name 'embeddings_list' is not defined

In [None]:
embeddings_array = np.vstack(embeddings_list)
print("Final embeddings shape:", embeddings_array.shape)


In [None]:
!nvidia-smi

In [19]:
clean_embeddings = []
bad_indices = []

for i, emb in enumerate(embeddings_list):
    emb = np.array(emb)              # ensure ndarray
    emb = emb.reshape(-1)            # flatten to 1D
    if emb.shape[0] != 128:
        print("Fixing index", i, "shape:", emb.shape)
        emb = np.zeros(128, np.float32)   # replace invalid shapes
        bad_indices.append(i)
    clean_embeddings.append(emb)


Fixing index 1417 shape: (1,)
Fixing index 1655 shape: (1,)
Fixing index 2809 shape: (1,)
Fixing index 5606 shape: (1,)
Fixing index 9450 shape: (1,)
Fixing index 10197 shape: (1,)
Fixing index 10835 shape: (1,)
Fixing index 13767 shape: (1,)
Fixing index 18371 shape: (1,)
Fixing index 18454 shape: (1,)
Fixing index 18626 shape: (1,)
Fixing index 20174 shape: (1,)
Fixing index 23426 shape: (1,)
Fixing index 23458 shape: (1,)
Fixing index 24988 shape: (1,)
Fixing index 26409 shape: (1,)
Fixing index 27723 shape: (1,)
Fixing index 28678 shape: (1,)
Fixing index 28794 shape: (1,)
Fixing index 29492 shape: (1,)
Fixing index 30061 shape: (1,)
Fixing index 30539 shape: (1,)
Fixing index 30914 shape: (1,)
Fixing index 35198 shape: (1,)
Fixing index 36746 shape: (1,)
Fixing index 38458 shape: (1,)
Fixing index 41850 shape: (1,)


In [20]:
embeddings_array = np.vstack(clean_embeddings)
print("Final embeddings shape:", embeddings_array.shape)

Final embeddings shape: (43728, 128)


In [21]:
np.save("audio_embeddings2.npy", embeddings_array)

np.save("audio_filenames2.npy", np.array(files_list))

print("Saved embeddings to", "audio_embeddings2.npy")

Saved embeddings to audio_embeddings2.npy


### Final Embeds


In [22]:
import numpy 

In [24]:
data1 = np.load("audio_embeddings.npy")
data2 = np.load("audio_embeddings1.npy")
data3 = np.load("audio_embeddings2.npy")


print(f"Shape of Data 1 :- {data1.shape}")
print(f"Shape of Data 2 :- {data2.shape}")
print(f"Shape of Data 3 :- {data3.shape}")

Shape of Data 1 :- (16861, 128)
Shape of Data 2 :- (6087, 128)
Shape of Data 3 :- (43728, 128)


In [25]:
X_embeds = np.concat([data1 , data2 , data3] , axis=0)
print(f"Shape of embeds {X_embeds.shape}")

Shape of embeds (66676, 128)


In [26]:
caption1 = np.load("audio_filenames.npy")
caption2 = np.load("audio_filenames1.npy")
caption3 = np.load("audio_filenames2.npy")

print(f"Shape of caption 1 :- {caption1.shape}")
print(f"Shape of caption 2 :- {caption2.shape}")
print(f"Shape of caption 3 :- {caption3.shape}")

Shape of caption 1 :- (16861,)
Shape of caption 2 :- (6087,)
Shape of caption 3 :- (43728,)


In [29]:
y_filename = np.concat([caption1 , caption2 , caption3] , axis=0)
print(f"y captions shape :- {y_filename.shape}")

y captions shape :- (66676,)


In [31]:
data.iloc[:66676 , 4:8]

Unnamed: 0,caption,caption3,caption5,path
0,a woman laughs and speaks while birds vocalize...,woman laugh speak,woman laugh speak bird vocalize,data/rawaudio/pgLXVFvo5GI_27.wav
1,a large vehicle idling accompanied by rapid be...,large vehicle idle,large vehicle idle accompany rapid,data/rawaudio/pgN-Why-duY_0.wav
2,several ducks quack while some liquid splashes,duck quack liquid,duck quack liquid splash splash,data/rawaudio/pgSCmHYy0eQ_320.wav
3,hissing noises from steam followed by a man ta...,hiss noise steam,hiss noise steam follow man,data/rawaudio/pgVhbq6W3Ow_30.wav
4,an objects is being crumpled,object crumple crumple,object crumple crumple crumple crumple,data/rawaudio/pgYs-4Trnek_120.wav
...,...,...,...,...
66671,a woman is giving a speech in a foreign langua...,woman give speech,woman give speech foreign language,data/rawaudio/705wLhzLSD0_30.wav
66672,a train horn sounds as crossing bells ring,train horn sound,train horn sound cross bell,data/rawaudio/kllCTj91GKk_240.wav
66673,insects buzz while a man speaks,insect buzz man,insect buzz man speak speak,data/rawaudio/klmuMkxO6Z4_240.wav
66674,an engine runs and humming occurs followed by ...,engine run humming,engine run humming occur follow,data/rawaudio/klhCtuwiOHU_100.wav


In [None]:
incorrects = 0
paths = data.iloc[:66676, -1].values

for index, filepath in enumerate(y_filename):
    if filepath != paths[index]:
        incorrects += 1

print("Incorrect count:", incorrects)
for index, filepath in enumerate(y_filename):
    if filepath != paths[index]:
        incorrects += 1

print("Incorrect count:", incorrects)

Incorrect count: 43728


In [39]:
y_filename[0].item() == paths[0]

True

In [35]:
y_filename[0].item()

'data/rawaudio/pgLXVFvo5GI_27.wav'

In [44]:
import numpy as np
import pandas as pd

# numpy file paths
np_paths = y_filename  # shape (66676,)

# dataframe containing path and caption
df = data[['path', 'caption']]          # adjust column names if needed

# create lookup dictionary
path_to_caption = dict(zip(df['path'], df['caption']))

# build captions for each numpy path
captions = [path_to_caption.get(p, "NO_CAPTION") for p in np_paths]

captions = np.array(captions)
print(captions.shape)


(66676,)


In [48]:
captions[66673] , y_filename[66673]

(np.str_('a train horn sounds as crossing bells ring'),
 np.str_('data/rawaudio/kllCTj91GKk_240.wav'))

In [49]:
print(f"X_embeds shape :- {X_embeds.shape}")
print(f"y_filename shape :- {y_filename.shape}")
print(f"caption shape :- {captions.shape}")

X_embeds shape :- (66676, 128)
y_filename shape :- (66676,)
caption shape :- (66676,)


In [50]:
np.save("embeds.npy" , X_embeds)
np.save("filename.npy" , y_filename)
np.save("captions.npy" , captions)

In [None]:
# Load saved arrays
X_embeds = np.load("embeds.npy", allow_pickle=True)
y_filename = np.load("filename.npy", allow_pickle=True)
captions = np.load("captions.npy", allow_pickle=True)

# Print shapes
print("Embeddings shape:", X_embeds.shape)
print("Filenames shape:", y_filename.shape)
print("Captions shape:", captions.shape)

# Check they align
print("\nSample entries:")
print("Embedding vector shape:", X_embeds[0].shape)
print("Filename[0]:", y_filename[0])
print("Caption[0]:", captions[0])


Embeddings shape: (66676, 128)
Filenames shape: (66676,)
Captions shape: (66676,)

Sample entries:
Embedding vector shape: (128,)
Filename[0]: data/rawaudio/pgLXVFvo5GI_27.wav
Caption[0]: a woman laughs and speaks while birds vocalize and water splashes


### Creating sequence embeddings

In [40]:
import random 
paths = data['path'].tolist()
random.shuffle(paths)

In [41]:
len(paths)

91254

In [44]:

audio_folder = "data/rawaudio"         
save_file = "data/audio_embeddings.npy"     
device = "cpu"
model = vggish()
model.to(device)
model.eval()    

VGG(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
 

In [None]:
import numpy as np
import torch
import librosa
from tqdm import tqdm

embeddings_list = []
files_list = []

TARGET_PATCHES = 10   # desired number of patches

for idx, path in enumerate(tqdm(paths)):
    if idx > 10000 : 
        break 

    try:
        audio, sr = librosa.load(path, sr=16000, mono=True)
        examples = vggish_input.waveform_to_examples(audio, sr)   # (N, 96, 64)

        if examples.shape[0] == 0:
            print("Empty or too short, replacing with zeros:", path)
            emb = np.zeros((TARGET_PATCHES, 128), dtype=np.float32)

        else:
            examples = torch.tensor(examples).float()

            with torch.no_grad():
                emb_out = model(examples)     # (N, 128)

            N = emb_out.shape[0]

            # Pad or truncate to fixed length (10,128)
            if N < TARGET_PATCHES:
                pad = torch.zeros(TARGET_PATCHES - N, 128)
                emb = torch.cat([emb_out, pad], dim=0)  # (10, 128)
            elif N > TARGET_PATCHES:
                emb = emb_out[:TARGET_PATCHES, :]       # (10, 128)
            else:
                emb = emb_out                            # already correct shape

            emb = emb.numpy().astype(np.float32)

        embeddings_list.append(emb)   # (10*128 = 1280)
        files_list.append(path)

    except Exception as e:
        print("Error processing", path, e)
        embeddings_list.append(np.zeros(TARGET_PATCHES * 128, dtype=np.float32))
        files_list.append(path)
        continue

# Convert final embeddings list to numpy array
embeddings_array = np.vstack(embeddings_list)      # shape: (num_files, 1280)
print("Final embeddings shape:", embeddings_array.shape)


  examples = torch.tensor(examples).float()
  0%|          | 76/91254 [00:16<4:23:57,  5.76it/s]

Error processing data/rawaudio/QvJIJtzTfbk_0.wav too many indices for tensor of dimension 1


  1%|          | 999/91254 [03:11<4:39:04,  5.39it/s]

Error processing data/rawaudio/6MfLQf7E8iE_0.wav too many indices for tensor of dimension 1


  3%|▎         | 3069/91254 [09:53<3:54:40,  6.26it/s]

Error processing data/rawaudio/VcD4ezit_sM_0.wav too many indices for tensor of dimension 1


  5%|▍         | 4155/91254 [13:27<4:00:31,  6.04it/s]

Error processing data/rawaudio/d2Ak3vmuaPE_0.wav too many indices for tensor of dimension 1


  6%|▌         | 5161/91254 [16:41<3:44:20,  6.40it/s]

Error processing data/rawaudio/OmF-45c0B4E_0.wav too many indices for tensor of dimension 1


  8%|▊         | 6994/91254 [22:34<4:12:56,  5.55it/s]

Error processing data/rawaudio/iE571vq9UFI_0.wav too many indices for tensor of dimension 1


  8%|▊         | 7506/91254 [24:13<3:49:13,  6.09it/s]

Error processing data/rawaudio/TZLWayyX2TY_0.wav too many indices for tensor of dimension 1


  9%|▉         | 8380/91254 [27:02<3:23:46,  6.78it/s]

Empty or too short, replacing with zeros: data/rawaudio/_Ox52y_7Hoo_0.wav


 11%|█         | 10001/91254 [32:08<4:21:09,  5.19it/s]


Final embeddings shape: (10001, 1280)


In [46]:
embeddings_array = embeddings_array.reshape(-1, 10, 128)
print(embeddings_array.shape)

(10001, 10, 128)


In [61]:
np.save("token_embeddings.npy", embeddings_array)

np.save("token_files.npy", np.array(files_list))

print("Saved embeddings to", "token_embeddings.npy")

Saved embeddings to token_embeddings.npy


Mapping captions for time based data

In [34]:
import numpy as np
filepaths = np.load("token_files.npy")
filepaths.shape

(10001,)

In [None]:
import pandas as pd


np_paths = filepaths 
df = data[['path', 'caption']]          
path_to_caption = dict(zip(df['path'], df['caption']))

# build captions for each numpy path
captions = [path_to_caption.get(p, "NO_CAPTION") for p in np_paths]

captions = np.array(captions)
print(captions.shape)


(10001,)


In [36]:
captions

array(['a pig oinks nearby while a man speaks',
       'a man speaks, people talk in the background',
       'someone burps nearby while several people laugh', ...,
       'several birds chirp in the distance with some light clicks',
       'several frogs croaking loudly',
       'rustling occurs while wind blows and a man speaks as birds chirp'],
      shape=(10001,), dtype='<U206')

In [37]:
np.save("token_captions.npy" , captions)

In [3]:
import numpy as np
captions = np.load("filename.npy")
embeds = np.load("embeds.npy")
print(captions.shape)
print(embeds.shape)

(66676,)
(66676, 128)
