In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Exploratory Data Analysis - Bengali.AI Kaggle Competition

In [4]:
# Install these dependencies
!pip install -U sliceguard pandas mutagen tqdm librosa

Collecting sliceguard
  Using cached sliceguard-0.0.10-py3-none-any.whl (17 kB)
Collecting pandas
  Using cached pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Collecting mutagen
  Using cached mutagen-1.46.0-py3-none-any.whl (193 kB)
Collecting tqdm
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting librosa
  Using cached librosa-0.10.0.post2-py3-none-any.whl (253 kB)
Collecting hnne>=0.1.8
  Using cached hnne-0.1.9.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting umap-learn>=0.5.3
  Using cached umap-learn-0.5.3.tar.gz (88 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting plotly>=5.15.0
  Using cached plotly-5.15.0-py2.py3-none-any.whl (15.5 MB)
Collecting scikit-learn>=1.2.2
  Using cached scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
Collecting datasets>=2.13.1
  Using cached datasets-2.14.2-py3-none-any.whl (518 kB)
Collecting torchaudio>=2.0.2
 

In [2]:
# The imports you will need
import os
import pickle
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import plotly.express as px
from mutagen.mp3 import MP3
from renumics import spotlight
from renumics.spotlight import Audio, Embedding
from sliceguard import SliceGuard
from sliceguard.embeddings import generate_audio_embeddings, generate_text_embeddings
from tqdm import tqdm

In [3]:
# Configure the path to your dataset here.
# The location can be overridden without editing the notebook by setting the
# BENGALIAI_DATASET_DIR environment variable; the previous hard-coded path is
# kept as the default so existing setups keep working unchanged.
DATASET_DIR = os.environ.get(
    "BENGALIAI_DATASET_DIR", "/home/daniel/data/bengaliai/bengaliai-speech"
)
dataset_dir = Path(DATASET_DIR)

In [4]:
# Read the training metadata and, in the same expression, derive the mp3
# location of every clip: <dataset_dir>/train_mp3s/<id>.mp3
mp3_prefix = str(dataset_dir / "train_mp3s") + "/"
df = pd.read_csv(dataset_dir / "train.csv").assign(
    path=lambda frame: mp3_prefix + frame["id"] + ".mp3"
)

In [5]:
# This is just for running the code more quickly in dev
# df = df.sample(50)

In [6]:
# Number of rows in the loaded metadata frame
df.shape[0]

963636

# Feature Extraction

## Simple Audio Features

In [8]:
# Scalar per-clip audio descriptors: RMS statistics (mean / max / std over
# frames), mean spectral flatness and clip duration in seconds.
# Values are collected per column first and attached to the dataframe at the end.
scalar_features = {
    "audio_rms_mean": [],
    "audio_rms_max": [],
    "audio_rms_std": [],
    "audio_spectral_flatness_mean": [],
    "audio_length_s": [],
}

for audio_path in tqdm(df["path"], total=len(df)):
    # librosa.load is called with its defaults and yields (samples, sample rate)
    samples, sample_rate = librosa.load(audio_path)
    frame_rms = librosa.feature.rms(y=samples)

    scalar_features["audio_rms_mean"].append(frame_rms.mean())
    scalar_features["audio_rms_max"].append(frame_rms.max())
    scalar_features["audio_rms_std"].append(frame_rms.std())
    scalar_features["audio_spectral_flatness_mean"].append(
        librosa.feature.spectral_flatness(y=samples).mean()
    )
    # duration = number of samples / samples per second
    scalar_features["audio_length_s"].append(samples.shape[0] / sample_rate)

# One new dataframe column per collected feature, in the order declared above
for column, values in scalar_features.items():
    df[column] = values

# df.to_json("bengali_ai_features_scalar.json")

  0%|                                    | 298/963636 [00:03<2:58:52, 89.76it/s]


KeyboardInterrupt: 

## Audio Embedding Extraction

In [10]:
# Audio embeddings are computed in chunks of `chunk_size` rows; each chunk is
# pickled to `NNN_audio_embeddings.pkl`. Chunks whose file already exists are
# skipped, so an interrupted run can simply be re-executed to resume.
#
# Single-shot variant, kept for reference:
# audio_embeddings = generate_audio_embeddings(df["path"].values)
# with open("audio_embeddings.pkl", "wb") as f:
#     pickle.dump(audio_embeddings, f)
chunk_size = 10000

for i in range(0, len(df), chunk_size):
    chunk_id = i // chunk_size
    chunk_file = f"{chunk_id:03d}_audio_embeddings.pkl"
    if Path(chunk_file).is_file():
        # Only report work that is actually done; finished chunks are not regenerated.
        print(f"Skipping {chunk_file} (already exists).")
        continue
    print(f"Generating {chunk_file}...")
    chunk_audio_embeddings = generate_audio_embeddings(df["path"].values[i:i+chunk_size])
    # Write to a temporary name and rename afterwards: if the kernel is
    # interrupted mid-dump, no truncated file is left under the final name,
    # which the is_file() resume check above would otherwise skip forever.
    tmp_file = Path(chunk_file + ".tmp")
    with open(tmp_file, "wb") as f:
        pickle.dump(chunk_audio_embeddings, f)
    tmp_file.replace(chunk_file)
    

Generating 000_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 001_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 002_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 003_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 004_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 005_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 006_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 007_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 008_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 009_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 010_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 011_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 012_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 013_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 014_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 015_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 016_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 017_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 018_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 019_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 020_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 021_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 022_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 023_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 024_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 025_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 026_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 027_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 028_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 029_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 030_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 031_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 032_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 033_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 034_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 035_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 036_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 037_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 038_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 039_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 040_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 041_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 042_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 043_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 044_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 045_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 046_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 047_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 048_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 049_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 050_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 051_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 052_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 053_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 054_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 055_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 056_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 057_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 058_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 059_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 060_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 061_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 062_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 063_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 064_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 065_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 066_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 067_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 068_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 069_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 070_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 071_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 072_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 073_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 074_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 075_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 076_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 077_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 078_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 079_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 080_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 081_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 082_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 083_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 084_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 085_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 086_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 087_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 088_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 089_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 090_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 091_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 092_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 093_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 094_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 095_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating 096_audio_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.


Map:   0%|          | 0/3636 [00:00<?, ? examples/s]

## Text Embedding Extraction

In [8]:
# Text embeddings are computed in chunks of `chunk_size` rows; each chunk is
# pickled to `NNN_text_embeddings.pkl`. Chunks whose file already exists are
# skipped, so an interrupted run can simply be re-executed to resume.
#
# Single-shot variant, kept for reference:
# text_embeddings = generate_text_embeddings(df["sentence"].values)
# with open("text_embeddings.pkl", "wb") as f:
#     pickle.dump(text_embeddings, f)

chunk_size = 10000

for i in range(0, len(df), chunk_size):
    chunk_id = i // chunk_size
    chunk_file = f"{chunk_id:03d}_text_embeddings.pkl"
    if Path(chunk_file).is_file():
        # Only report work that is actually done; finished chunks are not regenerated.
        print(f"Skipping {chunk_file} (already exists).")
        continue
    print(f"Generating {chunk_file}...")
    chunk_text_embeddings = generate_text_embeddings(df["sentence"].values[i:i+chunk_size])
    # Write to a temporary name and rename afterwards: if the kernel is
    # interrupted mid-dump, no truncated file is left under the final name,
    # which the is_file() resume check above would otherwise skip forever.
    tmp_file = Path(chunk_file + ".tmp")
    with open(tmp_file, "wb") as f:
        pickle.dump(chunk_text_embeddings, f)
    tmp_file.replace(chunk_file)

Generating 000_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.
Generating 001_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.
Generating 002_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.
Generating 003_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.
Generating 004_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.
Generating 005_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.
Generating 006_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.
Generating 007_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessing None.
Generating 008_text_embeddings.pkl...
Embedding computation on cuda with batch size 1 and multiprocessin

In [None]:
# Attach the embeddings to the dataframe and persist the enriched table.
# The embeddings only exist on disk as `NNN_<kind>_embeddings.pkl` chunk files
# (the single-shot `audio_embeddings` / `text_embeddings` variables are never
# created), so they are read back from those files here.
chunk_size = 10000  # must equal the chunk size used when the pickles were written


def _load_embedding_rows(kind, n_rows, chunk_size):
    """Collect one plain Python list per dataframe row from the chunk pickles.

    Args:
        kind: Chunk family to read, "audio" or "text"; selects the files
            named `NNN_{kind}_embeddings.pkl` in the working directory.
        n_rows: Number of dataframe rows the embeddings must line up with.
        chunk_size: Rows per chunk file; determines how many files are read.

    Returns:
        A list with `n_rows` entries, in chunk (and therefore dataframe) order.

    Raises:
        FileNotFoundError: If an expected chunk file is missing.
        ValueError: If the chunks do not add up to exactly `n_rows` embeddings.
    """
    rows = []
    for start in range(0, n_rows, chunk_size):
        chunk_file = f"{start // chunk_size:03d}_{kind}_embeddings.pkl"
        # NOTE: pickle.load can execute arbitrary code; only read chunk files
        # that were produced by this notebook.
        with open(chunk_file, "rb") as f:
            chunk = pickle.load(f)
        # Assumes iterating a chunk yields one array-like (with .tolist()) per
        # sample, as the original per-embedding conversion did - TODO confirm.
        rows.extend(e.tolist() for e in chunk)
    if len(rows) != n_rows:
        raise ValueError(
            f"Loaded {len(rows)} {kind} embeddings but the dataframe has {n_rows} rows."
        )
    return rows


df["audio_embedding"] = _load_embedding_rows("audio", len(df), chunk_size)
df["text_embedding"] = _load_embedding_rows("text", len(df), chunk_size)

df.to_json("bengali_ai_features.json")

In [10]:
# Read every pickled text-embedding chunk back from disk, in chunk order.
# The result holds one entry per chunk file (not one entry per row).
# NOTE: pickle.load can execute arbitrary code; only load chunks written by this notebook.
all_text_embeddings = []

chunk_size = 10000
n_chunks = -(-len(df) // chunk_size)  # ceiling division: number of chunk files expected

for chunk_id in range(n_chunks):
    chunk_file = f"{chunk_id:03d}_text_embeddings.pkl"
    print(f"Loading {chunk_file}...")
    with open(chunk_file, "rb") as chunk_handle:
        all_text_embeddings.append(pickle.load(chunk_handle))

Loading 000_text_embeddings.pkl...
Loading 001_text_embeddings.pkl...
Loading 002_text_embeddings.pkl...
Loading 003_text_embeddings.pkl...
Loading 004_text_embeddings.pkl...
Loading 005_text_embeddings.pkl...
Loading 006_text_embeddings.pkl...
Loading 007_text_embeddings.pkl...
Loading 008_text_embeddings.pkl...
Loading 009_text_embeddings.pkl...
Loading 010_text_embeddings.pkl...
Loading 011_text_embeddings.pkl...
Loading 012_text_embeddings.pkl...
Loading 013_text_embeddings.pkl...
Loading 014_text_embeddings.pkl...
Loading 015_text_embeddings.pkl...
Loading 016_text_embeddings.pkl...
Loading 017_text_embeddings.pkl...
Loading 018_text_embeddings.pkl...
Loading 019_text_embeddings.pkl...
Loading 020_text_embeddings.pkl...
Loading 021_text_embeddings.pkl...
Loading 022_text_embeddings.pkl...
Loading 023_text_embeddings.pkl...
Loading 024_text_embeddings.pkl...
Loading 025_text_embeddings.pkl...
Loading 026_text_embeddings.pkl...
Loading 027_text_embeddings.pkl...
Loading 028_text_emb

In [9]:
# Read every pickled audio-embedding chunk back from disk, in chunk order.
# The result holds one entry per chunk file (not one entry per row).
# NOTE: pickle.load can execute arbitrary code; only load chunks written by this notebook.
all_audio_embeddings = []

chunk_size = 10000
n_chunks = -(-len(df) // chunk_size)  # ceiling division: number of chunk files expected

for chunk_id in range(n_chunks):
    chunk_file = f"{chunk_id:03d}_audio_embeddings.pkl"
    print(f"Loading {chunk_file}...")
    with open(chunk_file, "rb") as chunk_handle:
        all_audio_embeddings.append(pickle.load(chunk_handle))

Loading 000_audio_embeddings.pkl...


FileNotFoundError: [Errno 2] No such file or directory: '000_audio_embeddings.pkl'

# EDA

## Load the Feature-enriched Data

In [None]:
# Replace the in-memory frame with the feature-enriched table stored as JSON
df = pd.read_json(path_or_buf="bengali_ai_features.json")

## Basics about the raw data

In [None]:
# Report the number of rows and the available column names, one line each
print(
    f"Sample count is {len(df)}.",
    f"Dataframe contains the columns {df.columns.tolist()}.",
    sep="\n",
)

In [None]:
# Split ratio: count of samples for each value of the `split` column
print("##### Distribution between splits #####")
split_histogram = px.histogram(df, x="split")
split_histogram

In [None]:
# Duplicates: histogram over how often each distinct sentence occurs
print("##### Distribution on exact duplicates #####")
sentence_counts = df["sentence"].value_counts()
px.histogram(sentence_counts)

## Simple Audio Features

In [None]:
# One histogram per scalar audio feature, each preceded by its printed heading.
# (heading, dataframe column) pairs are listed in display order.
scalar_feature_plots = (
    ("##### Distribution of audio lengths #####", "audio_length_s"),
    ("##### Distribution of rms means #####", "audio_rms_mean"),
    ("##### Distribution of rms maxs #####", "audio_rms_max"),
    ("##### Distribution of rms stds #####", "audio_rms_std"),
    ("##### Distribution of spectral flatness #####", "audio_spectral_flatness_mean"),
)

for heading, column in scalar_feature_plots:
    print(heading)
    px.histogram(df, x=column).show()



## Audio Outliers

In [None]:
# Run SliceGuard.show on the dataframe, handing it the stored audio embeddings
# instead of letting it compute its own.
sg = SliceGuard()
# Stack the per-row embeddings into a single 2-D array
audio_embedding_matrix = np.vstack(df["audio_embedding"])
sg.show(
    df,
    ["audio_embedding"],
    precomputed_embeddings={"audio_embedding": audio_embedding_matrix},
    drop_reference="parent",
)

In [None]:
# Run SliceGuard.find_issues on the stored audio embeddings, then produce its report.
sg = SliceGuard()
# Stack the per-row embeddings into a single 2-D array
audio_embedding_matrix = np.vstack(df["audio_embedding"])
sg.find_issues(
    df,
    ["audio_embedding"],
    min_drop=0.02,
    min_support=5,
    drop_reference="parent",
    precomputed_embeddings={"audio_embedding": audio_embedding_matrix},
)
sg.report()

## Text Outliers

In [None]:
# Run SliceGuard.show on the dataframe, handing it the stored text embeddings
# instead of letting it compute its own.
sg = SliceGuard()
# Stack the per-row embeddings into a single 2-D array
text_embedding_matrix = np.vstack(df["text_embedding"])
sg.show(
    df,
    ["text_embedding"],
    precomputed_embeddings={"text_embedding": text_embedding_matrix},
    drop_reference="parent",
)

In [None]:
# Run SliceGuard.find_issues on the stored text embeddings, then produce its report.
sg = SliceGuard()
# Stack the per-row embeddings into a single 2-D array
text_embedding_matrix = np.vstack(df["text_embedding"])
sg.find_issues(
    df,
    ["text_embedding"],
    min_drop=0.1,
    min_support=10,
    precomputed_embeddings={"text_embedding": text_embedding_matrix},
    drop_reference="parent",
)
sg.report()

# Free EDA in Spotlight

In [None]:
# Open the whole dataframe in Spotlight, declaring which columns hold
# embeddings and which column holds the audio file paths.
spotlight_dtypes = {
    "audio_embedding": Embedding,
    "text_embedding": Embedding,
    "path": Audio,
}
spotlight.show(df, dtype=spotlight_dtypes)