In [3]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from umap import UMAP

In [4]:
df = pd.read_json("/home/daniel/code/bengali-ai/bengali_ai_features_scalar.json")

In [5]:
df

Unnamed: 0,id,sentence,split,path,audio_rms_mean,audio_rms_max,audio_rms_std,audio_spectral_flatness_mean,audio_length_s
0,000005f3362c,ও বলেছে আপনার ঠিকানা!,train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.095909,0.237279,0.066077,0.000890,1.116009
1,00001dddd002,কোন মহান রাষ্ট্রের নাগরিক হতে চাও?,train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.021247,0.078481,0.018737,0.001006,2.448027
2,00001e0bc131,"আমি তোমার কষ্টটা বুঝছি, কিন্তু এটা সঠিক পথ না।",train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.035955,0.096530,0.026701,0.023012,4.716009
3,000024b3d810,নাচ শেষ হওয়ার পর সকলে শরীর ধুয়ে একসঙ্গে ভোজন...,train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.027113,0.226049,0.040667,0.022640,7.452018
4,000028220ab3,"হুমম, ওহ হেই, দেখো।",train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.118137,0.241331,0.079757,0.021770,2.160000
...,...,...,...,...,...,...,...,...,...
963631,ffffd07108b7,আপনার সাথে কথা বলতে চাই।,train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.107309,0.282542,0.085328,0.000881,2.340000
963632,ffffde37678a,সুতরাং পরের দিন আর-একটা ছবি না লইয়া চিত্রকর ছা...,train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.052331,0.229947,0.054987,0.019101,4.608027
963633,ffffe1b5f095,"সামাজিক কর্মকাণ্ডসমিতিতে গিয়ে দেখা যায়, শিল্পী...",train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.070625,0.182665,0.044776,0.004284,5.688027
963634,ffffec31636e,গুগল ম্যাপসের সাহায্যে খুঁজে পাওয়া যাবে কোন জা...,train,/home/daniel/data/bengaliai/bengaliai-speech/t...,0.079898,0.295988,0.072535,0.006693,5.400000


In [6]:
df.columns

Index(['id', 'sentence', 'split', 'path', 'audio_rms_mean', 'audio_rms_max',
       'audio_rms_std', 'audio_spectral_flatness_mean', 'audio_length_s'],
      dtype='object')

In [7]:

text_pkl_files = sorted(list(Path("/home/daniel/code/bengali-ai").glob("*_text_embeddings.pkl")))

all_text_embeddings = []

for text_pkl_file in text_pkl_files:
    with open(text_pkl_file, "rb") as f:
        embs = pickle.load(f)
    all_text_embeddings.append(embs)

text_embeddings = np.concatenate(all_text_embeddings, axis=0)

print(text_embeddings.shape)

(963636, 384)


In [8]:
for num_dimensions in range(138, text_embeddings.shape[1]):
    print(f"Trying num dimensions {num_dimensions}")
    pca = PCA(n_components=num_dimensions)
    pca.fit(text_embeddings)
    explained_variance = sum(pca.explained_variance_ratio_)
    print(f"Explained variance is {explained_variance}")
    if explained_variance >= 0.98:
        print(f"Found num dimensions {num_dimensions}")
        break

df["text_embedding"] = [e for e in pca.transform(text_embeddings)]

Trying num dimensions 135
Explained variance is 0.9792220090316912
Trying num dimensions 136
Explained variance is 0.9795272492117195
Trying num dimensions 137
Explained variance is 0.9798220671288702
Trying num dimensions 138
Explained variance is 0.9800726631260227
Found num dimensions 138


In [10]:
from sklearn.decomposition import IncrementalPCA
audio_pkl_files = sorted(list(Path("/home/daniel/code/bengali-ai").glob("*_audio_embeddings.pkl")))

inc_pca = IncrementalPCA(n_components=297)

for audio_pkl_file in audio_pkl_files:
    print(audio_pkl_file)
    with open(audio_pkl_file, "rb") as f:
        embs = pickle.load(f)
    inc_pca.partial_fit(embs)

print(sum(inc_pca.explained_variance_ratio_))

all_audio_embeddings = []

for audio_pkl_file in audio_pkl_files:
    with open(audio_pkl_file, "rb") as f:
        embs = pickle.load(f)
    all_audio_embeddings.append(inc_pca.transform(embs))

audio_embeddings = np.concatenate(all_audio_embeddings, axis=0)

/home/daniel/code/bengali-ai/000_audio_embeddings.pkl
/home/daniel/code/bengali-ai/001_audio_embeddings.pkl
/home/daniel/code/bengali-ai/002_audio_embeddings.pkl
/home/daniel/code/bengali-ai/003_audio_embeddings.pkl
/home/daniel/code/bengali-ai/004_audio_embeddings.pkl
/home/daniel/code/bengali-ai/005_audio_embeddings.pkl
/home/daniel/code/bengali-ai/006_audio_embeddings.pkl
/home/daniel/code/bengali-ai/007_audio_embeddings.pkl
/home/daniel/code/bengali-ai/008_audio_embeddings.pkl
/home/daniel/code/bengali-ai/009_audio_embeddings.pkl
/home/daniel/code/bengali-ai/010_audio_embeddings.pkl
/home/daniel/code/bengali-ai/011_audio_embeddings.pkl
/home/daniel/code/bengali-ai/012_audio_embeddings.pkl
/home/daniel/code/bengali-ai/013_audio_embeddings.pkl
/home/daniel/code/bengali-ai/014_audio_embeddings.pkl
/home/daniel/code/bengali-ai/015_audio_embeddings.pkl
/home/daniel/code/bengali-ai/016_audio_embeddings.pkl
/home/daniel/code/bengali-ai/017_audio_embeddings.pkl
/home/daniel/code/bengali-ai

In [20]:
df["audio_embedding"] = [e.astype(np.float32) for e in audio_embeddings]

In [12]:
df["audio"] = df["path"].str[45:]

In [13]:
df = df.drop(columns=["path"])

In [21]:
df

Unnamed: 0,id,sentence,split,audio_rms_mean,audio_rms_max,audio_rms_std,audio_spectral_flatness_mean,audio_length_s,text_embedding,audio_embedding,audio
0,000005f3362c,ও বলেছে আপনার ঠিকানা!,train,0.095909,0.237279,0.066077,0.000890,1.116009,"[-0.38994306, -0.097972825, -0.0056979237, 0.1...","[7.2484603, -7.543934, -4.7032595, -8.655085, ...",train_mp3s/000005f3362c.mp3
1,00001dddd002,কোন মহান রাষ্ট্রের নাগরিক হতে চাও?,train,0.021247,0.078481,0.018737,0.001006,2.448027,"[-0.1084951, -0.07227868, 0.053285215, -0.1264...","[11.617512, -0.12815115, -0.48695448, -2.82126...",train_mp3s/00001dddd002.mp3
2,00001e0bc131,"আমি তোমার কষ্টটা বুঝছি, কিন্তু এটা সঠিক পথ না।",train,0.035955,0.096530,0.026701,0.023012,4.716009,"[-0.1496445, -0.06911754, 0.11506425, -0.15017...","[0.13251832, 7.5647473, -4.9663234, -0.6222096...",train_mp3s/00001e0bc131.mp3
3,000024b3d810,নাচ শেষ হওয়ার পর সকলে শরীর ধুয়ে একসঙ্গে ভোজন...,train,0.027113,0.226049,0.040667,0.022640,7.452018,"[0.004609096, -0.010123715, -0.054717332, -0.0...","[-13.47718, 3.0267537, -4.3793607, 0.48654944,...",train_mp3s/000024b3d810.mp3
4,000028220ab3,"হুমম, ওহ হেই, দেখো।",train,0.118137,0.241331,0.079757,0.021770,2.160000,"[-0.36782098, 0.13273844, -0.16590148, 0.03617...","[-7.9069605, -4.8090887, -7.302672, -2.4339087...",train_mp3s/000028220ab3.mp3
...,...,...,...,...,...,...,...,...,...,...,...
963631,ffffd07108b7,আপনার সাথে কথা বলতে চাই।,train,0.107309,0.282542,0.085328,0.000881,2.340000,"[-0.21112299, -0.131941, 0.12897818, -0.110035...","[5.4855533, 5.2816706, -3.1930263, 7.416063, 2...",train_mp3s/ffffd07108b7.mp3
963632,ffffde37678a,সুতরাং পরের দিন আর-একটা ছবি না লইয়া চিত্রকর ছা...,train,0.052331,0.229947,0.054987,0.019101,4.608027,"[0.16658908, -0.026953176, 0.060520217, -0.065...","[5.314484, 5.8587527, 2.7001863, 6.3718195, 3....",train_mp3s/ffffde37678a.mp3
963633,ffffe1b5f095,"সামাজিক কর্মকাণ্ডসমিতিতে গিয়ে দেখা যায়, শিল্পী...",train,0.070625,0.182665,0.044776,0.004284,5.688027,"[0.2818417, -0.23723033, -0.018151812, 0.10785...","[2.5140448, -1.4130806, 4.5127854, 2.3490546, ...",train_mp3s/ffffe1b5f095.mp3
963634,ffffec31636e,গুগল ম্যাপসের সাহায্যে খুঁজে পাওয়া যাবে কোন জা...,train,0.079898,0.295988,0.072535,0.006693,5.400000,"[0.25751734, -0.099529, -0.060007177, -0.13138...","[-5.8520145, -0.5586258, -4.825342, 1.7372414,...",train_mp3s/ffffec31636e.mp3


In [58]:
df.to_parquet("bengaliai.parquet", index=False)