In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from random import seed, shuffle
import os

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns

from sklearn.cluster import KMeans, SpectralClustering

from scipy import stats
from scipy.spatial.distance import directed_hausdorff, euclidean, cosine, pdist

from src.downsample import downsamp_audio
import src.embedding_extractor as ee
import src.dimension_reducer as dr
import src.distance_metrics as dm

from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

one_samp_toy_dir = '/Users/rahulbrito/Documents/projects/infantvoice/data/Full_Readings/one_samp_toy_downsamp' #toy dataset with one participant recording @16kHz
#two_samp_toy_dir = '/Users/rahulbrito/Documents/projects/infantvoice/data/Full_Readings/two_samp_toy_downsamp'#toy dataset with two participant recordings @16kHz
two_samp_toy_dir ='/Users/rahulbrito/Documents/projects/infantvoice/data/Full_Readings/020422_postpartum_moms_two_samp_toy'
down_sampled_dir = '/Users/rahulbrito/Documents/projects/infantvoice/data/Full_Readings/downsamp'#all the data, downsampled to 16kHz
embedding_dir = '/Users/rahulbrito/Documents/projects/infantvoice/data/embeddings' #location of embeddings save new embeddings load pre-generated ones from here
emb_models = ['emb_ami', 'emb','emb_voxceleb'] #names of pretrained embedding extractor models

#the directory has other csvs and i only want today's
file = [file for file in os.listdir(embedding_dir) if file.startswith("020322")] 

#create a dictionary where each item is the pd df of the embeddings extracted from that model
all_embs = {emb_models[index]: ee.load_embs(os.path.join(embedding_dir,file[index])) for index in np.arange(len(file))}

emb = all_embs[emb_models[1]]
emb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,part_id
0,-0.123332,-0.212858,0.579420,-0.097578,-0.348416,-0.181423,-0.026446,0.004830,0.373562,0.492513,...,-0.033463,0.539184,0.248723,-0.006912,0.885185,0.347235,-0.088158,0.350835,0.215521,2
1,-0.175574,-0.190697,0.373442,0.153301,-0.252046,-0.000443,-0.174977,0.069726,0.340666,0.484775,...,-0.019034,0.511352,0.304647,-0.250586,0.773904,0.265578,0.012553,0.119442,0.318374,2
2,-0.358425,-0.182912,0.427072,0.239118,-0.177666,0.021987,-0.171845,-0.052981,0.377177,0.213617,...,-0.019701,0.502201,0.067791,-0.121142,0.553801,0.351367,0.090260,0.035532,0.362458,2
3,-0.231662,-0.213927,0.421650,0.160902,-0.164612,-0.046126,-0.216291,-0.026678,0.370791,0.212087,...,-0.034590,0.474284,0.079180,-0.239202,0.653752,0.351659,-0.139255,0.085416,0.336875,2
4,-0.001955,-0.315279,0.244028,0.060818,-0.213323,-0.179864,-0.125265,-0.049915,0.100376,0.275971,...,-0.023115,0.279722,0.042393,-0.179478,0.528186,0.239384,-0.267313,0.082390,0.216767,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950,0.003837,-0.507340,0.241913,-0.064185,-0.116934,-0.340669,-0.225480,0.080772,0.469424,0.053858,...,-0.183640,0.102412,0.062516,-0.025111,0.107646,-0.047706,-0.055400,0.004393,-0.121033,18
951,0.041177,-0.217801,0.224912,0.174014,0.016183,-0.445073,-0.255178,0.310098,0.374388,-0.106795,...,-0.077296,0.195402,-0.358911,0.003702,0.440113,-0.225163,-0.264236,0.187272,-0.073514,18
952,-0.045413,-0.043777,-0.108840,0.190284,0.163413,-0.299228,-0.278333,0.273654,0.440643,-0.249633,...,0.131392,-0.187499,-0.336781,0.121000,0.344359,-0.287923,-0.294542,0.024769,-0.139907,18
953,-0.155613,-0.153981,-0.209405,0.260270,0.114834,-0.198231,-0.230558,0.270685,0.540873,-0.076377,...,-0.065397,0.112729,-0.216147,0.085064,0.263484,-0.390468,-0.101311,-0.104285,-0.192161,18


In [4]:
emb_down = ee.resample_data(emb,1)
emb_down

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,part_id
0,0.251402,-0.113531,-0.026699,0.108581,0.142024,0.418977,0.237859,0.393980,0.273138,-0.397868,...,-0.113664,0.304549,-0.165469,-0.016979,-0.103571,-0.282516,0.072623,0.202079,-0.244439,1.0
1,0.175649,0.415603,-0.185651,0.492292,0.439957,0.255900,-0.034511,0.590731,-0.008688,-0.011961,...,-0.161451,0.153969,-0.293978,-0.078539,-0.315034,0.137735,-0.146024,-0.051389,0.379904,1.0
2,0.295241,0.123536,0.171945,0.360826,0.111145,0.569962,0.052106,0.494581,0.391834,0.073833,...,-0.108068,0.377034,-0.187074,-0.003769,-0.031741,-0.147246,0.137283,0.247948,-0.004698,1.0
3,0.059938,0.326336,-0.095426,0.444787,-0.013973,0.295093,0.448570,0.874552,0.491555,-0.118922,...,-0.196050,0.207501,-0.116073,0.288460,-0.313195,-0.366917,-0.024117,0.281159,-0.002636,1.0
4,0.113114,0.438121,0.162601,0.528759,0.453783,0.189585,0.178091,0.523906,-0.006883,-0.180963,...,-0.393318,0.088066,-0.434710,-0.324366,-0.093148,0.329404,0.120588,0.165349,0.472863,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950,-0.235623,0.035262,0.044858,0.003117,-0.269843,-0.342872,-0.224355,0.143299,0.650434,-0.047440,...,-0.087205,0.193972,-0.260538,-0.027326,-0.153180,-0.207091,0.034564,-0.232545,-0.078117,18.0
951,-0.275940,0.141460,-0.078053,0.108069,0.028386,-0.114150,-0.055735,0.299621,0.246936,-0.366271,...,-0.199632,-0.243242,0.049068,-0.050638,0.179877,-0.244552,-0.118363,0.135510,-0.271970,18.0
952,-0.325205,0.146840,0.001157,-0.120821,-0.008782,-0.419672,-0.154488,0.149923,0.771798,0.170128,...,0.278863,-0.208031,-0.149460,-0.037691,0.469687,0.061540,-0.168367,-0.141447,-0.149901,18.0
953,-0.122438,-0.133445,-0.088729,0.003440,0.223398,0.102164,0.005353,0.132023,0.598925,-0.070919,...,0.209009,-0.019185,0.268690,-0.090822,0.208030,-0.029278,-0.406017,-0.105736,-0.236721,18.0


In [8]:
#Using tsne with perplexity=30 (default) since N/100<30, and learning rate of n/12
emb_tsne = dr.run_tsne(emb_down, perplexity=0, init='pca', learning_rate=emb_down.shape[0]//12)
emb_tsne

Unnamed: 0,dim0,dim1,part_id
0,96.835045,68.066704,1.0
1,-26.872305,84.733101,1.0
2,18.414139,131.738770,1.0
3,62.448788,55.836765,1.0
4,-76.956665,119.057587,1.0
...,...,...,...
950,-108.116798,74.565826,18.0
951,1.226079,-66.481216,18.0
952,83.504898,-12.157448,18.0
953,-122.835480,70.939842,18.0


In [9]:
emb_umap = dr.run_umap(emb_down)
emb_umap

OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Unnamed: 0,dim0,dim1,part_id
0,6.299535,21.864796,1.0
1,6.528137,21.635954,1.0
2,6.324068,21.840216,1.0
3,6.283904,21.880379,1.0
4,6.562484,21.601578,1.0
...,...,...,...
950,-3.553475,-8.973348,18.0
951,-3.780457,-9.019242,18.0
952,-3.497584,-8.530836,18.0
953,-3.373391,-9.035958,18.0


In [24]:
high_dim = emb_down

two_dim = emb_tsne
dist_tsne = dm.embedding_quality(
    high_dim.drop(columns='part_id').to_numpy(), 
    two_dim.drop(columns='part_id').to_numpy(), 
    high_dim.part_id.to_numpy(),
    'tsne',
    subsetsize=emb_down.shape[0])

two_dim = emb_umap
dist_umap = dm.embedding_quality(
    high_dim.drop(columns='part_id').to_numpy(), 
    two_dim.drop(columns='part_id').to_numpy(), 
    high_dim.part_id.to_numpy(),
    'umap',
    subsetsize=emb_down.shape[0])

In [25]:
#'knn', 'knc', 'cpd' are micro, meso, and marco structure per Kobak&Berens2019
pd.DataFrame(np.vstack((dist_tsne, dist_umap)), columns = ['micro', 'meso', 'macro','dim_reduc'], index = ['tsne', 'umap'])

Unnamed: 0,micro,meso,macro,dim_reduc
tsne,0.1414659685863874,0.6666666666666667,0.5050961668882564,tsne
umap,0.5025130890052356,0.611111111111111,0.2601195517937911,umap
