In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil
from tqdm import tqdm, trange

In [2]:
TRAIN_CSV = 'dataset/mswc21/csv/TRAIN.csv'
VAL_CSV = 'dataset/mswc21/csv/VAL.csv'
PRE_CSVS = 'dataset/mswc21/experiments/kmeans_fbanks'
languages = ['cs', 'uk', 'id', 'et']

train_data = pd.read_csv(TRAIN_CSV, delimiter=',')
val_data = pd.read_csv(VAL_CSV, delimiter=',')

# train_data = train_data[train_data['language'].isin(languages)]
# val_data = val_data[val_data['language'].isin(languages)]

In [42]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,mode,label,path,language,gender
0,25890,train,этого,ru/clips/этого/common_voice_ru_18956507.wav,ru,m
1,61066,train,резолюции,ru/clips/резолюции/common_voice_ru_19157550.wav,ru,m
2,48765,train,менее,ru/clips/менее/common_voice_ru_19288549.wav,ru,m
3,654917,train,kunnen,nl/clips/kunnen/common_voice_nl_22221094.wav,nl,m
4,611222,val,door,nl/clips/door/common_voice_nl_17709306.wav,nl,m


In [43]:
val_data.head()

Unnamed: 0.1,Unnamed: 0,mode,label,path,language,gender
0,31559,train,někdo,cs/clips/někdo/common_voice_cs_21312096.wav,cs,m
1,4261,val,може,uk/clips/може/common_voice_uk_21567719.wav,uk,n
2,14527,train,gvern,mt/clips/gvern/common_voice_mt_21172642.wav,mt,m
3,82364,train,jepang,id/clips/jepang/common_voice_id_19802457.wav,id,m
4,2294,train,untuk,id/clips/untuk/common_voice_id_19192652.wav,id,m


In [3]:
counts = val_data['label'].value_counts()
# big_labels = 
counts

saya       1000
inte       1000
jsem       1000
questa     1000
buvo        987
           ... 
meitsje     100
saat        100
võiks       100
прошу       100
чому        100
Name: label, Length: 510, dtype: int64

In [4]:
def print_stats(data):
    print(f'len: {len(data)}')
    grouped = data.groupby(['label', 'mode']).count()
    print(grouped)

In [46]:
print_stats(train_data)

len: 688524
              Unnamed: 0  path  language  gender
label  mode                                     
aantal train          88    88        88      88
       val            24    24        24      24
acho   train         151   151       151     151
       val            38    38        38      38
acht   train         137   137       137     137
...                  ...   ...       ...     ...
әллә   val            19    19        19      19
әмма   train          82    82        82      82
       val            20    20        20      20
өчен   train         402   402       402     402
       val            71    71        71      71

[4124 rows x 4 columns]


In [47]:
print_stats(val_data)

len: 101654
               Unnamed: 0  path  language  gender
label   mode                                     
aasta   train         118   118       118     118
        val            30    30        30      30
aceasta train         148   148       148     148
        val            37    37        37      37
această train         191   191       191     191
...                   ...   ...       ...     ...
فضلك    val            42    42        42      42
ماذا    train         119   119       119     119
        val            30    30        30      30
هناك    train         171   171       171     171
        val            44    44        44      44

[1020 rows x 4 columns]


In [5]:
label_data = val_data[val_data['language'].isin(languages)]
print(len(label_data['label'].unique()))

237


In [6]:
np.random.seed(29)
val_labels = np.random.choice(val_data['label'].unique(), replace=False, size=100)
# val_labels = 
print(val_labels)

['každý' 'herkes' 'yılında' 'waard' 'buvo' 'sekiz' 'kaks' 'vara' 'тому'
 'takže' 'musí' 'андан' 'kokku' 'чейин' 'алып' 'tema' 'avro' 'lalu' 'bazı'
 'hija' 'ingen' 'nikdy' 'hästi' 'dokuz' 'väga' 'mille' 'wiedział' 'diğer'
 'лише' 'inimesed' 'adalah' 'kolla' 'saat' 'добре' 'něco' 'болчу' 'lucru'
 'yeni' 'täna' 'avem' 'rohkem' 'jestli' 'fakat' 'mida' 'ještě' 'praegu'
 'meta' 'азыр' 'göre' 'ministru' 'dobře' 'життя' 'إنها' 'selle' 'akan'
 'varför' 'banyak' 'اليوم' 'hade' 'acestea' 'kadar' 'βασιλιάς' 'peaks'
 'kabul' 'мене' 'люди' 'рішення' 'sest' 'bunun' 'kosova' 'kogu' 'başladı'
 'olnud' 'bile' 'менин' 'datang' 'ilgili' 'тобто' 'türkiye' 'эмес' 'enne'
 'jier' 'βασιλόπουλο' 'элек' 'metu' 'bych' 'tomu' 'böyle' 'inte' 'дуже'
 'sona' 'sõnul' 'önemli' 'biri' 'право' 'jekk' 'дагы' 'україні' 'його'
 'mhux']


In [7]:
def split_target(data, count, total_count):
    shuffled = data.sample(frac=1)
    target_df = (pd.concat([shuffled.iloc[:count, :]] * ((total_count + count - 1) // count), ignore_index=True)).iloc[:total_count, :]
    target_val_df = shuffled.iloc[count:, :]
    target_df['mode'] = 'train'
    target_val_df['mode'] = 'val'
    return target_df, target_val_df

def create_random_test(train_data, val_data, target_data, non_target_val_data, target, target_base_count, target_total_count, non_target_train_count):
    targets_train, targets_val = split_target(target_data, target_base_count, target_total_count)
    non_target_train = train_data.sample(non_target_train_count)
    total_df = pd.concat([targets_train, targets_val, non_target_train, non_target_val_data], ignore_index=True).sample(frac=1)
    return total_df

In [90]:
# df_path = 'dataset/mswc21/experiments/kmeans_all_1024_cs_uk_id_et/herkes_0.csv'
# temp_df = pd.read_csv(df_path, delimiter=',')
# print(temp_df['label'].value_counts())
# print(temp_df['mode'].value_counts())
# print(temp_df[temp_df['mode']=='train']['label'].value_counts())
# print(temp_df[temp_df['mode']=='val']['label'].value_counts())


_unknown    10200
target        202
Name: label, dtype: int64
val      10102
train      300
Name: mode, dtype: int64
_unknown    200
target      100
Name: label, dtype: int64
_unknown    10000
target        102
Name: label, dtype: int64


In [69]:
target_trains_and_vals = {}
directory = 'dataset/mswc21/experiments/kmeans_all_1024_cs_uk_id_et'
for name in os.listdir(directory):
    target_df = pd.read_csv(os.path.join(directory, name), delimiter=',')
    target_df = target_df[target_df['label']=='target']
    target_trains_and_vals[name.replace('.csv', '')] = target_df[target_df['mode'] == 'train'], target_df[target_df['mode']=='val'] 
print(list(target_trains_and_vals.keys())[:10])
display((target_trains_and_vals[list(target_trains_and_vals.keys())[0]]))

['kolla_6', 'biri_2', 'андан_4', 'добре_4', 'дуже_5', 'kolla_7', 'göre_9', 'jekk_0', 'kosova_0', 'рішення_6']


(       Unnamed: 0  Unnamed: 0.1   mode   label  \
 14             97         56539  train  target   
 114            12         65214  train  target   
 134            90         63906  train  target   
 199            58         32102  train  target   
 404            82         65214  train  target   
 ...           ...           ...    ...     ...   
 9796           52         65214  train  target   
 10064          80         63906  train  target   
 10181           8         32102  train  target   
 10260          85         14427  train  target   
 10278          96         33251  train  target   
 
                                                  path language gender  
 14     sv/clips/kolla/common_voice_sv-SE_21922994.wav       sv      m  
 114    sv/clips/kolla/common_voice_sv-SE_21923055.wav       sv      m  
 134    sv/clips/kolla/common_voice_sv-SE_22314170.wav       sv      m  
 199    sv/clips/kolla/common_voice_sv-SE_21577617.wav       sv      n  
 404    sv/clips/koll

In [8]:
def create_tests(train_data, val_data, val_labels, target_base_count, target_total_count, non_target_train_count, val_count, exp_count, pre_trains_dir):
    np.random.seed(29)
    train_non_target_val = train_data[train_data['mode']=='val'].sample(n=val_count // 2)
    train_train_data = train_data[train_data['mode'] == 'train']
    all_targets = val_data[val_data['label'].isin(val_labels)]
    val_data_clean = val_data[~val_data['label'].isin(val_labels)]
    val_non_target_val = val_data_clean[val_data_clean['mode']=='val'].sample(n=val_count//2)
    all_val = pd.concat([train_non_target_val, val_non_target_val], ignore_index=True).sample(frac=1)
    result = {x: list() for x in val_labels}
    target_dfs = {x: all_targets[all_targets['label'] == x] for x in val_labels}
    for i in trange(exp_count):
        if not pre_trains_dir:
            non_target_train = train_train_data.sample(n=non_target_train_count)
        else:
            non_target_train = pd.read_csv(os.path.join(pre_trains_dir, f'{i}.csv'), delimiter=',')
#             non_target_train = pd.read_csv(os.path.join(pre_trains_dir, f'{i}.csv'), delimiter=',').sample(n=non_target_train_count)
        for label in tqdm(val_labels):
#             targets_train, targets_val = target_trains_and_vals[f'{label}_{i}']
            targets_train, targets_val = split_target(target_dfs[label], target_base_count, target_total_count)
            total_df = pd.concat([targets_train, targets_val, non_target_train, all_val], ignore_index=True).sample(frac=1)
            result[label].append(total_df)
    return result

In [9]:
pd.options.mode.chained_assignment = None 
dfs = create_tests(train_data, val_data, val_labels, 10, 100, 200, 10000, 10, PRE_CSVS)

  0%|                                                    | 0/10 [00:00<?, ?it/s]
  0%|                                                   | 0/100 [00:00<?, ?it/s][A
 31%|████████████▋                            | 31/100 [00:00<00:00, 304.91it/s][A
 62%|█████████████████████████▍               | 62/100 [00:00<00:00, 301.72it/s][A
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 294.27it/s][A
 10%|████▍                                       | 1/10 [00:00<00:03,  2.89it/s]
  0%|                                                   | 0/100 [00:00<?, ?it/s][A
 28%|███████████▍                             | 28/100 [00:00<00:00, 267.41it/s][A
 58%|███████████████████████▊                 | 58/100 [00:00<00:00, 285.41it/s][A
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 289.09it/s][A
 20%|████████▊                                   | 2/10 [00:00<00:02,  2.86it/s]
  0%|                                                   | 0/100 [00:00<?, ?it/s][A
 

In [10]:
test = dfs['дуже'][0]
print(len(test))
print(test[test['mode']=='train'])
print(test[test['label']=='target'])

10505
     Unnamed: 0   mode       label  \
439      226670  train  wszystkich   
14        88110  train        дуже   
63        39002  train        дуже   
476      193010  train       piano   
65         9650  train        дуже   
..          ...    ...         ...   
392      155538  train        juna   
15         9650  train        дуже   
22        81991  train        дуже   
445      335107  train      euskal   
27        81622  train        дуже   

                                                path language gender  
439  pl/clips/wszystkich/common_voice_pl_21131676.pt       pl      m  
14        uk/clips/дуже/common_voice_uk_21361279.wav       uk      m  
63        uk/clips/дуже/common_voice_uk_22047582.wav       uk      m  
476       it/clips/piano/common_voice_it_20667128.pt       it      m  
65        uk/clips/дуже/common_voice_uk_20935760.wav       uk      m  
..                                               ...      ...    ...  
392        eo/clips/juna/common_voice_eo

In [12]:
PATH = 'dataset/mswc21/experiments/fbanks_cs_uk_id_et' 
os.makedirs(os.path.join(PATH), exist_ok=True)
for label in tqdm(val_labels):
    for i in trange(len(dfs[label])):
        dfs[label][i]['label'] = dfs[label][i]['label'].apply(lambda x: 'target' if x == label or x == 'target' else '_unknown')
        dfs[label][i].to_csv(os.path.join(PATH, f'{label}_{i}.csv'))

  0%|                                                   | 0/100 [00:00<?, ?it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 38.96it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 38.11it/s][A
  1%|▍                                          | 1/100 [00:00<00:26,  3.78it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 30%|█████████████▏                              | 3/10 [00:00<00:00, 28.61it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 33.37it/s][A
  2%|▊                                          | 2/100 [00:00<00:28,  3.48it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 37.50it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 35.76it/s][A
 

 40%|█████████████████▌                          | 4/10 [00:00<00:00, 34.58it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 32.90it/s][A
 25%|██████████▌                               | 25/100 [00:07<00:22,  3.41it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 35.59it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 36.10it/s][A
 26%|██████████▉                               | 26/100 [00:07<00:21,  3.45it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 34.84it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 35.26it/s][A
 27%|███████████▎                              | 27/100 [00:07<00:21,  3.46it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 

 49%|████████████████████▌                     | 49/100 [00:14<00:14,  3.45it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 33.72it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 33.49it/s][A
 50%|█████████████████████                     | 50/100 [00:14<00:14,  3.40it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 34.58it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 36.41it/s][A
 51%|█████████████████████▍                    | 51/100 [00:14<00:14,  3.45it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 39.14it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 37.30it/s][A
 

  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 36.82it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 36.09it/s][A
 74%|███████████████████████████████           | 74/100 [00:21<00:07,  3.52it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 39.76it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 38.06it/s][A
 75%|███████████████████████████████▌          | 75/100 [00:21<00:06,  3.59it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 37.63it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 37.34it/s][A
 76%|███████████████████████████████▉          | 76/100 [00:21<00:06,  3.62it/s]
 

  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 35.29it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 35.47it/s][A
 98%|█████████████████████████████████████████▏| 98/100 [00:28<00:00,  3.49it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 30%|█████████████▏                              | 3/10 [00:00<00:00, 24.20it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 31.76it/s][A
 99%|█████████████████████████████████████████▌| 99/100 [00:28<00:00,  3.37it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 29.95it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 32.26it/s][A
100%|█████████████████████████████████████████| 100/100 [00:29<00:00,  3.44it/s]


In [109]:
test_dfs = dfs['každý']
test_0 = test_dfs[1]
test_0 = test_0[test_0['label'] != 'každý']
test_0 = test_0[test_0['mode']=='train']
display(test_0.sort_values(by=['path']))

# test_1 = test_dfs[1]
test_1 = dfs['herkes'][1]
test_1 = test_1[test_1['label'] != 'herkes']
test_1 = test_1[test_1['mode']=='train']
display(test_1.sort_values(by=['path']))

Unnamed: 0.1,Unnamed: 0,mode,label,path,language,gender
243,501534,train,ankaŭ,eo/clips/ankaŭ/common_voice_eo_20884350.wav,eo,m
237,491123,train,antaŭ,eo/clips/antaŭ/common_voice_eo_18997147.wav,eo,m
280,493159,train,bela,eo/clips/bela/common_voice_eo_20486844.wav,eo,m
258,487469,train,denove,eo/clips/denove/common_voice_eo_19841710.wav,eo,m
263,507092,train,devas,eo/clips/devas/common_voice_eo_19308747.wav,eo,m
...,...,...,...,...,...,...
253,684977,train,белән,tt/clips/белән/common_voice_tt_18879708__2.wav,tt,m
359,677446,train,кайтып,tt/clips/кайтып/common_voice_tt_17786939.wav,tt,m
307,670351,train,тиеш,tt/clips/тиеш/common_voice_tt_17359125.wav,tt,m
414,685452,train,хәзер,tt/clips/хәзер/common_voice_tt_17523221.wav,tt,m


Unnamed: 0.1,Unnamed: 0,mode,label,path,language,gender
228,501534,train,ankaŭ,eo/clips/ankaŭ/common_voice_eo_20884350.wav,eo,m
222,491123,train,antaŭ,eo/clips/antaŭ/common_voice_eo_18997147.wav,eo,m
265,493159,train,bela,eo/clips/bela/common_voice_eo_20486844.wav,eo,m
243,487469,train,denove,eo/clips/denove/common_voice_eo_19841710.wav,eo,m
248,507092,train,devas,eo/clips/devas/common_voice_eo_19308747.wav,eo,m
...,...,...,...,...,...,...
238,684977,train,белән,tt/clips/белән/common_voice_tt_18879708__2.wav,tt,m
344,677446,train,кайтып,tt/clips/кайтып/common_voice_tt_17786939.wav,tt,m
292,670351,train,тиеш,tt/clips/тиеш/common_voice_tt_17359125.wav,tt,m
399,685452,train,хәзер,tt/clips/хәзер/common_voice_tt_17523221.wav,tt,m


In [86]:
df = test_dfs[0]
df = df[df['mode']=='train']
print(len(df['label'].unique()))
print(len(df[df['label'] == 'každý']))

165
100
