In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil
from tqdm import tqdm, trange

In [21]:
# Locations of the two input tables.
TRAIN_CSV = 'dataset/mswc21/csv/TRAIN.csv'
VAL_CSV = 'dataset/mswc21/csv/VAL.csv'
# Directory handed to create_tests() as `pre_trains_dir`; it reads `<i>.csv` files from it.
PRE_CSVS = 'dataset/mswc21/experiments/kmeans_multilingual_128'
# Language codes used further down to filter `val_data` when counting labels.
languages = ['cs', 'uk', 'id', 'et']

# Load both tables in full.
train_data = pd.read_csv(TRAIN_CSV, delimiter=',')
val_data = pd.read_csv(VAL_CSV, delimiter=',')

# Disabled: restrict both tables to `languages` before any sampling happens.
# train_data = train_data[train_data['language'].isin(languages)]
# val_data = val_data[val_data['language'].isin(languages)]

In [22]:
# Preview the first rows of the training table.
train_data.head()

Unnamed: 0.1,Unnamed: 0,mode,label,path,language,gender
0,25890,train,этого,ru/clips/этого/common_voice_ru_18956507.wav,ru,m
1,61066,train,резолюции,ru/clips/резолюции/common_voice_ru_19157550.wav,ru,m
2,48765,train,менее,ru/clips/менее/common_voice_ru_19288549.wav,ru,m
3,654917,train,kunnen,nl/clips/kunnen/common_voice_nl_22221094.wav,nl,m
4,611222,val,door,nl/clips/door/common_voice_nl_17709306.wav,nl,m


In [23]:
# Preview the first rows of the validation table.
val_data.head()

Unnamed: 0.1,Unnamed: 0,mode,label,path,language,gender
0,31559,train,někdo,cs/clips/někdo/common_voice_cs_21312096.wav,cs,m
1,4261,val,може,uk/clips/може/common_voice_uk_21567719.wav,uk,n
2,14527,train,gvern,mt/clips/gvern/common_voice_mt_21172642.wav,mt,m
3,82364,train,jepang,id/clips/jepang/common_voice_id_19802457.wav,id,m
4,2294,train,untuk,id/clips/untuk/common_voice_id_19192652.wav,id,m


In [24]:
# Number of rows per label in the validation table.
counts = val_data['label'].value_counts()
# TODO: unfinished stub — `big_labels` was never assigned; complete or remove.
counts

saya       1000
inte       1000
jsem       1000
questa     1000
buvo        987
           ... 
meitsje     100
saat        100
võiks       100
прошу       100
чому        100
Name: label, Length: 510, dtype: int64

In [25]:
def print_stats(data):
    """Print the row count of `data`, then its per-(label, mode) non-null counts.

    Args:
        data: DataFrame containing at least the columns 'label' and 'mode'.

    Returns:
        None. Everything is written to stdout.
    """
    row_total = len(data)
    print(f'len: {row_total}')
    # One output row per (label, mode) pair; the remaining columns hold counts.
    per_label_mode = data.groupby(['label', 'mode']).count()
    print(per_label_mode)

In [26]:
# Row count and per-(label, mode) counts for the training table.
print_stats(train_data)

len: 688524
              Unnamed: 0  path  language  gender
label  mode                                     
aantal train          88    88        88      88
       val            24    24        24      24
acho   train         151   151       151     151
       val            38    38        38      38
acht   train         137   137       137     137
...                  ...   ...       ...     ...
әллә   val            19    19        19      19
әмма   train          82    82        82      82
       val            20    20        20      20
өчен   train         402   402       402     402
       val            71    71        71      71

[4124 rows x 4 columns]


In [27]:
# Row count and per-(label, mode) counts for the validation table.
print_stats(val_data)

len: 101654
               Unnamed: 0  path  language  gender
label   mode                                     
aasta   train         118   118       118     118
        val            30    30        30      30
aceasta train         148   148       148     148
        val            37    37        37      37
această train         191   191       191     191
...                   ...   ...       ...     ...
فضلك    val            42    42        42      42
ماذا    train         119   119       119     119
        val            30    30        30      30
هناك    train         171   171       171     171
        val            44    44        44      44

[1020 rows x 4 columns]


In [28]:
# Keep only validation rows whose language is in `languages`, then count their distinct labels.
label_data = val_data[val_data['language'].isin(languages)]
# NOTE(review): `label_data` is not referenced by any later cell shown here — confirm whether
# the random label draw that follows was meant to use it instead of the full `val_data`.
print(len(label_data['label'].unique()))

237


In [29]:
# Fix NumPy's global seed so the label draw below is repeatable.
np.random.seed(29)
# Draw 100 distinct labels (no replacement) from every label present in `val_data`.
val_labels = np.random.choice(val_data['label'].unique(), replace=False, size=100)
# TODO: leftover stub of an alternative `val_labels` assignment — complete or remove.
print(val_labels)

['každý' 'herkes' 'yılında' 'waard' 'buvo' 'sekiz' 'kaks' 'vara' 'тому'
 'takže' 'musí' 'андан' 'kokku' 'чейин' 'алып' 'tema' 'avro' 'lalu' 'bazı'
 'hija' 'ingen' 'nikdy' 'hästi' 'dokuz' 'väga' 'mille' 'wiedział' 'diğer'
 'лише' 'inimesed' 'adalah' 'kolla' 'saat' 'добре' 'něco' 'болчу' 'lucru'
 'yeni' 'täna' 'avem' 'rohkem' 'jestli' 'fakat' 'mida' 'ještě' 'praegu'
 'meta' 'азыр' 'göre' 'ministru' 'dobře' 'життя' 'إنها' 'selle' 'akan'
 'varför' 'banyak' 'اليوم' 'hade' 'acestea' 'kadar' 'βασιλιάς' 'peaks'
 'kabul' 'мене' 'люди' 'рішення' 'sest' 'bunun' 'kosova' 'kogu' 'başladı'
 'olnud' 'bile' 'менин' 'datang' 'ilgili' 'тобто' 'türkiye' 'эмес' 'enne'
 'jier' 'βασιλόπουλο' 'элек' 'metu' 'bych' 'tomu' 'böyle' 'inte' 'дуже'
 'sona' 'sõnul' 'önemli' 'biri' 'право' 'jekk' 'дагы' 'україні' 'його'
 'mhux']


In [30]:
def split_target(data, count, total_count):
    """Split one label's rows into an up-sampled training set and a validation set.

    The rows of `data` are shuffled once. The first `count` shuffled rows form
    the training base, which is repeated (in order) until `total_count` rows
    are available; the remaining shuffled rows become the validation set.

    Args:
        data: DataFrame with the rows to split. It is not modified.
        count: Number of distinct rows to use as the training base (>= 1).
        total_count: Number of training rows to return after repetition.

    Returns:
        (target_df, target_val_df):
          * target_df has a fresh 0..n-1 index and 'mode' set to 'train'.
            It holds `total_count` rows whenever at least one base row exists,
            even if `data` has fewer than `count` rows. It is empty when
            `data` is empty or `total_count` is not positive.
          * target_val_df keeps the shuffled index and has 'mode' set to 'val'.

    Raises:
        ValueError: if `count` is smaller than 1.
    """
    if count < 1:
        raise ValueError(f'count must be a positive integer, got {count}')

    shuffled = data.sample(frac=1)
    base = shuffled.iloc[:count, :]
    base_size = len(base)  # may be smaller than `count` when `data` is short

    if base_size > 0 and total_count > 0:
        # Ceil-divide by the rows actually available so the repetition always
        # reaches `total_count`, then trim the overshoot.
        repeats = (total_count + base_size - 1) // base_size
        target_df = pd.concat([base] * repeats, ignore_index=True).iloc[:total_count, :]
    else:
        target_df = base.iloc[:0, :].reset_index(drop=True)

    # `assign` returns new frames, so no column is written onto a slice and
    # no chained-assignment warning is triggered.
    target_df = target_df.assign(mode='train')
    target_val_df = shuffled.iloc[count:, :].assign(mode='val')
    return target_df, target_val_df

def create_random_test(train_data, val_data, target_data, non_target_val_data, target, target_base_count, target_total_count, non_target_train_count):
    """Assemble one shuffled experiment table for a single target.

    The table is the concatenation of the target train/val split produced by
    split_target(), a random sample of `non_target_train_count` rows of
    `train_data`, and `non_target_val_data` as given.

    Args:
        train_data: DataFrame the non-target training rows are sampled from.
        val_data: Accepted but not used by this function.
        target_data: Rows of the target; passed to split_target().
        non_target_val_data: DataFrame appended unchanged.
        target: Accepted but not used by this function.
        target_base_count: `count` argument for split_target().
        target_total_count: `total_count` argument for split_target().
        non_target_train_count: Number of rows to sample from `train_data`.

    Returns:
        The combined DataFrame with its rows in random order.
    """
    target_train_rows, target_val_rows = split_target(target_data, target_base_count, target_total_count)
    background_train_rows = train_data.sample(non_target_train_count)
    pieces = [target_train_rows, target_val_rows, background_train_rows, non_target_val_data]
    combined = pd.concat(pieces, ignore_index=True)
    # Final shuffle so the four parts are interleaved.
    return combined.sample(frac=1)

In [31]:
def create_tests(train_data, val_data, val_labels, target_base_count, target_total_count, non_target_train_count, val_count, exp_count, pre_trains_dir, seed=29):
    """Build `exp_count` shuffled experiment tables for every label in `val_labels`.

    Each table concatenates, then shuffles:
      * the train/val split of the label's rows from split_target(),
      * the non-target training rows of the current experiment,
      * a validation pool shared by all tables.

    The shared pool is drawn once: `val_count // 2` rows with mode 'val' from
    `train_data`, plus `val_count // 2` rows with mode 'val' from the part of
    `val_data` whose label is not in `val_labels`.

    Args:
        train_data: DataFrame with at least the column 'mode'.
        val_data: DataFrame with at least the columns 'label' and 'mode'.
        val_labels: Iterable of target labels.
        target_base_count: `count` argument for split_target().
        target_total_count: `total_count` argument for split_target().
        non_target_train_count: Rows sampled from the mode-'train' part of
            `train_data` per experiment. Ignored when `pre_trains_dir` is set.
        val_count: Total size of the shared validation pool.
        exp_count: Number of experiments to generate per label.
        pre_trains_dir: If truthy, experiment `i` reads its non-target training
            rows from `<pre_trains_dir>/<i>.csv` instead of sampling them.
        seed: Value passed to np.random.seed() before any sampling. Defaults
            to 29; pass None to leave NumPy's global random state untouched.

    Returns:
        dict mapping each label to a list of `exp_count` DataFrames.
    """
    if seed is not None:
        np.random.seed(seed)

    # Validation rows that every generated table shares.
    train_non_target_val = train_data[train_data['mode'] == 'val'].sample(n=val_count // 2)
    train_train_data = train_data[train_data['mode'] == 'train']

    is_target = val_data['label'].isin(val_labels)
    all_targets = val_data[is_target]
    val_data_clean = val_data[~is_target]
    val_non_target_val = val_data_clean[val_data_clean['mode'] == 'val'].sample(n=val_count // 2)
    all_val = pd.concat([train_non_target_val, val_non_target_val], ignore_index=True).sample(frac=1)

    result = {label: [] for label in val_labels}
    target_dfs = {label: all_targets[all_targets['label'] == label] for label in val_labels}

    for i in trange(exp_count):
        # One set of non-target training rows per experiment, reused for all labels.
        if pre_trains_dir:
            non_target_train = pd.read_csv(os.path.join(pre_trains_dir, f'{i}.csv'), delimiter=',')
        else:
            non_target_train = train_train_data.sample(n=non_target_train_count)
        # leave=False: drop each finished inner bar so the output is not flooded.
        for label in tqdm(val_labels, leave=False):
            targets_train, targets_val = split_target(target_dfs[label], target_base_count, target_total_count)
            total_df = pd.concat([targets_train, targets_val, non_target_train, all_val], ignore_index=True).sample(frac=1)
            result[label].append(total_df)
    return result

In [32]:
# Silence pandas' chained-assignment warning globally for the rest of the session.
pd.options.mode.chained_assignment = None
# Generate 10 experiment tables per label from the pre-generated CSVs in PRE_CSVS.
dfs = create_tests(
    train_data,
    val_data,
    val_labels,
    target_base_count=10,
    target_total_count=100,
    non_target_train_count=200,
    val_count=10000,
    exp_count=10,
    pre_trains_dir=PRE_CSVS,
)

  0%|                                                    | 0/10 [00:00<?, ?it/s]
  0%|                                                   | 0/100 [00:00<?, ?it/s][A
 20%|████████▏                                | 20/100 [00:00<00:00, 197.08it/s][A
 40%|████████████████▍                        | 40/100 [00:00<00:00, 163.33it/s][A
 61%|█████████████████████████                | 61/100 [00:00<00:00, 180.38it/s][A
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 191.28it/s][A
 10%|████▍                                       | 1/10 [00:00<00:04,  1.88it/s]
  0%|                                                   | 0/100 [00:00<?, ?it/s][A
 21%|████████▌                                | 21/100 [00:00<00:00, 202.56it/s][A
 42%|█████████████████▏                       | 42/100 [00:00<00:00, 195.03it/s][A
 67%|███████████████████████████▍             | 67/100 [00:00<00:00, 216.42it/s][A
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 212.39it/s][

In [34]:
# Inspect the first experiment table generated for the label 'дуже'.
test = dfs['дуже'][0]
print(len(test))
is_train = test['mode'] == 'train'
print(test[is_train])
# Use one combined mask: indexing the already-filtered frame with a mask built
# from the unfiltered one makes pandas reindex the mask and emit a UserWarning.
print(test[is_train & (test['label'] == 'дуже')])

10505
     Unnamed: 0   mode   label                                         path  \
439      231669  train   булып   tt/clips/булып/common_voice_tt_17630443.pt   
14        88110  train    дуже   uk/clips/дуже/common_voice_uk_21361279.wav   
63        39002  train    дуже   uk/clips/дуже/common_voice_uk_22047582.wav   
476      162926  train  subite  eo/clips/subite/common_voice_eo_20306930.pt   
65         9650  train    дуже   uk/clips/дуже/common_voice_uk_20935760.wav   
..          ...    ...     ...                                          ...   
392      479454  train    moet    nl/clips/moet/common_voice_nl_18646316.pt   
15         9650  train    дуже   uk/clips/дуже/common_voice_uk_20935760.wav   
22        81991  train    дуже   uk/clips/дуже/common_voice_uk_21650737.wav   
445      386697  train   будет   ru/clips/будет/common_voice_ru_19797248.pt   
27        81622  train    дуже   uk/clips/дуже/common_voice_uk_21487102.wav   

    language gender  
439       tt      m  
1

  print(test[test['mode']=='train'][test['label']=='дуже'])


In [35]:
# Write every experiment table to PATH as `<label>_<i>.csv` with a binary label column.
# NOTE(review): the directory name ends in `_cs_uk_id_et`, but `val_labels` is drawn from
# all of `val_data` — confirm the intended language subset.
PATH = 'dataset/mswc21/experiments/kmeans_multilingual_128_cs_uk_id_et'
os.makedirs(PATH, exist_ok=True)
for label in tqdm(val_labels):
    for i, experiment_df in enumerate(dfs[label]):
        # Relabel a copy: the frames in `dfs` keep their original labels, so this
        # cell can be re-run and later cells can still filter on the real label.
        binary_labels = np.where(experiment_df['label'] == label, 'target', '_unknown')
        binary_df = experiment_df.assign(label=binary_labels)
        # The index is written as well, matching the previous file layout.
        binary_df.to_csv(os.path.join(PATH, f'{label}_{i}.csv'))

  0%|                                                   | 0/100 [00:00<?, ?it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 30%|█████████████▏                              | 3/10 [00:00<00:00, 29.27it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 33.47it/s][A
  1%|▍                                          | 1/100 [00:00<00:29,  3.30it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 30%|█████████████▏                              | 3/10 [00:00<00:00, 25.04it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 29.96it/s][A
  2%|▊                                          | 2/100 [00:00<00:31,  3.09it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 32.69it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 30.99it/s][A
 

  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 32.49it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 33.86it/s][A
 25%|██████████▌                               | 25/100 [00:07<00:23,  3.17it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 36.10it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 35.77it/s][A
 26%|██████████▉                               | 26/100 [00:08<00:22,  3.27it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 35.93it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 35.70it/s][A
 27%|███████████▎                              | 27/100 [00:08<00:21,  3.34it/s]
 

100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 34.46it/s][A
 48%|████████████████████▏                     | 48/100 [00:15<00:16,  3.16it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 36.09it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 33.68it/s][A
 49%|████████████████████▌                     | 49/100 [00:15<00:15,  3.21it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 30.15it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 32.01it/s][A
 50%|█████████████████████                     | 50/100 [00:15<00:15,  3.20it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 30%|█████████████▏                              | 3/10 [00:00<00:00, 27.96it/s][A
1

  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 35.48it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 33.70it/s][A
 73%|██████████████████████████████▋           | 73/100 [00:22<00:07,  3.39it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 36.47it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 34.14it/s][A
 74%|███████████████████████████████           | 74/100 [00:22<00:07,  3.39it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 32.31it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 32.88it/s][A
 75%|███████████████████████████████▌          | 75/100 [00:23<00:07,  3.34it/s]
 

  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 34.83it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 35.08it/s][A
 97%|████████████████████████████████████████▋ | 97/100 [00:29<00:00,  3.28it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 36.03it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 34.83it/s][A
 98%|█████████████████████████████████████████▏| 98/100 [00:30<00:00,  3.33it/s]
  0%|                                                    | 0/10 [00:00<?, ?it/s][A
 40%|█████████████████▌                          | 4/10 [00:00<00:00, 33.53it/s][A
100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 33.30it/s][A
 99%|█████████████████████████████████████████▌| 99/100 [00:30<00:00,  3.32it/s]
 

In [109]:
# Non-target training rows of experiment 1 for 'každý', sorted by path.
test_dfs = dfs['každý']
test_0 = test_dfs[1]
keep_0 = (test_0['label'] != 'každý') & (test_0['mode'] == 'train')
test_0 = test_0[keep_0]
display(test_0.sort_values(by=['path']))

# Same selection for experiment 1 of 'herkes', so the two listings can be compared.
test_1 = dfs['herkes'][1]
keep_1 = (test_1['label'] != 'herkes') & (test_1['mode'] == 'train')
test_1 = test_1[keep_1]
display(test_1.sort_values(by=['path']))

Unnamed: 0.1,Unnamed: 0,mode,label,path,language,gender
243,501534,train,ankaŭ,eo/clips/ankaŭ/common_voice_eo_20884350.wav,eo,m
237,491123,train,antaŭ,eo/clips/antaŭ/common_voice_eo_18997147.wav,eo,m
280,493159,train,bela,eo/clips/bela/common_voice_eo_20486844.wav,eo,m
258,487469,train,denove,eo/clips/denove/common_voice_eo_19841710.wav,eo,m
263,507092,train,devas,eo/clips/devas/common_voice_eo_19308747.wav,eo,m
...,...,...,...,...,...,...
253,684977,train,белән,tt/clips/белән/common_voice_tt_18879708__2.wav,tt,m
359,677446,train,кайтып,tt/clips/кайтып/common_voice_tt_17786939.wav,tt,m
307,670351,train,тиеш,tt/clips/тиеш/common_voice_tt_17359125.wav,tt,m
414,685452,train,хәзер,tt/clips/хәзер/common_voice_tt_17523221.wav,tt,m


Unnamed: 0.1,Unnamed: 0,mode,label,path,language,gender
228,501534,train,ankaŭ,eo/clips/ankaŭ/common_voice_eo_20884350.wav,eo,m
222,491123,train,antaŭ,eo/clips/antaŭ/common_voice_eo_18997147.wav,eo,m
265,493159,train,bela,eo/clips/bela/common_voice_eo_20486844.wav,eo,m
243,487469,train,denove,eo/clips/denove/common_voice_eo_19841710.wav,eo,m
248,507092,train,devas,eo/clips/devas/common_voice_eo_19308747.wav,eo,m
...,...,...,...,...,...,...
238,684977,train,белән,tt/clips/белән/common_voice_tt_18879708__2.wav,tt,m
344,677446,train,кайтып,tt/clips/кайтып/common_voice_tt_17786939.wav,tt,m
292,670351,train,тиеш,tt/clips/тиеш/common_voice_tt_17359125.wav,tt,m
399,685452,train,хәзер,tt/clips/хәзер/common_voice_tt_17523221.wav,tt,m


In [86]:
# Training-mode rows of the first table in `test_dfs`.
first_experiment = test_dfs[0]
df = first_experiment[first_experiment['mode'] == 'train']
# Number of distinct labels among those rows.
print(len(df['label'].unique()))
# Number of those rows labelled 'každý'.
print(int((df['label'] == 'každý').sum()))

165
100
