In [30]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset, DatasetDict
from collections import Counter
from wordcloud import WordCloud
from unidecode import unidecode

In [2]:
data = load_dataset("qanastek/MASSIVE", trust_remote_code=True)
print(data)

DatasetDict({
    train: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 587214
    })
    validation: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 103683
    })
    test: Dataset({
        features: ['id', 'locale', 'partition', 'scenario', 'intent', 'utt', 'annot_utt', 'tokens', 'ner_tags', 'worker_id', 'slot_method', 'judgments'],
        num_rows: 151674
    })
})


In [28]:
des_col = ['locale', 'partition', 'utt', 'tokens']

data1 = DatasetDict({
    'train': data['train'].select_columns(des_col),
    'validation': data['validation'].select_columns(des_col),
    'test': data['test'].select_columns(des_col)
})

print(data1)

DatasetDict({
    train: Dataset({
        features: ['locale', 'partition', 'utt', 'tokens'],
        num_rows: 587214
    })
    validation: Dataset({
        features: ['locale', 'partition', 'utt', 'tokens'],
        num_rows: 103683
    })
    test: Dataset({
        features: ['locale', 'partition', 'utt', 'tokens'],
        num_rows: 151674
    })
})


In [None]:
train_data = data1['train']
val_data = data1['validation']
test_data = data1['test']

In [32]:
locales = ['af-ZA', 'da-DK', 'de-DE', 'en-US', 'es-ES', 'fr-FR', 'fi-FI', 'hu-HU', 'is-IS', 
           'it-IT', 'jv-ID', 'lv-LV', 'ms-MY', 'nb-NO', 'nl-NL', 'pl-PL', 'pt-PT', 
           'ro-RO', 'ru-RU', 'sl-SL', 'sv-SE', 'sq-AL', 'sw-KE', 'tl-PH', 'tr-TR', 
           'vi-VN', 'cy-GB']

# Directory to store output files
output_dir = "utts_by_locale"
os.makedirs(output_dir, exist_ok=True)

def extr_utt(dataset, locales, output_dir):
    for locale in locales:
        locale_data = dataset.filter(lambda example: example['locale'] == locale)
        
        file_path = os.path.join(output_dir, f"{locale}.txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            for utt in locale_data['utt']:
                file.write(utt + "\n")                            # 1 utterance per line

        print(f"Saved {locale} utterances to {file_path}")

In [33]:
extr_utt(data1['train'], locales, output_dir)

Filter: 100%|██████████| 587214/587214 [00:12<00:00, 48297.02 examples/s]


Saved af-ZA utterances to utts_by_locale\af-ZA.txt


Filter: 100%|██████████| 587214/587214 [00:15<00:00, 37591.11 examples/s]


Saved da-DK utterances to utts_by_locale\da-DK.txt


Filter: 100%|██████████| 587214/587214 [00:16<00:00, 35147.14 examples/s]


Saved de-DE utterances to utts_by_locale\de-DE.txt


Filter: 100%|██████████| 587214/587214 [00:13<00:00, 44023.95 examples/s]


Saved en-US utterances to utts_by_locale\en-US.txt


Filter: 100%|██████████| 587214/587214 [00:14<00:00, 39370.78 examples/s]


Saved es-ES utterances to utts_by_locale\es-ES.txt


Filter: 100%|██████████| 587214/587214 [00:16<00:00, 34764.30 examples/s]


Saved fr-FR utterances to utts_by_locale\fr-FR.txt


Filter: 100%|██████████| 587214/587214 [00:15<00:00, 37846.32 examples/s]


Saved fi-FI utterances to utts_by_locale\fi-FI.txt


Filter: 100%|██████████| 587214/587214 [00:17<00:00, 33950.14 examples/s]


Saved hu-HU utterances to utts_by_locale\hu-HU.txt


Filter: 100%|██████████| 587214/587214 [00:25<00:00, 23302.95 examples/s]


Saved is-IS utterances to utts_by_locale\is-IS.txt


Filter: 100%|██████████| 587214/587214 [00:24<00:00, 23996.33 examples/s]


Saved it-IT utterances to utts_by_locale\it-IT.txt


Filter: 100%|██████████| 587214/587214 [00:14<00:00, 39283.68 examples/s]


Saved jv-ID utterances to utts_by_locale\jv-ID.txt


Filter: 100%|██████████| 587214/587214 [00:27<00:00, 21700.43 examples/s]


Saved lv-LV utterances to utts_by_locale\lv-LV.txt


Filter: 100%|██████████| 587214/587214 [00:20<00:00, 29032.81 examples/s]


Saved ms-MY utterances to utts_by_locale\ms-MY.txt


Filter: 100%|██████████| 587214/587214 [00:08<00:00, 66562.84 examples/s]


Saved nb-NO utterances to utts_by_locale\nb-NO.txt


Filter: 100%|██████████| 587214/587214 [00:08<00:00, 65499.93 examples/s]


Saved nl-NL utterances to utts_by_locale\nl-NL.txt


Filter: 100%|██████████| 587214/587214 [00:16<00:00, 36565.38 examples/s]


Saved pl-PL utterances to utts_by_locale\pl-PL.txt


Filter: 100%|██████████| 587214/587214 [00:16<00:00, 35288.39 examples/s]


Saved pt-PT utterances to utts_by_locale\pt-PT.txt


Filter: 100%|██████████| 587214/587214 [00:19<00:00, 30207.35 examples/s]


Saved ro-RO utterances to utts_by_locale\ro-RO.txt


Filter: 100%|██████████| 587214/587214 [00:15<00:00, 38157.59 examples/s]


Saved ru-RU utterances to utts_by_locale\ru-RU.txt


Filter: 100%|██████████| 587214/587214 [00:18<00:00, 32170.66 examples/s]


Saved sl-SL utterances to utts_by_locale\sl-SL.txt


Filter: 100%|██████████| 587214/587214 [00:17<00:00, 34273.29 examples/s]


Saved sv-SE utterances to utts_by_locale\sv-SE.txt


Filter: 100%|██████████| 587214/587214 [00:15<00:00, 37995.53 examples/s]


Saved sq-AL utterances to utts_by_locale\sq-AL.txt


Filter: 100%|██████████| 587214/587214 [00:15<00:00, 37817.85 examples/s]


Saved sw-KE utterances to utts_by_locale\sw-KE.txt


Filter: 100%|██████████| 587214/587214 [00:12<00:00, 47395.41 examples/s]


Saved tl-PH utterances to utts_by_locale\tl-PH.txt


Filter: 100%|██████████| 587214/587214 [00:14<00:00, 41423.17 examples/s]


Saved tr-TR utterances to utts_by_locale\tr-TR.txt


Filter: 100%|██████████| 587214/587214 [00:18<00:00, 32533.02 examples/s]


Saved vi-VN utterances to utts_by_locale\vi-VN.txt


Filter: 100%|██████████| 587214/587214 [00:17<00:00, 32844.89 examples/s]


Saved cy-GB utterances to utts_by_locale\cy-GB.txt
