In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [6]:
from datasets import load_dataset
from bs4 import BeautifulSoup
import requests

def glosbe_romani_scraper(romanian_data_output_path, romani_data_output_path, url):
    """Scrape aligned example phrases from one Glosbe page and append them to the corpus files.

    This site helps with different dialects of Romani; Balkan and Carpathian
    were used because the site provides phrases as examples for them.
    Note: some samples were added manually due to problems with scraping.

    Args:
        romanian_data_output_path: Text file the Romanian phrases are appended to.
        romani_data_output_path: Text file the Romani phrases are appended to.
        url: Address of the Glosbe page to download.

    A pair is written only when both sides are non-empty after stripping, so
    the two files stay line-aligned.
    """
    # A timeout keeps one unresponsive page from hanging the whole scraping loop.
    request = requests.get(url, timeout=30)
    soup = BeautifulSoup(request.text, 'html.parser')

    # Left-hand cells are paired with right-hand cells purely by position.
    romanian_phrases = soup.find_all('div', attrs={'class': 'w-1/2 dir-aware-pr-1'})
    carpathian_romani_phrases = soup.find_all('div', attrs={'class': 'w-1/2 dir-aware-pl-1'})

    # Context managers close (and flush) both handles before returning; explicit
    # UTF-8 matches the encoding the corpus files are created with elsewhere.
    with open(romanian_data_output_path, 'a', encoding='utf-8') as file_writer_romanian, \
         open(romani_data_output_path, 'a', encoding='utf-8') as file_writer_romani:
        for romanian_phrase, romani_phrase in zip(romanian_phrases, carpathian_romani_phrases):
            ro_phrase_text = romanian_phrase.get_text().strip()
            roma_phrase_text = romani_phrase.get_text().strip()

            if ro_phrase_text and roma_phrase_text:
                file_writer_romani.write(roma_phrase_text + '\n')
                file_writer_romanian.write(ro_phrase_text + '\n')




def get_manually_added_phrases(path_manually_added_phrases, romanian_data_output_path, romani_data_output_path):
    """Copy the manually collected phrase pairs into fresh corpus files.

    These are the samples that could not be loaded with the scraper. The input
    lists each pair as a Romanian line followed by its Romani line, with pairs
    separated by a blank line. A pair is recognised either at the very start of
    the file or directly after a blank line.

    Args:
        path_manually_added_phrases: UTF-8 text file holding the manual pairs.
        romanian_data_output_path: File that receives the Romanian phrases.
        romani_data_output_path: File that receives the Romani phrases.

    Both output files are opened in 'w' mode, so any previous content is
    replaced. Pairs whose Romanian phrase was already seen are skipped, as are
    pairs with an empty side.
    """
    with open(path_manually_added_phrases, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    ro_list = []
    romani_list = []
    seen_ro = set()  # O(1) duplicate lookup instead of scanning ro_list

    for i, line in enumerate(lines):
        if line == '\n':
            start = i + 1   # a blank separator: the pair follows it
        elif i == 0:
            start = 0       # the file opens directly with a pair
        else:
            continue

        # A trailing blank line or a dangling single phrase has no complete
        # pair after it; skip instead of indexing past the end of the file.
        if start + 1 >= len(lines):
            continue

        ro_phrase = lines[start].strip()
        romani_phrase = lines[start + 1].strip()

        # Consecutive blank lines would otherwise yield a shifted pair with an
        # empty side, which breaks the alignment of the two output files.
        if not ro_phrase or not romani_phrase:
            continue

        if ro_phrase not in seen_ro:
            seen_ro.add(ro_phrase)
            ro_list.append(ro_phrase)
            romani_list.append(romani_phrase)

    with open(romanian_data_output_path, 'w', encoding='utf-8') as ro_file, \
         open(romani_data_output_path, 'w', encoding='utf-8') as roma_file:
        for ro_sample, roma_sample in zip(ro_list, romani_list):
            ro_file.write(ro_sample + '\n')
            roma_file.write(roma_sample + '\n')


def get_samples_from_dataset(dataset_name, romanian_data_output_path, romani_data_output_path):
    """Append Romanian/Romani pairs from a Hugging Face dataset to the corpus files.

    Args:
        dataset_name: Name passed to ``load_dataset``; it is loaded with
            ``lang1="ro"`` and ``lang2="rom"``.
        romanian_data_output_path: Text file the Romanian side is appended to.
        romani_data_output_path: Text file the Romani side is appended to.

    Only the ``train`` split is read. Each record of its ``translation`` column
    is indexed with the keys ``'ro'`` and ``'rom'``; a pair is written only
    when both values are non-empty.
    """
    # Load first so a failed download does not leave freshly opened files behind.
    dataset = load_dataset(dataset_name, lang1="ro", lang2="rom")

    # Explicit UTF-8 keeps these appends consistent with how the corpus files
    # are created, independent of the platform's default encoding.
    with open(romanian_data_output_path, 'a', encoding='utf-8') as ro_file, \
         open(romani_data_output_path, 'a', encoding='utf-8') as roma_file:
        for sample in dataset['train']['translation']:
            ro_sample, roma_sample = sample['ro'], sample['rom']

            if ro_sample and roma_sample:
                ro_file.write(ro_sample + '\n')
                roma_file.write(roma_sample + '\n')



def get_phrases_from_dictionary_course(path_dict, romanian_data_output_path, romani_data_output_path):
    """Append phrase pairs from a ``romani:romanian`` text file to the corpus files.

    The samples come from a Romanian - Kalderash Romani dictionary and a Romani
    course found online.

    Args:
        path_dict: Text file with one ``romani:romanian`` entry per line.
        romanian_data_output_path: Text file the Romanian phrases are appended to.
        romani_data_output_path: Text file the Romani phrases are appended to.

    Each line is split on ':'; the first field is the Romani phrase and the
    second the Romanian one (any further fields are ignored). Lines without a
    ':' and entries with an empty side are skipped, so the two output files
    stay line-aligned.
    """
    with open(path_dict, 'r', encoding='utf-8') as dict_file, \
         open(romanian_data_output_path, 'a', encoding='utf-8') as file_writer_romanian, \
         open(romani_data_output_path, 'a', encoding='utf-8') as file_writer_romani:
        for sample in dict_file:
            phrases = sample.split(':')

            # Blank or malformed lines have no second field; skip them rather
            # than failing with IndexError halfway through the file.
            if len(phrases) < 2:
                continue

            # Strip both sides so neither file picks up stray padding around ':'.
            romani_phrase, romanian_phrase = phrases[0].strip(), phrases[1].strip()
            if not romani_phrase or not romanian_phrase:
                continue

            file_writer_romani.write(romani_phrase + '\n')
            file_writer_romanian.write(romanian_phrase + '\n')




In [7]:
# Glosbe Romanian -> Carpathian Romani ("rmc") pages to scrape, one per search
# term. Every URL appears once, so no page is downloaded twice.
carpathian_romani_urls = [
    'https://ro.glosbe.com/ro/rmc/ce%20faci',
    'https://ro.glosbe.com/ro/rmc/minte',
    'https://ro.glosbe.com/ro/rmc/c%C3%A2t',
    'https://ro.glosbe.com/ro/rmc/bun',
    'https://ro.glosbe.com/ro/rmc/care',
    'https://ro.glosbe.com/ro/rmc/tu',
    'https://ro.glosbe.com/ro/rmc/deja',
    'https://ro.glosbe.com/ro/rmc/spus',
    'https://glosbe.com/ro/rmc/pace',
    'https://glosbe.com/ro/rmc/zi',
    'https://glosbe.com/ro/rmc/iubir',
    'https://ro.glosbe.com/ro/rmc/iubi',
    'https://glosbe.com/ro/rmc/%C3%AEnceput',
    'https://glosbe.com/ro/rmc/cine',
    'https://glosbe.com/ro/rmc/poate',
    'https://glosbe.com/ro/rmc/chiar',
    'https://glosbe.com/ro/rmc/atunci',
    'https://ro.glosbe.com/ro/rmc/unde',
    'https://ro.glosbe.com/ro/rmc/voi',
    'https://ro.glosbe.com/ro/rmc/noi',
    'https://ro.glosbe.com/ro/rmc/eu',
    'https://ro.glosbe.com/ro/rmc/el',
    'https://ro.glosbe.com/ro/rmc/ea',
    'https://ro.glosbe.com/ro/rmc/a%C8%99a',
    'https://ro.glosbe.com/ro/rmc/avut',
    'https://ro.glosbe.com/ro/rmc/azi',
    'https://ro.glosbe.com/ro/rmc/so%C5%A3',
    'https://ro.glosbe.com/ro/rmc/nu',
    'https://ro.glosbe.com/ro/rmc/da',
    'https://glosbe.com/ro/rmc/David',
    'https://glosbe.com/ro/rmc/un',
    'https://glosbe.com/ro/rmc/mai',
    'https://glosbe.com/ro/rmc/%C3%AEnc%C3%A2t',
    'https://glosbe.com/ro/rmc/pentru',
    'https://glosbe.com/ro/rmc/de%20la',
    'https://glosbe.com/ro/rmc/acum',
    'https://glosbe.com/ro/rmc/pot',
    'https://glosbe.com/ro/rmc/mic',
    'https://glosbe.com/ro/rmc/om',
    'https://ro.glosbe.com/ro/rmc/apoi',
]

# Assemble the raw parallel corpus: line N of the Romanian file is the
# translation of line N of the Romani file.
romanian_raw_path = 'romanian_data.txt'
romani_raw_path = 'romani_data.txt'

# The manual phrases go first: that step opens both files in 'w' mode, so it
# resets them, while every later step appends.
get_manually_added_phrases('manually_added_phrases.txt', romanian_raw_path, romani_raw_path)

for url in carpathian_romani_urls:
    glosbe_romani_scraper(romanian_raw_path, romani_raw_path, url)

get_phrases_from_dictionary_course(
    'romanian-romani_dictionary_course_phrases.txt',
    romanian_raw_path,
    romani_raw_path,
)
get_samples_from_dataset('bible_para', romanian_raw_path, romani_raw_path)

In [10]:
# Sanity check for duplicates: copy the raw corpus to romanian.txt / romani.txt,
# keeping only the first pair seen for each Romanian line.
#
# Lines are compared verbatim, trailing newline included.
# NOTE(review): only the Romanian line of a kept pair is recorded, so the
# Romani-side test below compares against Romanian lines and repeated Romani
# lines are not filtered. Confirm whether both sides should be tracked.
non_dup = set()  # set membership is O(1); a list made this loop quadratic

# One `with` closes all four handles (the inputs were previously left open) and
# avoids ending the cell on a tuple of close() results.
with open('romanian_data.txt', 'r', encoding='utf-8') as f_romanian, \
     open('romani_data.txt', 'r', encoding='utf-8') as f_romani, \
     open('romanian.txt', 'w', encoding='utf-8') as f_romanian_clean, \
     open('romani.txt', 'w', encoding='utf-8') as f_romani_clean:
    for sample_ro, sample_romani in zip(f_romanian, f_romani):
        if sample_ro not in non_dup and sample_romani not in non_dup:
            f_romani_clean.write(sample_romani)
            f_romanian_clean.write(sample_ro)
            non_dup.add(sample_ro)

(None, None)