In [177]:
from bs4 import BeautifulSoup
import requests
import re
from deep_translator import GoogleTranslator
import sentencepiece as spm
import json
import pandas as pd
from transformers import MarianTokenizer, MarianMTModel, MarianConfig
from datasets import Dataset

Scraping the data from different websites, dictionaries and Romani courses

In [181]:
class Scraper:
  """Gathers Romani (and, where available, parallel Romanian) phrases.

  Every public method appends one phrase per line to the output files
  configured in ``__init__``. Files are opened as UTF-8 inside ``with`` blocks
  so the handles are always flushed and closed, even if scraping fails midway.
  """

  # Seconds to wait for an HTTP response, so a stalled server cannot hang the notebook.
  REQUEST_TIMEOUT_SECONDS = 30

  def __init__(self):
    self.romani_data_output_path = 'romani.txt'
    self.romanian_data_output_path = 'romanian.txt'

  def _fetch_soup(self, url):
    """Download ``url`` and return its parsed HTML.

    Raises ``requests.HTTPError`` on a 4xx/5xx answer so an error page is never
    scraped as if it were corpus data.
    """
    response = requests.get(url, timeout = self.REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

  def wikipedia_scraper(self, url):
    """Append the text of every allowed link on a Wikipedia page to the Romani file.

    Links matching one of the forbidden patterns (namespaced /wiki/ pages,
    external http links, /w/index.php links) are skipped, and each distinct link
    text is written only once per call.
    """
    soup = self._fetch_soup(url)
    forbidden_pattern_list = [r'<a\s+href="/wiki/[^"]+:[^"]+"', r'<a\s+href="http[^"]*"', r'<a\s+href="\/w\/index\.php\?[^"]*"[^>]*>.*?<\/a>']
    # NOTE(review): the empty-string attribute filters are kept exactly as they were; confirm they select the intended links.
    references = soup.find_all('a', href = True, attrs = {'class':'', 'span':'', 'accesskey':'', 'data-mw':'', 'dir':'', 'aria-label':''})
    seen_phrases = set() # The same phrase can be linked several times on a page; a set makes the duplicate check O(1)

    with open(self.romani_data_output_path, 'a', encoding = 'utf-8') as file_writer:
      for ref in references:
        if any(re.search(pattern, str(ref)) for pattern in forbidden_pattern_list):
          continue

        phrase = ref.get_text()
        if phrase in seen_phrases:
          continue

        seen_phrases.add(phrase)
        file_writer.write(phrase + '\n')

  def tumblr_scraper(self, url):
    """Append the first line of each ``<p>`` element (except the first and last one) to the Romani file."""
    soup = self._fetch_soup(url)
    paragraph_texts = [paragraph.get_text() for paragraph in soup.find_all('p')]

    with open(self.romani_data_output_path, 'a', encoding = 'utf-8') as file_writer:
      # The first and last paragraphs are dropped; within a paragraph only the first line is kept as the Romani phrase
      for paragraph_text in paragraph_texts[1:-1]:
        file_writer.write(paragraph_text.split('\n')[0] + '\n')

  def glosbe_romani_scraper(self, url):
    """Append aligned Romanian / Romani example phrases from a Glosbe page to both files.

    This site helps with different dialects of Romani; Balkan and Carpathian were
    used because the site provides example phrases for them.
    Note: some samples were added manually due to some problems with scraping.
    """
    soup = self._fetch_soup(url)
    romanian_phrases = soup.find_all('div', attrs = {'class':'w-1/2 dir-aware-pr-1'})
    carpathian_romani_phrases = soup.find_all('div', attrs = {'class':'w-1/2 dir-aware-pl-1'})

    with open(self.romanian_data_output_path, 'a', encoding = 'utf-8') as file_writer_romanian, \
         open(self.romani_data_output_path, 'a', encoding = 'utf-8') as file_writer_romani:
      for romanian_phrase, romani_phrase in zip(romanian_phrases, carpathian_romani_phrases):
        ro_phrase_text, roma_phrase_text = romanian_phrase.get_text().strip(), romani_phrase.get_text().strip()

        # Write a pair only when both sides have text, so the two files stay line-aligned
        if ro_phrase_text and roma_phrase_text:
          file_writer_romani.write(roma_phrase_text + '\n')
          file_writer_romanian.write(ro_phrase_text + '\n')

  def get_phrases_from_dictionary_course(self, path_dict):
    """Append ``romani: romanian`` pairs read from a local text file to both output files.

    Each line of ``path_dict`` is split on its first ':'; the left part is the
    Romani phrase and the right part the Romanian one (later colons stay in the
    Romanian text). Lines without a ':' or with an empty side are skipped
    instead of raising ``IndexError`` or writing half of a pair.
    """
    with open(path_dict, 'r', encoding = 'utf-8') as dict_file, \
         open(self.romanian_data_output_path, 'a', encoding = 'utf-8') as file_writer_romanian, \
         open(self.romani_data_output_path, 'a', encoding = 'utf-8') as file_writer_romani:
      for sample in dict_file:
        romani_phrase, separator, romanian_phrase = sample.partition(':')
        romani_phrase, romanian_phrase = romani_phrase.strip(), romanian_phrase.strip()

        if not separator or not romani_phrase or not romanian_phrase:
          continue

        file_writer_romani.write(romani_phrase + '\n')
        file_writer_romanian.write(romanian_phrase + '\n')


This part deals with data augmentation. The first augmentation technique is back translation: the Romanian sentences are translated into English and the English equivalent is then translated back into Romanian, while the Romani samples are kept unchanged.
The second augmentation technique is sentence insertion (beginning and end). More precisely, we introduce a token at the beginning and at the end of each Romanian and Romani sentence.

In [None]:
# Data augmentation (removed random swap due to the semantic ambiguity that it generates)

class DataAugmenter:
  """Augments the parallel Romanian / Romani corpora in place.

  Both methods read the current contents of the two corpus files and append the
  augmented pairs to the same files, one sentence per line, keeping the files
  line-aligned. Blank lines in the input are skipped, and no blank separator
  line is written unless a file is missing its final newline.
  """

  def __init__(self):
    self.romani_data_path = 'romani.txt'
    self.romanian_data_path = 'romanian.txt'

  def _read_parallel_samples(self):
    """Return ``(romanian_lines, romani_lines)`` as read from the two corpus files."""
    with open(self.romanian_data_path, 'r', encoding = 'utf-8') as romanian_samples_file, \
         open(self.romani_data_path, 'r', encoding = 'utf-8') as romani_samples_file:
      romanian_samples = romanian_samples_file.readlines()
      romani_samples = romani_samples_file.readlines()

    return romanian_samples, romani_samples

  @staticmethod
  def _missing_final_newline(samples):
    """Tell whether the last line read lacks its newline, i.e. appended text would be glued onto it."""
    return bool(samples) and not samples[-1].endswith('\n')

  def back_translation(self):
    """Append Romanian -> English -> Romanian paraphrases, paired with the unchanged Romani sentence."""
    romanian_samples, romani_samples = self._read_parallel_samples()

    # The translators do not depend on the sample, so build them once instead of twice per sentence
    to_english = GoogleTranslator(source='ro', target='en')
    to_romanian = GoogleTranslator(source='en', target='ro')

    with open(self.romanian_data_path, 'a', encoding = 'utf-8') as romanian_file_writer, \
         open(self.romani_data_path, 'a', encoding = 'utf-8') as romani_file_writer:
      # Only terminate the last existing line when needed; an unconditional '\n' would add an empty pair
      if self._missing_final_newline(romanian_samples):
        romanian_file_writer.write('\n')
      if self._missing_final_newline(romani_samples):
        romani_file_writer.write('\n')

      for sample_ro, sample_romani in zip(romanian_samples, romani_samples):
        sample_ro, sample_romani = sample_ro.strip(), sample_romani.strip()
        if not sample_ro or not sample_romani:
          continue

        english_equivalent = to_english.translate(sample_ro)
        if not english_equivalent:
          continue

        back_translated = to_romanian.translate(english_equivalent)
        # Skip the pair when the translator returns nothing, so write() never receives None
        if not back_translated:
          continue

        romanian_file_writer.write(back_translated.strip() + '\n')
        romani_file_writer.write(sample_romani + '\n')

  def sentence_insertion(self):
    """Append a copy of every pair with a fixed phrase prepended to both sentences."""
    beginning_token_romani = 'Po del chavo - '
    beginning_token_romanian = 'Începutul propoziției - '

    romanian_samples, romani_samples = self._read_parallel_samples()

    with open(self.romanian_data_path, 'a', encoding = 'utf-8') as romanian_file_writer, \
         open(self.romani_data_path, 'a', encoding = 'utf-8') as romani_file_writer:
      # Only terminate the last existing line when needed; an unconditional '\n' would add an empty pair
      if self._missing_final_newline(romanian_samples):
        romanian_file_writer.write('\n')
      if self._missing_final_newline(romani_samples):
        romani_file_writer.write('\n')

      for sample_ro, sample_romani in zip(romanian_samples, romani_samples):
        # A blank sample would otherwise produce a sentence made of the prefix alone
        if not sample_ro.strip() or not sample_romani.strip():
          continue

        new_romanian_sentence = beginning_token_romanian + sample_ro
        new_romani_sentence = beginning_token_romani + sample_romani

        romanian_file_writer.write(new_romanian_sentence.strip() + '\n')
        romani_file_writer.write(new_romani_sentence.strip() + '\n')

The next step was to create a tokenizer. We decided to use a MarianTokenizer from Helsinki-NLP, as it has support for Romance languages.

In [None]:
# Creating a tokenizer and a vocabulary
def create_tokenizer(corpus_path, tokenizer_model_prefix, vocab_size = 2500):
  """Train a SentencePiece model on a text corpus.

  Args:
    corpus_path: path of the text file used as training input.
    tokenizer_model_prefix: prefix passed to SentencePiece as ``model_prefix``
      for the files it produces.
    vocab_size: number of pieces in the trained vocabulary. Defaults to 2500;
      increase it if you augment the data.

  The special-token ids are fixed: pad=0, unk=1, bos=2, eos=3.
  """
  spm.SentencePieceTrainer.train(
      input = corpus_path,
      model_prefix = tokenizer_model_prefix,
      vocab_size = vocab_size,
      pad_id=0,
      unk_id=1,
      bos_id=2,
      eos_id=3
  )

def create_vocabulary(tok_prefix_source, tok_prefix_target):
  """Load two trained SentencePiece models and return their piece -> id mappings.

  Args:
    tok_prefix_source: model prefix of the source tokenizer ('<prefix>.model' is loaded).
    tok_prefix_target: model prefix of the target tokenizer ('<prefix>.model' is loaded).

  Returns:
    A ``(vocab_source, vocab_target)`` tuple of dicts mapping each piece to its id.

  Each model is walked over its own id range, so the two models may have
  different sizes; the target used to be indexed with the source's ids.
  """
  def _load_vocabulary(model_prefix):
    # Map every piece of one SentencePiece model to its id
    processor = spm.SentencePieceProcessor()
    processor.load(model_prefix + '.model')
    return {processor.id_to_piece(piece_id): piece_id for piece_id in range(processor.get_piece_size())}

  vocab_source = _load_vocabulary(tok_prefix_source)
  vocab_target = _load_vocabulary(tok_prefix_target)

  return vocab_source, vocab_target

def combine_vocabularies(vocab_source, vocab_target):
  """Merge both vocabularies into one token -> id mapping and save it as 'vocab.json'.

  Target tokens come first and are renumbered 0..n-1 in their iteration order;
  source tokens that are not already present are appended after them with the
  next free ids. The ids stored in the input dicts are ignored.
  """
  # Target side first: position in the dict becomes the new id
  merged = {token: position for position, token in enumerate(vocab_target)}

  # Each new source token takes the next free id, which is always the current size of the mapping
  for token in vocab_source:
    if token in merged:
      continue
    merged[token] = len(merged)

  with open('vocab.json', 'w') as vocab_file:
    json.dump(merged, vocab_file, indent=4)
  
# Train one SentencePiece model per language: Romani is the target side, Romanian the source side
create_tokenizer(corpus_path = 'romani.txt', tokenizer_model_prefix = 'target')
create_tokenizer(corpus_path = 'romanian.txt', tokenizer_model_prefix = 'source')

# Read both vocabularies back and merge them into a single vocab.json
vocab_source, vocab_target = create_vocabulary(tok_prefix_source = 'source', tok_prefix_target = 'target')
combine_vocabularies(vocab_source = vocab_source, vocab_target = vocab_target)


Obtaining the train and test sets and tokenizing them

In [None]:
# Switch read by the model-preparation cell: True loads the 'Helsinki-NLP/opus-mt-roa-en' checkpoint,
# False builds a MarianMTModel from a fresh MarianConfig (no pretrained weights).
USE_PRETRAINED = False

# Loading the data
def load_parallel_data(romanian_path, romani_path):
    """Build a parallel dataset from two line-aligned UTF-8 text files.

    Args:
        romanian_path: file with one Romanian sentence per line.
        romani_path: file with the Romani sentence for the same line number.

    Returns:
        The result of ``Dataset.from_dict`` on a single "translation" column
        whose items are ``{"ro": ..., "roma": ...}`` dicts with stripped text.

    Raises:
        AssertionError: if the two files do not have the same number of lines.

    Pairs in which either side is empty after stripping are dropped: blank
    separator lines in the corpus files would otherwise become empty training
    examples.
    """
    with open(romanian_path, 'r', encoding='utf-8') as ro_file, \
         open(romani_path, 'r', encoding='utf-8') as roma_file:
        romanian_lines = ro_file.readlines()
        romani_lines = roma_file.readlines()

    # Checked on the raw lines, before filtering, so a misaligned corpus is still detected
    assert len(romanian_lines) == len(romani_lines), "Mismatched number of lines!"

    stripped_pairs = ((ro.strip(), roma.strip()) for ro, roma in zip(romanian_lines, romani_lines))
    data = {"translation": [{"ro": ro, "roma": roma}
                              for ro, roma in stripped_pairs
                              if ro and roma]}
    return Dataset.from_dict(data)

# Build the parallel dataset from the two line-aligned corpus files.
huggingface_dataset = load_parallel_data('romanian.txt', 'romani.txt')

# Load the tokenizer stored under 'romanian_romani_tokenizer'.
# NOTE(review): nothing visible in this notebook creates that directory — presumably vocab.json and the
# source/target SentencePiece models are placed there by hand; confirm before a Restart & Run All.
tokenizer = MarianTokenizer.from_pretrained('romanian_romani_tokenizer')

# Tokenizing the whole dataset
def preprocess_function(examples):
    """Tokenize one batch of translation pairs with the module-level ``tokenizer``.

    ``examples["translation"]`` is a sequence of ``{"ro": ..., "roma": ...}``
    dicts. The "ro" texts are passed as inputs and the "roma" texts as
    ``text_target``; sequences are truncated and padded to 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    source_texts = []
    target_texts = []
    for pair in examples["translation"]:
        source_texts.append(pair["ro"])
    for pair in examples["translation"]:
        target_texts.append(pair["roma"])

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

# Fixed seed so the 80/20 split is identical on every Restart & Run All;
# without it the test set changes between runs and results cannot be compared.
SPLIT_SEED = 42

train_test_split = huggingface_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

Preparing the model

In [None]:
if USE_PRETRAINED:
  # Keep the checkpoint id in one place so the model and its tokenizer always come from the same source
  pretrained_checkpoint = 'Helsinki-NLP/opus-mt-roa-en'
  model_to_train = MarianMTModel.from_pretrained(pretrained_checkpoint)
  # from_pretrained expects a model id or a path; the model object itself was being passed here
  tokenizer = MarianTokenizer.from_pretrained(pretrained_checkpoint)

else:
  # A model trained from scratch must agree with the custom tokenizer loaded earlier in the notebook:
  # the MarianConfig defaults describe the much larger vocabulary and special-token ids of the
  # published Marian checkpoints, not this one.
  trained_model_config = MarianConfig(
      vocab_size = len(tokenizer),
      pad_token_id = tokenizer.pad_token_id,
      eos_token_id = tokenizer.eos_token_id,
      decoder_start_token_id = tokenizer.pad_token_id, # Marian starts decoding from the pad token
  )
  model_to_train = MarianMTModel(config = trained_model_config)