In [177]:
from bs4 import BeautifulSoup
import requests
import re

First, we scraped Romani phrases from several websites: the Wiktionary Romani phrasebook ([here](https://en.wiktionary.org/wiki/Category:Romani_phrasebook)), a Tumblr post ([here](https://www.tumblr.com/aj-rromale/4411570949/a-very-random-assortment-of-romani-phrases-and?redirect_to=%2Faj-rromale%2F4411570949%2Fa-very-random-assortment-of-romani-phrases-and&source=blog_view_login_wall)), and Glosbe ([here](https://glosbe.com/)).

In [181]:
class Scraper:
  """Gathers Romani (and, where available, aligned Romanian) phrases into text files.

  Every public method *appends* to the configured output files, one phrase per
  line, encoded as UTF-8. `wikipedia_scraper` and `tumblr_scraper` write only to
  the Romani file; `glosbe_romani_scraper` and `get_phrases_from_dictionary`
  write the same number of lines to both files so the pairs stay aligned.

  Attributes:
    romani_data_output_path: File that receives the Romani phrases.
    romanian_data_output_path: File that receives the Romanian phrases.
  """

  # Seconds to wait for a server before giving up. requests.get() has no
  # default timeout, so without this a stalled connection blocks forever.
  REQUEST_TIMEOUT = 30

  def __init__(self, romani_data_output_path='romani.txt', romanian_data_output_path='romanian.txt'):
    """Store the output paths (defaults match the original hard-coded names)."""
    self.romani_data_output_path = romani_data_output_path
    self.romanian_data_output_path = romanian_data_output_path

  def _fetch_soup(self, url):
    """Download `url` and return its HTML parsed with BeautifulSoup.

    Raises:
      requests.RequestException: On a timeout, a connection failure or an HTTP
        error status, so that an error page is never scraped as content.
    """
    response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

  @staticmethod
  def _append_lines(path, lines):
    """Append every string in `lines` to `path`, each on its own UTF-8 line."""
    # The context manager guarantees the handle is flushed and closed; an
    # explicit encoding keeps diacritics intact regardless of the OS locale.
    with open(path, 'a', encoding='utf-8') as output_file:
      for line in lines:
        output_file.write(line + '\n')

  def _append_parallel(self, pairs):
    """Append (romani, romanian) pairs to the two output files, line-aligned."""
    pairs = list(pairs)
    self._append_lines(self.romani_data_output_path, [romani for romani, _ in pairs])
    self._append_lines(self.romanian_data_output_path, [romanian for _, romanian in pairs])

  def wikipedia_scraper(self, url):
    """Append the text of the phrase links found at `url` to the Romani file.

    Anchors are kept only if their markup matches none of the forbidden
    patterns below. Each distinct link text is written once per call, in
    first-seen order.

    Args:
      url: Address of the page to scrape.

    Raises:
      requests.RequestException: If the page cannot be downloaded.
    """
    forbidden_pattern_list = [
      r'<a\s+href="/wiki/[^"]+:[^"]+"',  # /wiki/ targets containing a colon
      r'<a\s+href="http[^"]*"',  # absolute http(s) links
      r'<a\s+href="\/w\/index\.php\?[^"]*"[^>]*>.*?<\/a>',  # /w/index.php?... links
    ]
    soup = self._fetch_soup(url)
    references = soup.find_all('a', href = True, attrs = {'class':'', 'span':'', 'accesskey':'', 'data-mw':'', 'dir':'', 'aria-label':''})

    # The same phrase can be linked several times on the page; a set gives
    # constant-time duplicate checks while the list preserves the page order.
    seen_phrases = set()
    romani_phrases = []
    for ref in references:
      markup = str(ref)
      if any(re.search(pattern, markup) for pattern in forbidden_pattern_list):
        continue
      phrase = ref.get_text()
      if phrase not in seen_phrases:
        seen_phrases.add(phrase)
        romani_phrases.append(phrase)

    self._append_lines(self.romani_data_output_path, romani_phrases)

  def tumblr_scraper(self, url):
    """Append the first line of each inner <p> element at `url` to the Romani file.

    The first and the last paragraph of the page are skipped, and only the text
    before the first newline of each remaining paragraph is kept.

    Args:
      url: Address of the post to scrape.

    Raises:
      requests.RequestException: If the page cannot be downloaded.
    """
    soup = self._fetch_soup(url)
    paragraph_texts = [paragraph.get_text() for paragraph in soup.find_all('p')]

    # Now we will only select the romani phrases
    romani_phrases = [text.split('\n', 1)[0] for text in paragraph_texts[1:-1]]
    self._append_lines(self.romani_data_output_path, romani_phrases)

  def glosbe_romani_scraper(self, url):
    """Append the aligned Romanian / Romani example phrases at `url` to both files.

    This site helps with different dialects of Romani, and so we used Balkan
    and Carpathian as it provides phrases as examples.
    Note: some samples were added manually due to some problems with scraping.

    The two sides are paired by position; a pair is written only when both
    texts are non-empty after stripping, which keeps the two files aligned.

    Args:
      url: Address of the Glosbe page to scrape.

    Raises:
      requests.RequestException: If the page cannot be downloaded.
    """
    soup = self._fetch_soup(url)
    romanian_phrases = soup.find_all('div', attrs = {'class':'w-1/2 dir-aware-pr-1'})
    carpathian_romani_phrases = soup.find_all('div', attrs = {'class':'w-1/2 dir-aware-pl-1'})

    pairs = []
    for romanian_phrase, romani_phrase in zip(romanian_phrases, carpathian_romani_phrases):
      ro_phrase_text = romanian_phrase.get_text().strip()
      roma_phrase_text = romani_phrase.get_text().strip()
      if ro_phrase_text and roma_phrase_text:
        pairs.append((roma_phrase_text, ro_phrase_text))

    self._append_parallel(pairs)

  def get_phrases_from_dictionary(self, path_dict):
    """Append the entries of a `romani: romanian` dictionary file to both files.

    This method gets some samples from a Romanian - Kalderash Romani dictionary
    that we found online. Each line is split at its first colon: the left part
    is the Romani phrase and the rest is the Romanian one. Both sides are
    stripped. Lines without a colon (blank lines included) or with an empty
    side are skipped instead of raising, so the two output files stay aligned.

    Args:
      path_dict: Path of the dictionary file, read as UTF-8.
    """
    pairs = []
    with open(path_dict, 'r', encoding='utf-8') as dict_file:
      for sample in dict_file:
        romani_phrase, separator, romanian_phrase = sample.partition(':')
        romani_phrase, romanian_phrase = romani_phrase.strip(), romanian_phrase.strip()
        if separator and romani_phrase and romanian_phrase:
          pairs.append((romani_phrase, romanian_phrase))

    self._append_parallel(pairs)
