In [1]:
import torch
import os

# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still runs on CPU-only nodes.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Sanity check: report whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

True

In [3]:
# Show the contents of the current working directory.
!ls

CLDR.ipynb		  embeddings.py       Preprocessing.ipynb  start_env.sh
config_cpu.sh		  mapping_methods.py  __pycache__	   Thesis
config_gpu.sh		  nltk_data	      slurm-2254428.out    virtual_env
doc2vec_embeddings.ipynb  plots		      slurm-2257960.out


In [4]:
!mkdir Thesis
%cd Thesis

mkdir: cannot create directory ‘Thesis’: File exists
/home2/s4231317/Thesis


In [5]:
languages = ["en", "ro", "es", "fr", "de", "nl"]

if "Data" not in os.listdir(): # first generate the data locally
  !mkdir Data

  %cd Data

  for lang in languages:
      if lang in os.listdir():
        print(f"{lang} is already present")
        continue
      link = f"https://wt-public.emm4u.eu/Acquis/JRC-Acquis.3.0/corpus/jrc-{lang}.tgz"
      !wget $link
      file_name = f"jrc-{lang}.tgz"
      !tar xzf $file_name
      !rm jrc*
      %cd $lang
      path = f"jrc-{lang}.xml"
      !rm $path
      path = f"jrc-{lang}.xml~"
      !rm $path
      path = f"jrcHeader-{lang}.html"
      !rm $path
      path = f"jrcHeader-{lang}.xml"
      !rm $path
      path = f"tei2-ext.dtd"
      !rm $path
      path = f"tei2.dtd"
      !rm $path
      %cd ..
else:
  %cd Data
  print("All data are downloaded")
    

/home2/s4231317/Thesis/Data
All data are downloaded


In [6]:
import re
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ElementTree
from html.entities import html5
import pandas as pd
from tqdm import tqdm

def _decode_percent_entities(text):
  """Replace ``%name%`` placeholders (e.g. ``%eacute%``) with the matching HTML5 character.

  Each placeholder is looked up on its own, so a placeholder whose name is not a
  known HTML5 entity is left untouched instead of being overwritten by the next one.
  """
  def _lookup(match):
    # html5 keys carry a trailing ";" (e.g. "eacute;"); fall back to the raw placeholder.
    return html5.get(match.group(1) + ";", match.group(0))

  # A callable replacement avoids interpreting the entity value as a regex template.
  return re.sub(r"%([a-zA-Z]+)%", _lookup, text)


def read_convert_whole(path, lang):
  """Read every XML file under ``path/<folder>/`` into a two-column DataFrame.

  Args:
    path: directory whose sub-folders contain the XML documents.
    lang: language code; used for the progress-bar label and the body column name.

  Returns:
    DataFrame with columns ``number`` (the root element's ``n`` attribute) and
    ``body_{lang}`` (the concatenated, cleaned text of the root's second child).
  """
  rows = []
  for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    for filename in tqdm(os.listdir(folder_path), desc=f"{lang}-{folder}"):
      root = ET.parse(os.path.join(folder_path, filename)).getroot()
      number = root.attrib["n"]

      pieces = []
      # root[1] is assumed to be the element holding the document text -- TODO confirm
      for text in root[1].itertext():
        text = re.sub(r"[\r\n\t\f\v]+", "", text) # remove whitespace characters except " "
        text = re.sub(r"[ ]{2,}", " ", text) # reduce consecutive spaces to singular spaces
        pieces.append(_decode_percent_entities(text))

      rows.append([number, "".join(pieces)])

  return pd.DataFrame(rows, columns=['number', f"body_{lang}"])

In [7]:
# Show the contents of the current working directory.
!ls

data_merged.csv  de  en  es  fr  nl  ro  tok_data.pickle


In [8]:
from functools import reduce

# Reuse the cached CSV when it is present in the working directory; otherwise parse
# every language, inner-join the frames on "number" and write the cache.
if "data_merged.csv" in os.listdir():
  print("Data were downloaded - loading")
  df_merged = pd.read_csv("data_merged.csv")
else:
  dfs, cols = [], ["number"]
  for lang in languages:
      # the corpus folder for a language is named after its language code
      dfs.append(read_convert_whole(lang, lang))
      cols.append(f"body_{lang}")

  # fold the per-language frames into one, joining on the shared document number
  df_merged = reduce(lambda acc, nxt: pd.merge(acc, nxt, on=["number"]), dfs)

  df_merged.to_csv("data_merged.csv", index=False)

Data were downloaded - loading


In [9]:
# Show the contents of the current working directory.
!ls

data_merged.csv  de  en  es  fr  nl  ro  tok_data.pickle


In [10]:
# Peek at the German text stored in the row labelled 0 of the merged frame.
df_merged.loc[0, "body_de"]

'  Verordnung (EWG) Nr. 2264/69 der Kommission vom 13. November 1969 über die Anträge auf Rückvergütung der den 0rganisationen von Obst- und Gemüseerzeugern von den Mitgliedstaaten gewährten Beihilfen  VERORDNUNG (EWG) Nr. 2264/69 DER KOMMISSION vom 13. November 1969 über die Anträge auf Rückvergütung der den Organisationen von Obst- und Gemüseerzeugern von den Mitgliedstaaten gewährten Beihilfen DIE KOMMISSION DER EUROPÄISCHEN GEMEINSCHAFTEN - gestützt auf den Vertrag zur Gründung der Europäischen Wirtschaftsgemeinschaft, gestützt auf die Verordnung Nr. 159/66/EWG des Rates vom 25. Oktober 1966 mit zusätzlichen Vorschriften für die gemeinsame Marktorganisation für Obst und Gemüse (1), gestützt auf die Verordnung (EWG) Nr. 499/69 des Rates vom 11. März 1969 über die Rückvergütung der den Organisationen von Obst- und Gemüseerzeugern von den Mitgliedstaaten gewährten Beihilfen (2), insbesondere auf Artikel 7 Absatz 3, und in Erwägung nachstehender Gründe: Die Anträge auf Rückvergütung de

In [11]:
# Dimensions of the merged frame as (rows, columns).
df_merged.shape

(6487, 7)

In [12]:
import torch

# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still runs on CPU-only nodes.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# "facebook/xlm-v-base", "xlm-roberta-base", "google/mt5-base", "susnato/ernie-m-base_pytorch", "bert-base-multilingual-cased", 
model_names = ["bert-base-multilingual-uncased", "xlm-roberta-base", "google/mt5-base", "susnato/ernie-m-base_pytorch"]

%cd ~/Thesis

if "Models" not in os.listdir():
  !mkdir Models

%cd Models  
!git lfs install
for model_name in model_names:
  aux = model_name.split("/")
  aux = aux[1] if len(aux) > 1 else aux[0]
  if aux in os.listdir():
    print(f"{aux} has already been downloaded")
    continue
  !git clone https://huggingface.co/$model_name
    
%cd ..


/home2/s4231317/Thesis
/home2/s4231317/Thesis/Models
Git LFS initialized.
bert-base-multilingual-uncased has already been downloaded
xlm-roberta-base has already been downloaded
mt5-base has already been downloaded
ernie-m-base_pytorch has already been downloaded
/home2/s4231317/Thesis


In [13]:
# Show the contents of the current working directory.
!ls

Data  Embeddings  Models  thesis_code


In [18]:
# Models and languages used for the embedding run
# ("ernie-m-base_pytorch" is currently left out of the model list).
models = [
    "bert-base-multilingual-uncased",
    "mt5-base",
    "xlm-roberta-base",
]
languages = ["en", "fr"]

In [19]:
import torch.multiprocessing as mp
from embeddings import get_embeddings

def get_embeddings_loop(lang, data, folder_name, model_names=None):
  """Start one `get_embeddings` process per model without a saved embedding, then wait for all.

  Args:
    lang: language code; part of the expected file name ``emb_{model}_{lang}.npy``.
    data: passed through unchanged to `get_embeddings`.
    folder_name: sub-folder of ``Embeddings/`` that is checked for existing files
      and handed to the workers as their output path.
    model_names: optional iterable of model names; defaults to the notebook-level
      ``models`` list.

  A worker that ends with a non-zero exit code is reported on stdout instead of
  being silently ignored.
  """
  path = f"Embeddings/{folder_name}/"
  # Create the folder if needed so the existence check below cannot raise.
  os.makedirs(path, exist_ok=True)
  existing = set(os.listdir(path))  # listed once instead of once per model
  selected = models if model_names is None else model_names

  processes = []
  for model_name in selected:
    if f"emb_{model_name}_{lang}.npy" in existing:
      print(f"{model_name} for {lang} was already used to create embeddings, going to next model")
      continue

    p = mp.Process(target=get_embeddings, args=(data, lang, model_name, path))
    p.start()
    processes.append((model_name, p))

  for model_name, p in processes:
    p.join()
    if p.exitcode != 0:
      print(f"{model_name} for {lang} failed with exit code {p.exitcode}")

In [None]:
sample_max = 7000

%cd ~/Thesis

!mkdir Embeddings

if __name__ == "__main__":
  for lang in languages:
    mp.set_start_method('spawn', force=True)
    folder_name = f"{lang}"
    !mkdir Embeddings/$folder_name

    get_embeddings_loop(lang, df_merged[f"body_{lang}"][:sample_max], folder_name)