**Find this notebook in a script format, mt2magic folder**

In [None]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2


In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import html
import torch

In [None]:
device = torch.device('cuda') # need the gpu to process 100k rows

In [None]:
df = pd.read_csv('it-en.csv', sep=',') #import the csv before running this cell

In [None]:
"""
This open source model should have an accuracy close to 1.00 for language identification.
Moreover we will use a softmax for having a confidence threshold on the predictions
"""
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
softmax = torch.nn.Softmax(dim=-1)

Downloading (…)okenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [None]:
model.to(device) 

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0): XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tr

**These next two functions are needed to perform language identification. Some sentence have incorrect languages (i.e. labelled italian sentences are instead french sentences etc. etc.)**

In [None]:
"""
Function that returns boolean indexes of a batch of sentences.
True if the language is correct (matches the lan input).
The prediction will be used if its softmax entrance is greater than tol,
a False index will be placed otherwise.
"""
def check_lan_batch(sentences: list, lan: str, model, 
                    tokenizer, tol: float=0.8) -> pd.Series:
    lan_id = model.config.label2id[lan]
    with torch.no_grad():
      tokenized_sentences = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True).to(device)
      logits = model(**tokenized_sentences).logits
      probs = softmax(logits)
    predictions = probs.max(dim=-1)
    idxs = [(id==lan_id and p>tol).item() for id, p in zip(predictions.indices, predictions.values)]
    del tokenized_sentences
    del logits
    del probs
    del predictions
    return pd.Series(idxs)

In [None]:
"""
Wrapper for the above function. Given a list of sentences, it returns a pd.Series
of boolean indexes: True for correct language, False otherwise.
"""
def check_lan(sentences: list, lan: str, model, tokenizer, 
              batch_size=32, tol: float=0.8) -> pd.Series:
    n_it = len(sentences) // batch_size
    rem = len(sentences) - n_it * batch_size
    idxs = pd.Series([])
    for idx in range(n_it):
        begin = idx*batch_size
        end = (idx+1)*batch_size
        idxs = idxs.append(check_lan_batch(sentences[begin:end], lan, model, tokenizer, tol=tol), ignore_index=True)
    idxs = idxs.append(check_lan_batch(sentences[-rem:], lan, model, tokenizer, tol=tol), ignore_index=True)
    return idxs

In [None]:
"""
Function to remove emojis, they will be replaced with a whitespace.
Sentence as input, filtered sentence in output.
"""
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, ' ', data)

In [None]:
"""
Given a sentence, it checks whether it begins with an URL
"""
def check_url_begin(sentence: str) -> bool:
    sentence = " ".join(sentence.split())
    return sentence[0:3] == "htt" or sentence[0:3] == "www"

In [None]:
"""
Remove URLs inside a sentence with regex.
"""
def remove_internal_url(sentence: str) -> str:
  return re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', sentence)

In [None]:
"""
Filters out HTML and \r escape characters (NaN source when parsing csv!!)
Then it converts back in natural language remaining HTML symbols. 
"""
def remove_html(sentence: str) -> str:
  soup = BeautifulSoup(sentence, 'html.parser')
  sentence = soup.get_text()
  sentence = re.sub(r'\r', ' ', sentence)
  return html.unescape(sentence)

In [None]:
"""
Wrapper for the cleaning pipeline. The ratio is: 
- For both the translation and the source text:
-- Remove HTML text
--- Remove emojis
---- Filter out sentences that begin with a URL (i.e. they contain only an URL)
----- Remove internal URLs
------ Perform language identification and filter out wrong sentences
"""
def clean_df(df: pd.DataFrame, src_lan: str, trg_lan: str, 
             model, tokenizer) -> pd.DataFrame:
  for column, lan in zip(("original", "translation"), (src_lan, trg_lan)):
    df[column] = df[column].apply(remove_html)
    df[column] = df[column].apply(remove_emojis)
    idxs = df[column].apply(check_url_begin)
    df = df.loc[~idxs].reset_index(drop=True)
    df[column] = df[column].apply(remove_internal_url)
    idxs = check_lan(df[column].tolist(), lan=lan, model=model, tokenizer=tokenizer)
    df = df.loc[idxs].reset_index(drop=True)
  return df



In [None]:
cleaned_df = clean_df(df, src_lan='it', trg_lan='en', model=model, tokenizer=tokenizer)

  soup = BeautifulSoup(sentence, 'html.parser')
  soup = BeautifulSoup(sentence, 'html.parser')


inizio


  idxs = pd.Series([])
  idxs = idxs.append(check_lan_batch(sentences[begin:end], lan, model, tokenizer, tol=tol), ignore_index=True)
  idxs = idxs.append(check_lan_batch(sentences[-rem:], lan, model, tokenizer, tol=tol), ignore_index=True)


fine
inizio
fine


In [None]:
cleaned_df.to_csv('./it-en-cleaned.csv') # export the csv

In [None]:
!nvidia-smi 

Thu Mar 23 16:32:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    32W /  70W |   3959MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces