# Initial Dataset Creation MDebertaTokenizer

In [153]:
import datasets

fr = datasets.load_dataset("ai4privacy/pii-masking-200k", data_files=["french_pii_62k.jsonl"])
de = datasets.load_dataset("ai4privacy/pii-masking-200k", data_files=["german_pii_52k.jsonl"])
it = datasets.load_dataset("ai4privacy/pii-masking-200k", data_files=["italian_pii_50k.jsonl"])
en = datasets.load_dataset("ai4privacy/pii-masking-200k", data_files=["english_pii_43k.jsonl"])

KeyboardInterrupt: 

In [3]:
fr['train'][0]

{'masked_text': "Cher [PREFIX_1] [LASTNAME_1], nous organisons un programme d'alphabétisation à [CITY_1] en collaboration avec [COMPANYNAME_1]. Contactez [EMAIL_1] pour plus de détails.",
 'unmasked_text': "Cher Ms. Keebler, nous organisons un programme d'alphabétisation à West Shemar en collaboration avec Morissette - Russel. Contactez Hulda44@yahoo.com pour plus de détails.",
 'privacy_mask': "{'[PREFIX_1]': 'Ms.', '[LASTNAME_1]': 'Keebler', '[CITY_1]': 'West Shemar', '[COMPANYNAME_1]': 'Morissette - Russel', '[EMAIL_1]': 'Hulda44@yahoo.com'}",
 'span_labels': "[[0, 5, 'O'], [5, 8, 'PREFIX_1'], [8, 9, 'O'], [9, 16, 'LASTNAME_1'], [16, 67, 'O'], [67, 78, 'CITY_1'], [78, 101, 'O'], [101, 120, 'COMPANYNAME_1'], [120, 132, 'O'], [132, 149, 'EMAIL_1'], [149, 171, 'O']]",
 'bio_labels': ['O',
  'B-PREFIX',
  'I-PREFIX',
  'B-LASTNAME',
  'I-LASTNAME',
  'I-LASTNAME',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-CITY',
  'I-CITY',
  'I-CITY

In [4]:
# data
# df = data['train'].to_pandas()
# df = df.dropna().reset_index(drop=True)
# train = datasets.Dataset.from_pandas(df)
# dataset = datasets.DatasetDict({"train": train})

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



In [6]:
from tqdm.notebook import tqdm

In [20]:
def pii_text_spans_from_masked_text(masked_text, privacy_mask, include_outside = True):
    i = 0
    span_indices = []
    original_text = ""
    last_end = 0
    # cleaned_str = privacy_mask.replace("'", '"').replace("\\", "")
    privacy_mask = eval(privacy_mask)
    # print(privacy_mask)
    while i < len(masked_text):
        found = False
        for mask_key, mask_value in privacy_mask.items():
            if masked_text[i:].startswith(mask_key):
                original_text += masked_text[last_end:i]  # Add text before this mask
                
                # Add 'O' span_indices for the text between masks
                if i != last_end and include_outside:
                    span_indices.append([len(original_text) - (i - last_end), len(original_text), 'O'])
                
                # Add the mask value
                original_text += mask_value
                span_indices.append([len(original_text) - len(mask_value), len(original_text), mask_key.replace('[', '').replace(']', '')])

                last_end = i + len(mask_key)
                i += len(mask_key)
                found = True
                break

        if not found:
            i += 1
            
    # Add any remaining text after the last mask
    if last_end < len(masked_text) and include_outside:
        original_text += masked_text[last_end:]
        span_indices.append([len(original_text) - (len(masked_text) - last_end), len(original_text), 'O'])

    return original_text, span_indices

def compute_tokens_and_bio_labels(masked_text, privacy_mask, unmasked_text, tokenizer):
    masked_text, pii_spans = pii_text_spans_from_masked_text(masked_text, privacy_mask, include_outside = False)
    encoded = tokenizer.encode_plus(unmasked_text, return_offsets_mapping=True, add_special_tokens=False)
    token_spans = encoded["offset_mapping"]
    tokenized_unmasked_text_export = [tokenizer.decode([token_id]) for token_id in encoded["input_ids"]]
    pii_index = 0
    labels = ["O" for _ in token_spans]
    for i, token_span in enumerate(token_spans):
        #Sample token span(0, 2)
        #Sample pii span [0, 3, 'B-PREFIX_1']
        start = token_span[0]
        end = token_span[1]
        
        if start == pii_spans[pii_index][0]:
            labels[i] = f"B-{pii_spans[pii_index][2].split('_')[0]}"
        
        if start > pii_spans[pii_index][0] and start < pii_spans[pii_index][1]:
            labels[i] = f"I-{pii_spans[pii_index][2].split('_')[0]}"
        
        if start > pii_spans[pii_index][1]:
            pii_index += 1
            if pii_index >= len(pii_spans):
                #All the other tokens will be "O"
                break
            if start == pii_spans[pii_index][0]:
                labels[i] = f"B-{pii_spans[pii_index][2].split('_')[0]}"

    return labels, tokenized_unmasked_text_export

def process_row(row):
    try:
        bio_labels, tokenised_text = compute_tokens_and_bio_labels(
            row["masked_text"],
            row["privacy_mask"],
            row["unmasked_text"],
            tokenizer
        )

        # Update the row with new fields
        row["bio_labels"] = bio_labels
        row["tokenised_text"] = tokenised_text

        return row

    except Exception as e:
        # Handle exceptions here
        print("!!!!!Error processing row:", e)
        return None

In [15]:
df = fr['train'].to_pandas()
df = df.dropna().reset_index(drop=True)


masked_text = df['masked_text'].tolist()
privacy_mask = df['privacy_mask'].tolist()
unmasked_text = df['unmasked_text'].tolist()

bio_labels, tokenised_texts = [], []
for i in tqdm(range(len(masked_text))):
    bio_label, tokenised_text = compute_tokens_and_bio_labels(
        masked_text[i],
        privacy_mask[i],
        unmasked_text[i],
        tokenizer
    )
    bio_labels.append(bio_label)
    tokenised_texts.append(tokenised_text)

df['bio_labels'] = bio_labels
df['tokenised_text'] = tokenised_texts

fr_df = df.__deepcopy__()

In [32]:
df = en['train'].to_pandas()
df = df.dropna().reset_index(drop=True)


masked_text = df['masked_text'].tolist()
privacy_mask = df['privacy_mask'].tolist()
unmasked_text = df['unmasked_text'].tolist()

bio_labels, tokenised_texts = [], []
for i in tqdm(range(len(masked_text))):
    bio_label, tokenised_text = compute_tokens_and_bio_labels(
        masked_text[i],
        privacy_mask[i],
        unmasked_text[i],
        tokenizer
    )
    bio_labels.append(bio_label)
    tokenised_texts.append(tokenised_text)

df['bio_labels'] = bio_labels
df['tokenised_text'] = tokenised_texts

en_df = df.__deepcopy__()

  0%|          | 0/43501 [00:00<?, ?it/s]

In [44]:
en_df[['language']] = 'en'
en_df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,A student's assessment was found on device bea...,A student's assessment was found on device bea...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, 'O'], [57, 75, 'PHONEIMEI_1'], [75, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[A, student, ', s, , assessment, was, found, o...",en
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, 'O'], [5, 9, 'FIRSTNAME_1'], [9, 44, '...","[O, O, O, I-FIRSTNAME, O, O, O, O, O, O, O, O,...","[De, ar, Om, er, ,, as, per, our, , records, ,...",en
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, 'FIRSTNAME_1'], [6, 75, 'O'], [75, 77,...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[K, attie, , could, you, please, share, your, ...",en
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, 'O'], [22, 27, 'BUILDINGNUMBER_1'], [...","[O, O, O, O, O, O, I-BUILDINGNUMBER, O, O, O, ...","[, Emergency, , supplies, in, 16, 356, need, ,...",en
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, 'O'], [4, 6, 'AGE_1'], [6, 20, 'O'], [...","[O, O, B-AGE, O, O, O, O, I-BUILDINGNUMBER, O,...","[The, , 88, old, child, at, 5, 862, ,, has, , ...",en
...,...,...,...,...,...,...,...
43496,"Hello [FIRSTNAME_1], your cognitive therapy ap...","Hello Nellie, your cognitive therapy appointme...","{'[FIRSTNAME_1]': 'Nellie', '[DATE_1]': '8/21'...","[[0, 6, 'O'], [6, 12, 'FIRSTNAME_1'], [12, 66,...","[O, O, I-FIRSTNAME, O, O, O, O, O, O, O, O, O,...","[Hello, Nell, ie, ,, your, c, ognitive, , ther...",en
43497,"Dear [FIRSTNAME_1], we appreciate your active ...","Dear Jalon, we appreciate your active involvem...","{'[FIRSTNAME_1]': 'Jalon', '[CREDITCARDNUMBER_...","[[0, 5, 'O'], [5, 10, 'FIRSTNAME_1'], [10, 159...","[O, O, O, I-FIRSTNAME, O, O, O, O, O, O, O, O,...","[De, ar, Jal, on, ,, we, , appreciate, your, a...",en
43498,"Dear [SEX_1] at [ZIPCODE_1], we are raising fu...","Dear Female at 32363-2779, we are raising fund...","{'[SEX_1]': 'Female', '[ZIPCODE_1]': '32363-27...","[[0, 5, 'O'], [5, 11, 'SEX_1'], [11, 15, 'O'],...","[O, O, O, B-SEX, O, O, I-ZIPCODE, I-ZIPCODE, I...","[De, ar, , Female, at, 3, 2363, -, 2779, ,, we...",en
43499,"Hello [FIRSTNAME_1], we encourage you to pay t...","Hello Tito, we encourage you to pay the fees o...","{'[FIRSTNAME_1]': 'Tito', '[ETHEREUMADDRESS_1]...","[[0, 6, 'O'], [6, 10, 'FIRSTNAME_1'], [10, 137...","[O, O, B-FIRSTNAME, O, O, O, O, O, O, O, O, O,...","[Hello, , Tito, ,, we, , encourage, you, to, p...",en


In [45]:
de_df[['language']] = 'de'
de_df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,'Exkursionsupdate: Wir werden uns in [ORDINALD...,'Exkursionsupdate: Wir werden uns in Northeast...,"{'[ORDINALDIRECTION_1]': 'Northeast', '[NEARBY...","[[0, 37, 'O'], [37, 46, 'ORDINALDIRECTION_1'],...","[O, O, O, O, O, O, O, O, O, O, O, O, B-ORDINAL...","[, ', Ex, kur, sions, update, :, Wir, werden, ...",de
1,"Sehr geehrte[r] [PREFIX_1] [LASTNAME_1], wie u...","Sehr geehrte[r] Ms. Keeling, wie unsere Aufzei...","{'[PREFIX_1]': 'Ms.', '[LASTNAME_1]': 'Keeling...","[[0, 16, 'O'], [16, 19, 'PREFIX_1'], [19, 20, ...","[O, O, O, O, O, O, O, O, O, I-PREFIX, I-PREFIX...","[, Sehr, ge, ehrt, e, [, r, ], M, s, ., K, eel...",de
2,'Wir bestätigen noch einmal die Finanzen für d...,'Wir bestätigen noch einmal die Finanzen für d...,"{'[BIC_1]': 'MGJNCGF5XXX', '[IBAN_1]': 'BH13JD...","[[0, 105, 'O'], [105, 116, 'BIC_1'], [116, 130...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[, ', Wir, , bestätig, en, noch, ein, mal, die...",de
3,"[FIRSTNAME_1], wir haben eine Anfrage erhalten...","Deangelo, wir haben eine Anfrage erhalten, die...","{'[FIRSTNAME_1]': 'Deangelo', '[IBAN_1]': 'XK6...","[[0, 8, 'FIRSTNAME_1'], [8, 65, 'O'], [65, 85,...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[De, angelo, ,, wir, haben, eine, , Anfrage, ,...",de
4,"Finanzberichte des Patienten, die [AMOUNT_1] T...","Finanzberichte des Patienten, die 358k Transak...","{'[AMOUNT_1]': '358k', '[CREDITCARDISSUER_1]':...","[[0, 34, 'O'], [34, 38, 'AMOUNT_1'], [38, 68, ...","[O, O, O, O, O, O, O, I-AMOUNT, O, O, O, O, O,...","[Finanz, berichte, des, Patienten, ,, die, 358...",de
...,...,...,...,...,...,...,...
52812,"[FIRSTNAME_1], wir freuen uns auf Ihre Sitzung...","Baby, wir freuen uns auf Ihre Sitzung über Bew...","{'[FIRSTNAME_1]': 'Baby', '[JOBAREA_1]': 'Divi...","[[0, 4, 'FIRSTNAME_1'], [4, 102, 'O'], [102, 1...","[B-FIRSTNAME, O, O, O, O, O, O, O, O, O, O, O,...","[Baby, ,, wir, freu, en, uns, auf, , Ihr, e, S...",de
52813,"Hallo [GENDER_1], wir machen große Fortschritt...","Hallo Transexual male, wir machen große Fortsc...","{'[GENDER_1]': 'Transexual male', '[AGE_1]': '...","[[0, 6, 'O'], [6, 21, 'GENDER_1'], [21, 131, '...","[O, O, I-GENDER, I-GENDER, O, O, O, O, O, O, O...","[Hallo, T, ransexual, male, ,, wir, machen, gr...",de
52814,Die palliative Versorgung für die [AGE_1]-Jähr...,Die palliative Versorgung für die 10 years old...,"{'[AGE_1]': '10 years old', '[DOB_1]': '11/87'}","[[0, 34, 'O'], [34, 46, 'AGE_1'], [46, 96, 'O'...","[O, O, O, O, O, O, O, I-AGE, I-AGE, O, O, O, O...","[Die, palli, ative, Versorgung, für, die, 10, ...",de
52815,Identifizierung einer potenziellen Risikoquell...,Identifizierung einer potenziellen Risikoquell...,"{'[CREDITCARDNUMBER_1]': '1791411812181579', '...","[[0, 52, 'O'], [52, 68, 'CREDITCARDNUMBER_1'],...","[O, O, O, O, O, O, O, O, O, O, O, B-CREDITCARD...","[Ident, ifizierung, , einer, , potenzi, ellen,...",de


In [46]:
fr_df[['language']] = 'fr'
fr_df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,"Cher [PREFIX_1] [LASTNAME_1], nous organisons ...","Cher Ms. Keebler, nous organisons un programme...","{'[PREFIX_1]': 'Ms.', '[LASTNAME_1]': 'Keebler...","[[0, 5, 'O'], [5, 8, 'PREFIX_1'], [8, 9, 'O'],...","[O, O, I-PREFIX, I-PREFIX, O, O, I-LASTNAME, O...","[Cher, M, s, ., Ke, e, bler, ,, nous, organis,...",fr
1,"Cher(e) [PREFIX_1] [LASTNAME_1], en tant qu'éc...","Cher(e) Mr. Mills, en tant qu'école, nous nous...","{'[PREFIX_1]': 'Mr.', '[LASTNAME_1]': 'Mills',...","[[0, 8, 'O'], [8, 11, 'PREFIX_1'], [11, 12, 'O...","[O, O, O, O, O, I-PREFIX, O, B-LASTNAME, O, O,...","[Cher, (, e, ), Mr, ., , Mills, ,, en, tant, q...",fr
2,"[PREFIX_1] [FIRSTNAME_1] [LASTNAME_1], concern...","Ms. Dandre O'Kon, concernant votre récente pro...","{'[PREFIX_1]': 'Ms.', '[FIRSTNAME_1]': 'Dandre...","[[0, 3, 'PREFIX_1'], [3, 4, 'O'], [4, 10, 'FIR...","[B-PREFIX, I-PREFIX, I-PREFIX, O, O, O, O, I-L...","[M, s, ., D, andre, O, ', Kon, ,, concern, ant...",fr
3,"Cher [PREFIX_1][PREFIX_2] [MIDDLENAME_1], le b...","Cher MissMiss Shawn, le briefing de sécurité a...","{'[PREFIX_1]': 'Miss', '[PREFIX_2]': 'Miss', '...","[[0, 5, 'O'], [5, 9, 'PREFIX_1'], [9, 13, 'PRE...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Cher, Miss, Miss, Shaw, n, ,, le, , brief, in...",fr
4,"[FIRSTNAME_1], en ce qui concerne l'emploi dan...","Joy, en ce qui concerne l'emploi dans le domai...","{'[FIRSTNAME_1]': 'Joy', '[JOBAREA_1]': 'Accou...","[[0, 3, 'FIRSTNAME_1'], [3, 49, 'O'], [49, 63,...","[B-FIRSTNAME, O, O, O, O, O, O, O, O, O, O, O,...","[Joy, ,, en, ce, qui, , concerne, , l, ', empl...",fr
...,...,...,...,...,...,...,...
61953,Les conditions dermatologiques peuvent être af...,Les conditions dermatologiques peuvent être af...,"{'[AGE_1]': '44 years old', '[PREFIX_1]': 'Mis...","[[0, 64, 'O'], [64, 76, 'AGE_1'], [76, 95, 'O'...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Les, conditions, dermatologi, ques, pe, uvent...",fr
61954,"Cher étudiant, nous avons remarqué une connexi...","Cher étudiant, nous avons remarqué une connexi...","{'[IP_1]': '180.58.191.123', '[PASSWORD_1]': '...","[[0, 120, 'O'], [120, 134, 'IP_1'], [134, 215,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Cher, , é, tudiant, ,, nous, avon, s, remar, ...",fr
61955,Tous les appareils médicaux doivent être mis à...,Tous les appareils médicaux doivent être mis à...,"{'[DOB_1]': 'December 27, 1945', '[PASSWORD_1]...","[[0, 61, 'O'], [61, 78, 'DOB_1'], [78, 138, 'O...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[, Tous, les, , a, ppareil, s, médica, ux, do,...",fr
61956,Le prix du manuel de Santé Sexuelle et Reprodu...,Le prix du manuel de Santé Sexuelle et Reprodu...,"{'[CURRENCYSYMBOL_1]': '₪', '[CREDITCARDNUMBER...","[[0, 59, 'O'], [59, 60, 'CURRENCYSYMBOL_1'], [...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Le, prix, du, , manuel, de, Sant, é, Sex, u, ...",fr


In [47]:
it_df[['language']] = 'it'
it_df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,"Per l'impostazione della contabilità, dovremmo...","Per l'impostazione della contabilità, dovremmo...","{'[BIC_1]': 'PIWDTFOC', '[CURRENCYNAME_1]': 'C...","[[0, 65, 'O'], [65, 73, 'BIC_1'], [73, 96, 'O'...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Per, , l, ', impost, azione, , della, contabi...",it
1,"Caro [GENDER_1], abbiamo organizzato una sessi...","Caro Male to female trans woman, abbiamo organ...","{'[GENDER_1]': 'Male to female trans woman', '...","[[0, 5, 'O'], [5, 31, 'GENDER_1'], [31, 104, '...","[O, O, I-GENDER, I-GENDER, I-GENDER, I-GENDER,...","[Caro, Male, to, , female, trans, woman, ,, ab...",it
2,Invitiamo i [JOBTYPE_1] impegnati dello stato ...,Invitiamo i Agent impegnati dello stato di Cal...,"{'[JOBTYPE_1]': 'Agent', '[STATE_1]': 'Calabri...","[[0, 12, 'O'], [12, 17, 'JOBTYPE_1'], [17, 43,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, I-S...","[Invit, iamo, , i, Agent, , impegn, ati, , del...",it
3,Mi rivolgo a tutto il corpo docente. Stiamo fa...,Mi rivolgo a tutto il corpo docente. Stiamo fa...,"{'[TIME_1]': '19:37', '[USERAGENT_1]': 'Mozill...","[[0, 104, 'O'], [104, 109, 'TIME_1'], [109, 14...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Mi, , rivol, go, , a, , tutto, il, corpo, doc...",it
4,[FIRSTNAME_1] [LASTNAME_1] ha ricevuto un appu...,Tyree Hamill ha ricevuto un appuntamento speci...,"{'[FIRSTNAME_1]': 'Tyree', '[LASTNAME_1]': 'Ha...","[[0, 5, 'FIRSTNAME_1'], [5, 6, 'O'], [6, 12, '...","[B-FIRSTNAME, I-FIRSTNAME, I-FIRSTNAME, O, O, ...","[Tyr, e, e, Hamil, l, ha, ricev, uto, un, , ap...",it
...,...,...,...,...,...,...,...
50980,"La nostra università, situata in [STATE_1], si...","La nostra università, situata in Marche, si pr...","{'[STATE_1]': 'Marche', '[JOBTITLE_1]': 'Legac...","[[0, 33, 'O'], [33, 39, 'STATE_1'], [39, 120, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[La, , nostra, universit, à, ,, situat, a, in,...",it
50981,Allegato è lo scontrino per il programma educa...,Allegato è lo scontrino per il programma educa...,"{'[CURRENCYNAME_1]': 'New Israeli Sheqel', '[C...","[[0, 110, 'O'], [110, 128, 'CURRENCYNAME_1'], ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Alle, gato, , è, lo, scont, rino, per, il, pr...",it
50982,Non dimenticare di segnalare i progressi di ad...,Non dimenticare di segnalare i progressi di ad...,"{'[ZIPCODE_1]': '09318-1647', '[DOB_1]': '1915...","[[0, 81, 'O'], [81, 91, 'ZIPCODE_1'], [91, 101...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Non, dimentic, are, di, , s, egnala, re, , i,...",it
50983,"[GENDER_1], abbiamo elaborato la tua richiesta...","Male to female transgender woman, abbiamo elab...",{'[GENDER_1]': 'Male to female transgender wom...,"[[0, 32, 'GENDER_1'], [32, 140, 'O'], [140, 15...","[B-GENDER, I-GENDER, I-GENDER, I-GENDER, I-GEN...","[Male, to, , female, , t, ransgender, woman, ,...",it


In [48]:
import pandas as pd

full_df = pd.concat([en_df, de_df, fr_df, it_df])

In [51]:
full_df = full_df.dropna()
full_df = full_df.sample(frac=1)
full_df.reset_index(drop=True, inplace=True)

In [52]:
full_df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,Mi interessa saperne di più sull'Apprendimento...,Mi interessa saperne di più sull'Apprendimento...,"{'[URL_1]': 'https://spherical-reject.org', '[...","[[0, 111, 'O'], [111, 139, 'URL_1'], [139, 149...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Mi, interessa, sa, perne, di, p, iù, sull, ',...",it
1,Der IT-Flügel der Bibliothek hat die IP-Adress...,Der IT-Flügel der Bibliothek hat die IP-Adress...,{'[IP_1]': '6fec:b1ca:dae8:fe6c:84c6:fdd3:623f...,"[[0, 49, 'O'], [49, 88, 'IP_1'], [88, 93, 'O']...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Der, IT, -, F, lügel, der, , Bibliothek, hat,...",de
2,"Salut [FIRSTNAME_1], veuillez confirmer si les...","Salut Tatyana, veuillez confirmer si les 4 der...","{'[FIRSTNAME_1]': 'Tatyana', '[MASKEDNUMBER_1]...","[[0, 6, 'O'], [6, 13, 'FIRSTNAME_1'], [13, 111...","[O, O, I-FIRSTNAME, O, O, O, O, O, O, O, O, O,...","[Salut, Tat, yana, ,, v, euille, z, confirm, e...",fr
3,Student [SSN_1] - herzlichen Glückwunsch zu Ih...,Student 600-48-9661 - herzlichen Glückwunsch z...,"{'[SSN_1]': '600-48-9661', '[IPV4_1]': '233.40...","[[0, 8, 'O'], [8, 19, 'SSN_1'], [19, 178, 'O']...","[O, O, I-SSN, I-SSN, I-SSN, O, O, O, O, O, O, ...","[Student, 600, -48, -, 9661, , -, herz, lichen...",de
4,La conférence internationale sur l'éducation m...,La conférence internationale sur l'éducation m...,"{'[BUILDINGNUMBER_1]': '992', '[STREET_1]': 'W...","[[0, 67, 'O'], [67, 70, 'BUILDINGNUMBER_1'], [...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[La, c, onférence, internationale, sur, , l, '...",fr
...,...,...,...,...,...,...,...
209256,Ricevuta conferma di elaborare le scansioni tr...,Ricevuta conferma di elaborare le scansioni tr...,"{'[ACCOUNTNAME_1]': 'Home Loan Account', '[URL...","[[0, 55, 'O'], [55, 72, 'ACCOUNTNAME_1'], [72,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, I-A...","[Rice, vuta, confer, ma, di, elabora, re, le, ...",it
209257,Si quelqu'un a vu un chat aux yeux verts mesur...,Si quelqu'un a vu un chat aux yeux verts mesur...,"{'[HEIGHT_1]': '117cm', '[ORDINALDIRECTION_1]'...","[[0, 58, 'O'], [58, 63, 'HEIGHT_1'], [63, 117,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Si, que, lqu, ', un, , a, vu, un, chat, aux, ...",fr
209258,"Bonjour [FIRSTNAME_1], votre dernier paiement ...","Bonjour Ronaldo, votre dernier paiement pour l...","{'[FIRSTNAME_1]': 'Ronaldo', '[ACCOUNTNAME_1]'...","[[0, 8, 'O'], [8, 15, 'FIRSTNAME_1'], [15, 87,...","[O, O, O, B-FIRSTNAME, O, O, O, O, O, O, O, O,...","[, Bonjour, , Ronaldo, ,, v, otre, de, rnier, ...",fr
209259,Investment opportunities provided by [COMPANYN...,Investment opportunities provided by Johnston ...,"{'[COMPANYNAME_1]': 'Johnston and Sons', '[CUR...","[[0, 37, 'O'], [37, 54, 'COMPANYNAME_1'], [54,...","[O, O, O, O, O, O, O, I-COMPANYNAME, I-COMPANY...","[Investment, , opportunities, , provided, by, ...",en


In [58]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(full_df, test_size=0.2, random_state=42)

train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [61]:
train_df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,"""Hey [FIRSTNAME_1], for tomorrow's field trip,...","""Hey Madilyn, for tomorrow's field trip, the p...","{'[FIRSTNAME_1]': 'Madilyn', '[CURRENCY_1]': '...","[[0, 5, 'O'], [5, 12, 'FIRSTNAME_1'], [12, 70,...","[O, O, O, I-FIRSTNAME, O, O, O, O, O, O, O, O,...","["", Hey, Madi, lyn, ,, for, , tomorrow, ', s, ...",en
1,"[FIRSTNAME_1], nous vous demandons votre [IPV6...","Daniella, nous vous demandons votre b658:ff83:...","{'[FIRSTNAME_1]': 'Daniella', '[IPV6_1]': 'b65...","[[0, 8, 'FIRSTNAME_1'], [8, 36, 'O'], [36, 75,...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[Daniel, la, ,, nous, , vous, demand, ons, v, ...",fr
2,"Ein weiterer digitaler Fußabdruck, der bei Fer...","Ein weiterer digitaler Fußabdruck, der bei Fer...",{'[USERAGENT_1]': 'Mozilla/5.0 (compatible; MS...,"[[0, 85, 'O'], [85, 148, 'USERAGENT_1'], [148,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Ein, weiter, er, digital, er, Fuß, ab, druck,...",de
3,"Liebe Bewohner von [STREET_1], verpassen Sie n...","Liebe Bewohner von Vincenzo Plain, verpassen S...","{'[STREET_1]': 'Vincenzo Plain', '[TIME_1]': '...","[[0, 19, 'O'], [19, 33, 'STREET_1'], [33, 89, ...","[O, O, O, O, O, I-STREET, I-STREET, O, O, O, O...","[Liebe, Be, wohner, von, Vincen, zo, Plain, ,,...",de
4,"Monsieur [MIDDLENAME_1], votre bureau pour exa...","Monsieur Emerson, votre bureau pour examiner l...","{'[MIDDLENAME_1]': 'Emerson', '[SECONDARYADDRE...","[[0, 9, 'O'], [9, 16, 'MIDDLENAME_1'], [16, 83...","[O, O, O, I-MIDDLENAME, O, O, O, O, O, O, O, O...","[Mon, sieur, E, merson, ,, v, otre, bureau, po...",fr
...,...,...,...,...,...,...,...
167403,Le patient [ACCOUNTNAME_1] a montré des symptô...,Le patient Investment Account a montré des sym...,"{'[ACCOUNTNAME_1]': 'Investment Account', '[PH...","[[0, 11, 'O'], [11, 29, 'ACCOUNTNAME_1'], [29,...","[O, O, O, I-ACCOUNTNAME, O, O, O, O, O, O, O, ...","[Le, patient, Investment, Account, , a, montr,...",fr
167404,Bitte tätigen Sie die Zahlung für die neuen Ge...,Bitte tätigen Sie die Zahlung für die neuen Ge...,"{'[BIC_1]': 'KAUPTDSBXXX', '[CURRENCY_1]': 'Ky...","[[0, 54, 'O'], [54, 65, 'BIC_1'], [65, 70, 'O'...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, I-B...","[Bitte, , tätig, en, Sie, die, Zahlung, für, d...",de
167405,Tanken Sie Ihr Auto [VEHICLEVRM_1] und kommen ...,Tanken Sie Ihr Auto PA17UNK und kommen Sie zum...,"{'[VEHICLEVRM_1]': 'PA17UNK', '[CITY_1]': 'Rey...","[[0, 20, 'O'], [20, 27, 'VEHICLEVRM_1'], [27, ...","[O, O, O, O, O, O, O, I-VEHICLEVRM, I-VEHICLEV...","[Tank, en, Sie, , Ihr, Auto, PA, 17, UNK, und,...",de
167406,"Ciao, sto chiamando in merito a un workshop ch...","Ciao, sto chiamando in merito a un workshop ch...","{'[JOBTYPE_1]': 'Supervisor', '[NEARBYGPSCOORD...","[[0, 76, 'O'], [76, 86, 'JOBTYPE_1'], [86, 121...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Cia, o, ,, sto, , chiama, ndo, in, merito, , ...",it


In [62]:
test_df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,Appena ricevuta una chiamata dall'ufficio di [...,Appena ricevuta una chiamata dall'ufficio di M...,"{'[STATE_1]': 'Michigan', '[FIRSTNAME_1]': 'Pa...","[[0, 45, 'O'], [45, 53, 'STATE_1'], [53, 66, '...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[, Appen, a, ricev, uta, una, chi, amata, dall...",it
1,"Aggiornamento risorse: Per favore, consultate ...","Aggiornamento risorse: Per favore, consultate ...","{'[URL_1]': 'https://honorable-journal.name/',...","[[0, 46, 'O'], [46, 77, 'URL_1'], [77, 108, 'O...","[O, O, O, O, O, O, O, O, O, O, O, O, O, I-URL,...","[A, ggiornamento, , risor, se, :, Per, , favor...",it
2,"Dear [JOBTITLE_1], an online payment of [CURRE...","Dear Regional Optimization Architect, an onlin...",{'[JOBTITLE_1]': 'Regional Optimization Archit...,"[[0, 5, 'O'], [5, 36, 'JOBTITLE_1'], [36, 59, ...","[O, O, O, I-JOBTITLE, I-JOBTITLE, I-JOBTITLE, ...","[De, ar, Regional, O, ptimization, Architect, ...",en
3,"Bonjour coordinateurs d'étudiants en échange, ...","Bonjour coordinateurs d'étudiants en échange, ...","{'[EMAIL_1]': 'Alena.Nikolaus@yahoo.com', '[NE...","[[0, 101, 'O'], [101, 125, 'EMAIL_1'], [125, 2...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[, Bonjour, coordinat, eurs, d, ', é, tudiant,...",fr
4,"[FIRSTNAME_1], specialista in [JOBAREA_1], ha ...","Van, specialista in Applications, ha chiamato ...","{'[FIRSTNAME_1]': 'Van', '[JOBAREA_1]': 'Appli...","[[0, 3, 'FIRSTNAME_1'], [3, 20, 'O'], [20, 32,...","[B-FIRSTNAME, O, O, O, O, O, O, O, O, O, O, O,...","[Van, ,, , specialista, in, Applications, ,, h...",it
...,...,...,...,...,...,...,...
41848,Rappel : La planification successorale inclut ...,Rappel : La planification successorale inclut ...,{'[ETHEREUMADDRESS_1]': '0xbb72afb42e3aaec63c0...,"[[0, 66, 'O'], [66, 108, 'ETHEREUMADDRESS_1'],...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[R, appel, , :, La, plan, ification, success, ...",fr
41849,Abbiamo citato il tuo dispositivo nel nostro p...,Abbiamo citato il tuo dispositivo nel nostro p...,"{'[JOBAREA_1]': 'Communications', '[ZIPCODE_1]...","[[0, 54, 'O'], [54, 68, 'JOBAREA_1'], [68, 128...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Ab, biamo, citat, o, il, tuo, , dispositivo, ...",it
41850,Mon ancien numéro de compte [ACCOUNTNUMBER_1] ...,Mon ancien numéro de compte 79582818 semble av...,"{'[ACCOUNTNUMBER_1]': '79582818', '[CREDITCARD...","[[0, 28, 'O'], [28, 36, 'ACCOUNTNUMBER_1'], [3...","[O, O, O, O, O, O, O, O, O, I-ACCOUNTNUMBER, I...","[Mon, , ancien, , numér, o, de, compte, 7, 958...",fr
41851,Alternative medicine presentation scheduled fo...,Alternative medicine presentation scheduled fo...,"{'[DATE_1]': '10/06/2018', '[JOBTITLE_1]': 'Ce...","[[0, 48, 'O'], [48, 58, 'DATE_1'], [58, 62, 'O...","[O, O, O, O, O, O, O, I-DATE, O, O, I-JOBTITLE...","[Alternative, medicine, presentation, schedule...",en


In [63]:
train = datasets.Dataset.from_pandas(train_df)
test = datasets.Dataset.from_pandas(test_df)
dataset = datasets.DatasetDict({"train": train, "test": test})

In [64]:
dataset

DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text', 'language'],
        num_rows: 167408
    })
    test: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text', 'language'],
        num_rows: 41853
    })
})

In [65]:
dataset.push_to_hub("pii-masking-200k", token="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")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/168 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/42 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

# Sanitizing bio labels

In [1]:
import datasets

dataset = datasets.load_dataset("Isotonic/pii-masking-200k")

dataset

Downloading readme:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/94.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/167408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/41853 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text', 'language'],
        num_rows: 167408
    })
    test: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text', 'language'],
        num_rows: 41853
    })
})

In [22]:
import pandas as pd

dataset
tr = dataset['train'].to_pandas()
te = dataset['test'].to_pandas()
df = pd.concat([tr, te]).reset_index(drop=True)

In [23]:
df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,"""Hey [FIRSTNAME_1], for tomorrow's field trip,...","""Hey Madilyn, for tomorrow's field trip, the p...","{'[FIRSTNAME_1]': 'Madilyn', '[CURRENCY_1]': '...","[[0, 5, 'O'], [5, 12, 'FIRSTNAME_1'], [12, 70,...","[O, O, O, I-FIRSTNAME, O, O, O, O, O, O, O, O,...","["", Hey, Madi, lyn, ,, for, , tomorrow, ', s, ...",en
1,"[FIRSTNAME_1], nous vous demandons votre [IPV6...","Daniella, nous vous demandons votre b658:ff83:...","{'[FIRSTNAME_1]': 'Daniella', '[IPV6_1]': 'b65...","[[0, 8, 'FIRSTNAME_1'], [8, 36, 'O'], [36, 75,...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[Daniel, la, ,, nous, , vous, demand, ons, v, ...",fr
2,"Ein weiterer digitaler Fußabdruck, der bei Fer...","Ein weiterer digitaler Fußabdruck, der bei Fer...",{'[USERAGENT_1]': 'Mozilla/5.0 (compatible; MS...,"[[0, 85, 'O'], [85, 148, 'USERAGENT_1'], [148,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Ein, weiter, er, digital, er, Fuß, ab, druck,...",de
3,"Liebe Bewohner von [STREET_1], verpassen Sie n...","Liebe Bewohner von Vincenzo Plain, verpassen S...","{'[STREET_1]': 'Vincenzo Plain', '[TIME_1]': '...","[[0, 19, 'O'], [19, 33, 'STREET_1'], [33, 89, ...","[O, O, O, O, O, I-STREET, I-STREET, O, O, O, O...","[Liebe, Be, wohner, von, Vincen, zo, Plain, ,,...",de
4,"Monsieur [MIDDLENAME_1], votre bureau pour exa...","Monsieur Emerson, votre bureau pour examiner l...","{'[MIDDLENAME_1]': 'Emerson', '[SECONDARYADDRE...","[[0, 9, 'O'], [9, 16, 'MIDDLENAME_1'], [16, 83...","[O, O, O, I-MIDDLENAME, O, O, O, O, O, O, O, O...","[Mon, sieur, E, merson, ,, v, otre, bureau, po...",fr
...,...,...,...,...,...,...,...
209256,Rappel : La planification successorale inclut ...,Rappel : La planification successorale inclut ...,{'[ETHEREUMADDRESS_1]': '0xbb72afb42e3aaec63c0...,"[[0, 66, 'O'], [66, 108, 'ETHEREUMADDRESS_1'],...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[R, appel, , :, La, plan, ification, success, ...",fr
209257,Abbiamo citato il tuo dispositivo nel nostro p...,Abbiamo citato il tuo dispositivo nel nostro p...,"{'[JOBAREA_1]': 'Communications', '[ZIPCODE_1]...","[[0, 54, 'O'], [54, 68, 'JOBAREA_1'], [68, 128...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Ab, biamo, citat, o, il, tuo, , dispositivo, ...",it
209258,Mon ancien numéro de compte [ACCOUNTNUMBER_1] ...,Mon ancien numéro de compte 79582818 semble av...,"{'[ACCOUNTNUMBER_1]': '79582818', '[CREDITCARD...","[[0, 28, 'O'], [28, 36, 'ACCOUNTNUMBER_1'], [3...","[O, O, O, O, O, O, O, O, O, I-ACCOUNTNUMBER, I...","[Mon, , ancien, , numér, o, de, compte, 7, 958...",fr
209259,Alternative medicine presentation scheduled fo...,Alternative medicine presentation scheduled fo...,"{'[DATE_1]': '10/06/2018', '[JOBTITLE_1]': 'Ce...","[[0, 48, 'O'], [48, 58, 'DATE_1'], [58, 62, 'O...","[O, O, O, O, O, O, O, I-DATE, O, O, I-JOBTITLE...","[Alternative, medicine, presentation, schedule...",en


In [24]:
all_bio_labels = df.bio_labels.tolist()
old_bio_labels = all_bio_labels.copy()

In [25]:
# function that takes a list of bio tags, checks if they are valid, and sanitizes them if not.
# There are two types of invalid tags: 
# 1. if a I tag that is not preceded by a B tag
# 2. if the last tag is a I tag, check if its preceded by a B tag. If not, change it to a O tag.

def _sanitize_bio_labels(bio_tags):
    # Check for invalid tags that start with "I" but are not preceded by "B"
    for i in range(len(bio_tags)):
        tag = bio_tags[i]
        if tag != "O":
            tag = tag.split("-")[0]
            label = bio_tags[i].split("-")[1]
        if bio_tags[i].startswith("I") and bio_tags[i - 1].startswith("O"):
        # Remove the invalid I tag
            bio_tags[i - 1] = "B" + "-" + label

    # Check for invalid tags if the last tag is an I tag and not preceded by a B tag
    # if bio_tags[-1].startswith("I") and not bio_tags[-2].startswith("B"):
    #     # Replace the invalid I tag with an O tag
    #     bio_tags[-1] = "O" + "-" + label

    return bio_tags

In [26]:
from tqdm.notebook import tqdm

for i in tqdm(range(len(all_bio_labels))):
    all_bio_labels[i] = _sanitize_bio_labels(all_bio_labels[i])

  0%|          | 0/209261 [00:00<?, ?it/s]

In [27]:
import random

# choice = random.randint(0, len(all_bio_labels))
choice = 0
for t in zip(all_bio_labels[choice], old_bio_labels[choice]):
    print(t)

('O', 'O')
('O', 'O')
('B-FIRSTNAME', 'B-FIRSTNAME')
('I-FIRSTNAME', 'I-FIRSTNAME')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('B-CURRENCY', 'B-CURRENCY')
('I-CURRENCY', 'I-CURRENCY')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('B-IBAN', 'B-IBAN')
('I-IBAN', 'I-IBAN')
('I-IBAN', 'I-IBAN')
('I-IBAN', 'I-IBAN')
('I-IBAN', 'I-IBAN')
('I-IBAN', 'I-IBAN')
('I-IBAN', 'I-IBAN')
('I-IBAN', 'I-IBAN')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
('O', 'O')
