# Text anonymization

Альперович Вадим <br>
ИАД21

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook

import warnings
warnings.simplefilter('ignore')
sns.set(style="darkgrid")

# 1. Data loading

https://tatianashavrina.github.io/2018/08/30/datasets/#ner

https://github.com/natasha/nerus

 *Nerus is a large silver standard Russian corpus annotated with POS tags, syntax trees and NER tags (PER, LOC, ORG).*

In [2]:
# !pip install nerus
# !pip install ipymarkup

In [3]:
from nerus import load_nerus
from ipymarkup import show_box_markup, span
from ipymarkup.span import Span


docs = load_nerus('../data/nerus_lenta.conllu.gz')
# doc = next(docs)
for i, doc in enumerate(docs):
    if i > 1:
        break
    show_box_markup(doc.ner.text, doc.ner.spans)

# 2. SpaCy

https://towardsdatascience.com/nlp-approaches-to-data-anonymization-1fb5bde6b929

In [None]:
import spacy
import spacy.displacy as displacy


In [5]:
docs = load_nerus('../data/nerus_lenta.conllu.gz')
nlp = spacy.load('ru_core_news_sm')
for i, doc in enumerate(docs):
    if i > 1:
        break
    sents = ''
    for sent in doc.sents:
        sents += sent.text
    _doc = nlp(sents)
    displacy.render(_doc, style='ent')

In [171]:
nlp = spacy.load('ru_core_news_sm')
def anonymize(text, anon_tok='ANONYMIZED', nlp=nlp):
    doc = nlp(text)
    spans = []
    diff = 0
    for ent in doc.ents:
        span = Span(start=ent.start_char + diff, 
                    stop=ent.start_char + diff+len(anon_tok), 
                    type=ent.label_)
        spans.append(span)
        text = text.replace(ent.text, anon_tok)
        diff += len(anon_tok) - len(ent.text)
    return text, spans

In [7]:
docs = load_nerus('../data/nerus_lenta.conllu.gz')


for i, doc in enumerate(docs):
    if i > 2:
        break
    sents = ''
    for sent in doc.sents:
        sent = sent.text
        anon_text, spans = anonymize(sent)
        show_box_markup(anon_text, spans)

# 3. Presidio

In [8]:
# !pip install presidio-analyzer

In [507]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, EntityRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.predefined_recognizers import DateRecognizer

configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "ru", "model_name": "ru_core_news_lg"},
               {"lang_code": "en", "model_name": "en_core_web_md"}],
}


provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()


analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                          supported_languages=["ru", "en"])

In [202]:
# help(DateRecognizer)

In [203]:
text = """
Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

On September 18 I visited microsoft.com and sent an email to test@presidio.site,  from the IP 192.168.0.1.

My passport: 191280342 and my phone number: (212) 555-1234.

This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.

"""

In [204]:
from presidio_anonymizer import AnonymizerEngine

anonymizer = AnonymizerEngine()

In [205]:
results = analyzer.analyze(text, language='en')
anon_text = anonymizer.anonymize(text, analyzer_results=results)
print(anon_text.text)


Hello, my name is <PERSON> and I live in <LOCATION>.
My credit card number is <CREDIT_CARD> and my crypto wallet id is <CRYPTO>.

On <DATE_TIME> I visited <DOMAIN_NAME> and sent an email to <EMAIL_ADDRESS>,  from the IP <IP_ADDRESS>.

My passport: <US_PASSPORT> and my phone number: <PHONE_NUMBER>.

This is a valid International Bank Account Number: <IBAN_CODE> . Can you please check the status on bank account <US_BANK_NUMBER>?

<PERSON>'s social security number is <US_SSN>.  Her driver license? it is <PERSON>


In [206]:
text = """
Здравствуйте, меня зовут Дэвид Джонсон, и я живу в штате Мэн.
Номер моей кредитной карты - 4095-2609-9393-4932, а идентификатор моего криптокошелька - 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

18 сентября я посетил сайт microsoft.com и отправил письмо на адрес test@presidio.site с IP 192.168.0.1.

Мой паспорт: 951244 и мой номер телефона: (212) 555-1234.

Это действительный номер счета в международном банке: IL150120690000003111111 . Не могли бы вы проверить состояние банковского счета 954567876544?

Номер социального страхования Кейт - 078-05-1126.  Ее водительские права - 1234567A.

Переведено с помощью www.DeepL.com/Translator (бесплатная версия)
"""

In [207]:
results = analyzer.analyze(text, language='ru')
anon_text = anonymizer.anonymize(text, analyzer_results=results)
print(anon_text.text)


Здравствуйте, меня зовут <PERSON>, и я живу в штате <LOCATION>.
Номер моей кредитной карты - <CREDIT_CARD>, а идентификатор моего криптокошелька - <CRYPTO>.

18 сентября я посетил сайт <DOMAIN_NAME> и отправил письмо на адрес <EMAIL_ADDRESS> с IP <IP_ADDRESS>.

Мой паспорт: 951244 и мой номер телефона: <PHONE_NUMBER>.

Это действительный номер счета в международном банке: <IBAN_CODE> . Не могли бы вы проверить состояние банковского счета 954567876544?

Номер социального страхования <PERSON>.  Ее водительские права - 1234567A.

Переведено с помощью <DOMAIN_NAME>/Translator (бесплатная версия)



# 4. HTML anonymization

In [991]:
from IPython import html

def get_box(text, color='green'):
#     return f"""<font color="{color}">[{text}]</font>"""
    return f"[{text}]"

In [992]:
from presidio_anonymizer.entities.engine import OperatorConfig


operators={"PERSON": OperatorConfig(operator_name="replace", 
                                    params={"new_value": get_box('PERSON', 'green')}),
           "LOCATION": OperatorConfig(operator_name="replace", 
                                      params={"new_value": get_box('LOC', 'orange')}),
           "ORGANIZATION": OperatorConfig(operator_name="replace", 
                                      params={"new_value": get_box('ORG', 'red')}),
           "ORG": OperatorConfig(operator_name="replace", 
                                      params={"new_value": get_box('ORG', 'red')}),
          }

In [1009]:
from bs4 import BeautifulSoup, NavigableString
import requests

url = "https://ria.ru/20211102/kino-1757235169.html"

html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
for x in soup.find_all('a'):
    new_tag = soup.new_tag('a')
    if x.string:
        new_tag = x.string 
    x.replace_with(new_tag)
html_content = str(soup)

In [1013]:
def anonymize_text(text):
    results = analyzer.analyze(text, language='ru')
    anon_text = anonymizer.anonymize(text, 
                                     analyzer_results=results,
                                     operators=operators)
    return anon_text.text

def anonymize_html_object(html_obj, *args):
    for x in list(html_obj.strings):
        x.replaceWith(anonymize_text(str(x)))
    return None

In [1014]:
title = soup.find(attrs={'class': 'article__title'})
second_title = soup.find(attrs={'class': 'article__second-title'})
body = soup.find_all('div', attrs={'class': 'article__text'})

In [1015]:
try:
    html_content = anonymize_html_object(title, html_content)
except Exception as e:
    print(e)
try:
    html_content = anonymize_html_object(second_title, html_content)
except Exception as e:
    print(e)

'NoneType' object has no attribute 'strings'


In [1016]:
for content in body:
    html_content = anonymize_html_object(content, html_content)

In [1022]:
with open(f"театр.html", "w", encoding='utf8') as f:
    f.write(str(soup))
#     f.write(html_content)