## Illustrative Notebook : Privacy Leakage in NLP 

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import gensim
import warnings
from PrivacyLeakWE import *
warnings.filterwarnings('ignore')

#### Situation 1 : We imagine that a part of the dataset used by the user is available online and that the attacker know the entire online dataset

#### For this case we have considered a translation dataset from http://www.manythings.org/anki/ mapping english sentences to french sentences.

In [234]:
raw_df=pd.read_csv('english_french.txt',delimiter='\t',encoding='utf-8')
raw_df.columns=['English','French','Licence']
raw_df=raw_df.drop(['Licence'],axis=1)
raw_df.sample(10)

Unnamed: 0,English,French
48972,He used a lot of honey.,Il employait beaucoup de miel.
24544,He is on the radio.,Il passe à la radio.
133538,I'll spend Christmas with my family.,Je passerai Noël avec ma famille.
170923,You don't have to talk about it if you don't w...,Vous n'êtes pas obligés d'en parler si vous ne...
59823,Tom put on his slippers.,Tom mit ses chaussons.
83061,You don't look very strong.,Vous n'avez pas l'air très forts.
80746,That boy displayed no fear.,Ce garçon n'a montré aucune crainte.
75136,What are you doing Monday?,Qu'est-ce que vous faites lundi ?
119109,Is it cheaper to call after nine?,"Est-ce moins cher, d'appeler après neuf heures ?"
130224,The walls were painted light brown.,Les murs étaient peints de couleur ocre.


In [237]:
clean_data=pd.DataFrame(vpunc(vlower(raw_df)))
clean_data.columns=raw_df.columns

print(f'Dataset shape : {clean_data.shape}')

Dataset shape : (179903, 2)


#### Now the dataset is cleaned, we are going to embed english and french sentences

In [251]:
np.random.seed(0)

english=clean_data.English.drop_duplicates()
private_english_data=english
attacker_english_data=(english.sample(frac=0.5))  #a part of the user dataset is public and known by the attacker

french=clean_data.French.drop_duplicates()
private_french_data=french
attacker_french_data=(french.sample(frac=0.5)) #a part of the user dataset is public and known by the attacker

In [252]:
#Preprocessing consisting in splitting sentences into sequences of words
private_english_input=(private_english_data.apply(lambda x: x.split(" ")).values)
attacker_english_input=(attacker_english_data.apply(lambda x: x.split(" ")).values)

private_french_input=(private_french_data.apply(lambda x: x.split(" ")).values)
attacker_french_input=(attacker_french_data.apply(lambda x: x.split(" ")).values)

In [253]:
#private and attacker embeddings for english and french sentences
private_english_model = gensim.models.Word2Vec(sentences=private_english_input)
attacker_english_model = gensim.models.Word2Vec(sentences=attacker_english_input)

private_french_model = gensim.models.Word2Vec(sentences=private_french_input)
attacker_french_model = gensim.models.Word2Vec(sentences=attacker_french_input)

In [342]:
# private_french_model.wv.vocab.keys()

#### Now, we can already test if some sensitive sentence can be recovered by the attacker

In [344]:
privacy_leak('my doctor said that I only have a few days left to live',attacker_english_model,private_english_model)
print('\n')
privacy_leak('My son was ill yesterday',attacker_english_model,private_english_model)
print('\n')
privacy_leak('Il se sent seul',attacker_french_model,private_french_model)
print('\n')
privacy_leak('Votre mot de passe est facile' ,attacker_french_model,private_french_model)

[('my', 0.728559672832489)]
[('doctor', 0.5037205219268799)]
[('happened', 0.6517443656921387)]
[('that', 0.4703316390514374)]
[('i', 0.6370489597320557)]
[('only', 0.6262897253036499)]
[('have', 0.6257230639457703)]
[('a', 0.5301159620285034)]
[('few', 0.532633900642395)]
[('years', 0.5427626371383667)]
[('left', 0.6377806663513184)]
[('to', 0.6008734703063965)]
[('live', 0.5902172327041626)]


[('my', 0.728559672832489)]
[('father', 0.5695924758911133)]
[('was', 0.6523784399032593)]
[('ill', 0.5014663934707642)]
[('yesterday', 0.6793800592422485)]


[('il', 0.6450693607330322)]
[('se', 0.5954228043556213)]
[('sens', 0.5272761583328247)]
[('seul', 0.5937025547027588)]


[('votre', 0.6981475353240967)]
[('mot', 0.47707298398017883)]
[('de', 0.6065523028373718)]
[('passe', 0.47143352031707764)]
[('est', 0.6657159328460693)]
[('facile', 0.4625858664512634)]


##### We can also determine the percentage of words recovered by the attacker considering that a word is recovered when he belongs to the k first most probable words proposed by the attacker model.

In [257]:
results=pd.DataFrame()
english=[]
french=[]

for topn in range(1,6):
    english.append(attack_efficiency(attacker_english_model,private_english_model,topn=topn))
    french.append(attack_efficiency(attacker_french_model,private_french_model,topn=topn))

results['topn']=list(range(1,6))
results['Accuracy_english_dataset']=english
results['Accuracy_french_dataset']=french
results

Unnamed: 0,topn,Accuracy_english_dataset,Accuracy_french_dataset
0,1,0.08,0.1
1,2,0.11,0.14
2,3,0.12,0.16
3,4,0.14,0.18
4,5,0.15,0.19


In [350]:
len(private_english_model.wv.vocab.keys())*0.15

825.75

#### For the english sentences, we see that we can recover between 10% (550 words) and 15%(826 words) of the private vocabulary. This may seem low but as we have seen before, it is enough for capturing sensitive information !

In [351]:
#Save the embeddings
# private_english_model.save('private_english_model_0.5.model')
# private_french_model.save('private_french_model_0.5.model')

In [None]:
from gensim.models import Word2Vec
#Load our embeddings
filename='private_french_model_0.5.model'
model = gensim.models.Word2Vec.load(filename)

#### Situation 2 - The training dataset is entirely private and the attacker has at his disposal a public dataset from the same distribution (for instance sentences of same natures).

#### For simulating this situation we will consider the quora pair dataset which is a set of questions pairs that are potentially duplicates in the sense they express the same question without being exactly formulated the same way. We will consider a random sample of the first questions as our private user dataset and a random sample of the second questions as our attacker dataset. We can imagine that the attacker has made himself this dataset collecting questions in quora.com.

In [354]:
raw_df=pd.read_csv('quora.csv',encoding='utf-8')
private_df=raw_df.sample(frac=0.7).question1.values
attack_df=raw_df.sample(frac=0.5).question2.values

In [370]:
raw_df.groupby(['is_duplicate']).count()['id']

is_duplicate
0    255027
1    149263
Name: id, dtype: int64

##### As we can observe, there are around twice more non duplicates questions than duplicates ones. Hence, our study is reasonable since the dataset are not completely correlated. We can show one case where two questions are completely not duplicates to illustrate that point.

In [375]:
print(raw_df[raw_df['id']==3].question1.values)
print(raw_df[raw_df['id']==3].question2.values)

['Why am I mentally very lonely? How can I solve it?']
['Find the remainder when [math]23^{24}[/math] is divided by 24,23?']


##### As we see, these are two questions completely different. However, there are also a lot of questions that are similar but expressed differently.

In [390]:
raw_df.sample(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
254450,254450,335896,369171,What books were removed from the Bible?,What books are missing from the bible?,1
245940,245940,358885,358886,What is the full life cycle of an enterprise s...,What is full life cycle?,0
48460,48460,86397,86398,What does it mean when the dispatch says bith ...,What does it mean when the dispatch says both ...,1
347578,347578,476062,476063,What are some good things to put on the side o...,Which is the best business 4G data only plan f...,0
95299,95299,158972,158973,What kind of Saree will be suitable for my col...,What is the best farewell speech given by a ju...,0
377690,377690,509022,509023,What is the easiest way to work out fractions?,How do you work out limits of fractions?,0
375689,375689,506738,506739,How do I play billiards well?,Where and when was billiards first played?,0
352235,352235,481144,40044,Which is the less harmful cigarette brand?,Which brands of cigarettes sold in the US cont...,0
181945,181945,278600,278601,What causes facial dimples?,Why do some people have dimples?,1
60766,60766,106219,106220,I lost my original charger of the OnePlus One....,I have a Moto G 2nd gen. Usually in the day I ...,0


In [355]:
private_data=pd.DataFrame(vpunc(vlower(private_df)))
attack_data=pd.DataFrame(vpunc(vlower(attack_df)))
private_data.columns=['sentence']
attack_data.columns=['sentence']

In [356]:
np.random.seed(0)

private_english_data=private_data.sentence.drop_duplicates()
attacker_english_data=attack_data.sentence.drop_duplicates()

private_english_input=(private_english_data.apply(lambda x: x.split(" ")).values)
attacker_english_input=(attacker_english_data.apply(lambda x: x.split(" ")).values)

private_english_model = gensim.models.Word2Vec(sentences=private_english_input)
attacker_english_model = gensim.models.Word2Vec(sentences=attacker_english_input)

In [360]:
privacy_leak('I will contact my lawyer because I am not guilty',attacker_english_model,private_english_model)
print('\n')
privacy_leak('I have a cancer',attacker_english_model,private_english_model)

[('i', 0.7974109649658203)]
[('will', 0.7619689106941223)]
[('contact', 0.6188812255859375)]
[('my', 0.8155895471572876)]
[('lawyer', 0.6596115827560425)]
[('because', 0.767339825630188)]
[('i', 0.7974109649658203)]
[('am', 0.8091138005256653)]
[('not', 0.6581652164459229)]
[('guilty', 0.7783500552177429)]


[('i', 0.7974109649658203)]
[('have', 0.744394838809967)]
[('a', 0.6561230421066284)]
[('cancer', 0.7532939910888672)]


In [392]:
results=pd.DataFrame()
accuracies=[]

for topn in range(1,5):
     accuracies.append(attack_efficiency(attacker_english_model,private_english_model,topn=topn))

results['topn']=list(range(1,5))
results['Accuracy']=accuracies
print(results)

   topn  Accuracy
0     1      0.23
1     2      0.27
2     3      0.29
3     4      0.31


In [396]:
len(private_english_model.wv.vocab.keys())*0.2

3479.2000000000003

##### As we observe, this time we are talking about 20% of recovery of the dataset (which corresponds approximately here  to 3479 words). This is not negligible and as shown above, we can recover very sensitive information.

#### Situation 3 : We consider completely different datasets but that could potentially share some information. For instance, we consider again the quora dataset made of questions asked by some users and an other dataset that an attacker will use which is made of tweets published by some users.

In [397]:
quora_df=pd.read_csv('quora.csv',encoding='utf-8')
twitter_df=pd.read_csv('chat.txt',delimiter='\t',encoding='utf-8')
twitter_df.columns=['sentence']
# quora_df=quora_df.sample(frac=0.015)
private_df=quora_df.question1.values
attack_df=twitter_df.sample(frac=0.2).sentence

In [398]:
private_data=pd.DataFrame(vpunc(vlower(private_df)))
attack_data=pd.DataFrame(vpunc(vlower(attack_df)))
private_data.columns=['sentence']
attack_data.columns=['sentence']
np.random.seed(0)

private_english_data=private_data.sentence.drop_duplicates()
attacker_english_data=attack_data.sentence.drop_duplicates()

private_english_input=(private_english_data.apply(lambda x: x.split(" ")).values)
attacker_english_input=(attacker_english_data.apply(lambda x: x.split(" ")[0:10]).values)

In [400]:
private_english_model = gensim.models.Word2Vec(sentences=private_english_input)
attacker_english_model = gensim.models.Word2Vec(sentences=attacker_english_input)

In [226]:
def attack_efficiency(attack_model,private_model,topn=3,display=False):
    private_words=private_model.wv.vocab.keys()
    s=0
    for word in private_words:
        l=[]
        most_similar_words=attack_model.similar_by_vector(private_model[word], topn=topn, restrict_vocab=None)
        for prop,_ in most_similar_words:
            l.append(prop)
            
        s+= int(word in l)
        if display:
            if(word in l):
                print(word)
        
    return round(s/len(private_words),5)

In [230]:
results=pd.DataFrame()
accuracies=[]

for topn in range(1,5):
    accuracies.append(attack_efficiency(attacker_english_model,private_english_model,topn=topn))

results['topn']=list(range(1,5))
results['Accuracy']=accuracies
print(results)

   topn  Accuracy
0     1   0.00273
1     2   0.00433
2     3   0.00560
3     4   0.00633


#### This time the attack is not really efficient. However nothing tell us that it's not possible to have better results with others word embeddings that are more sophisticated. Moreover, we can still recover private information even with this poor attack as you can see above

In [406]:
privacy_leak('I like trump',attacker_english_model,private_english_model)

[('i', 0.34840452671051025)]
[('like', 0.39880162477493286)]
[('trump', 0.4790140688419342)]
