# Dataset anonymization

##### Import lib for data-working

In [2]:
import pandas as pd

##### Read input file

In [3]:
filename = 'dataset1m.xlsx' # <- filename
df = pd.read_excel(filename)

##### Flags

In [4]:
ANON_JOBS = True
ANON_ADDRESS = True
ANON_SALARY = True
LOCAL_SUPRESS = True

if LOCAL_SUPRESS:
    REQUIRED_K_ANON = 5

SAVE_ANON_FILE = False

##### Anonymization funcs

In [5]:
# Salary anonymization
def salary_local_generalization(x):
    val = int(x)
    if val > 100000:
        return "Very high"
    elif val > 75000:
        return "High"
    elif val > 55000:
        return "Mid"
    else:
        return "Low"

# Name anonymize
def del_attrib(x):
    return '***'

# Address anonymize
def addr_mask(x):
    # all addresses contain " д. "
    return str(x.split(" д. ")[0]) + ' ***'

# Job anonymize
def job_local_generalization(x):
    s = str(x)
    if x == "Административный директор" or x == "Директор по маркетингу" or x == "Финансовый директор" or x == "Генеральный директор":
        return "Директор"
    elif x == "Водитель" or x == "Комендант" or x == "Охранник" or x == "Уборщик" or x == "Секретарь" or x == "Бухгалтер":
        return "Вспомогательный персонал"
    else:
        return s

# Phone anonymize
def phone_local_generalization(x):
    s = str(x)
    if int(s[1:4]) in [929, 921, 931]:
        return "Megafon"
    elif int(s[1:4]) in [911, 981]:
        return "MTS"
    elif int(s[1:4]) in [961, 962, 963, 964, 903, 905, 906, 909, 960]:
        return "Beeline"
    elif int(s[1:4]) in [901, 952, 904, 950, 951]:
        return "Tele2"

if ANON_SALARY:
    df["Заработная плата"] = df["Заработная плата"].apply(salary_local_generalization)

# Direct identifier - always anonymize
df["ФИО работника"] = df["ФИО работника"].apply(del_attrib)


if ANON_ADDRESS:
    df["Адрес работы"] = df["Адрес работы"].apply(addr_mask)

if ANON_JOBS:
    df["Должность"] = df["Должность"].apply(job_local_generalization)

# Direct identifier - always anonymize
df["Номер телефона"] = df["Номер телефона"].apply(phone_local_generalization)

#### Get dictionary [field : frequency] and k-anonymity

In [6]:
def get_dict_and_k_anonimity():
    # my_dictionary is used to count k-anonymity
    my_dictionary = {}
    # key is concatenated row
    def get_row_as_string(ind):
        return str(df["ФИО работника"].iloc[ind]) +\
               str(df["Номер телефона"].iloc[ind]) +\
               str(df["Адрес работы"].iloc[ind]) +\
               str(df["Должность"].iloc[ind]) +\
               str(df["Заработная плата"].iloc[ind])
    for row in range(df.shape[0]):
        s = get_row_as_string(row)
        if s in my_dictionary:
            my_dictionary[s] += 1
        else:
            my_dictionary[s] = 1
    return my_dictionary, min(my_dictionary.values())

##### List of strings of each row

In [7]:
# return list of rows as single string
strings = [''.join(val) for val in df.astype(str).values.tolist()]

#### Output before local suppression

In [8]:
mydict, k_anon = get_dict_and_k_anonimity()

print("k-anonymity = ", k_anon)

# sorted dictionary = list of tuples as [(key, value) for key,value in dict]
sorted_mydict = sorted(mydict.items(), key=lambda x:x[1])
print("Unique rows:", len(mydict))
print("5 least frequent rows:")

# line below extracts from dataframe 5 least frequent lines (because the corresponding keys are strings) and display them as dataframe
display(df.loc[[strings.index(sorted_mydict[i][0]) for i in range(5)]])
print("Frequencies : ", [sorted_mydict[i][1] for i in range(5)])

<class 'dict'>
k-anonymity =  2
<class 'list'>
Unique rows: 4803
5 least frequent rows:


Unnamed: 0,ФИО работника,Номер телефона,Адрес работы,Должность,Заработная плата
641053,***,Tele2,Обводного канала наб. ***,Программист,High
135729,***,Tele2,Обводного канала наб. ***,Программист,Very high
64706,***,Beeline,пр-кт Малый В.О. ***,Программист,Very high
111263,***,MTS,пр-кт Малый В.О. ***,Программист,Very high
464525,***,Beeline,Обводного канала наб. ***,Программист,Very high


Frequencies :  [2, 4, 5, 5, 5]


##### Local suppress

In [10]:
if LOCAL_SUPRESS:
    forDelete = []

    for index in range(df.shape[0]):
        line = strings[index]
        if mydict[line] < REQUIRED_K_ANON:
            #print(line, ">>", index)
            forDelete.append(index)
    print(len(forDelete), " lines need to be suppressed to meet all requirements")

6  lines need to be suppressed to meet all requirements


#### Output after local suppress

In [11]:
if LOCAL_SUPRESS:
    df = df.drop(df.index[forDelete])
    new_dict, new_k_anon = get_dict_and_k_anonimity()
    print("New k-anon = ", new_k_anon)
    print("New unique rows:", len(new_dict))
    sorted_new_dict = sorted(new_dict.items(), key=lambda x:x[1])
    strings = [''.join(val) for val in df.astype(str).values.tolist()]
    print("New 5 least frequent rows:")
    display(df.loc[[strings.index(sorted_new_dict[i][0]) for i in range(5)]])
    print("Frequencies : ", [sorted_new_dict[i][1] for i in range(5)])

New k-anon =  5
New unique rows: 4801
New 5 least frequent rows:


Unnamed: 0,ФИО работника,Номер телефона,Адрес работы,Должность,Заработная плата
64706,***,Beeline,пр-кт Малый В.О. ***,Программист,Very high
111263,***,MTS,пр-кт Малый В.О. ***,Программист,Very high
464524,***,Megafon,пр-кт Невский ***,Строитель,Mid
65407,***,Tele2,ул. Разъезжая ***,Автослесарь,Mid
80577,***,Tele2,ул. Потёмкинская ***,Программист,Very high


Frequencies :  [5, 5, 5, 6, 6]


##### AVG k-anonymity calculation

In [12]:
sum_of_frequencies = 0
def get_dict():
    if LOCAL_SUPRESS:
        return new_dict
    else:
        return mydict
for key in get_dict():
    sum_of_frequencies += get_dict()[key]
print("AVG k-anonymity = ", round(sum_of_frequencies / len(get_dict()), 2))

AVG k-anonymity =  207.79


#### Save anonymized dataset as .xlsx

In [80]:
path = filename.split('.')[0] + 'anonymized.xlsx'

if SAVE_ANON_FILE:
    df.to_excel(path, index = False)