In [None]:
def get_spans(df, partition, scale=None):
    """
    Compute the span of every column of `df` restricted to `partition`.

    A column whose name is in the module-level `categorical` set has the
    number of distinct values as its span; every other column uses max - min.

    :param        df: the dataframe for which to calculate the spans
    :param partition: the row labels for which to calculate the spans
    :param     scale: if given, the span of each column is divided
                      by the value in `scale` for that column
    :        returns: dict mapping each column name to its (scaled) span
    """
    def raw_span(name):
        # values of one column, limited to the rows of the partition
        values = df[name][partition]
        if name in categorical:
            return len(values.unique())
        return values.max() - values.min()

    spans = {}
    for name in df.columns:
        width = raw_span(name)
        spans[name] = width if scale is None else width / scale[name]
    return spans
'''def get_spans(df, partition, scale=None):
    spans = {}
    for column in feature_columns:
        if df[column].dtype.name == "category":
            span = len(df[column][partition].unique())
        else:
            span = (
                df[column][partition].max() - df[column][partition].min()
            )
        if scale is not None:
            span = span / scale[column]
        spans[column] = span
    return spans'''
def split(df, partition, column):
    """
    Split a partition in two along one column.

    Categorical columns (names in the module-level `categorical` set) are
    split by assigning the first half of the distinct values to the left part
    and the rest to the right part. Other columns are split at the median:
    values below it go left, values at or above it go right.

    :param        df: The dataframe to split
    :param partition: The partition to split
    :param    column: The column along which to split
    :        returns: A tuple (left index, right index)
    """
    column_values = df[column][partition]
    row_labels = column_values.index

    if column not in categorical:
        pivot = column_values.median()
        left_rows = row_labels[column_values < pivot]
        right_rows = row_labels[column_values >= pivot]
        return (left_rows, right_rows)

    distinct = column_values.unique()
    half = len(distinct) // 2
    left_values = set(distinct[:half])
    right_values = set(distinct[half:])
    return row_labels[column_values.isin(left_values)], row_labels[column_values.isin(right_values)]
def is_k_anonymous(df, partition, sensitive_column, k=3):
    """
    Check the k-anonymity criterion for one partition.

    Only the size of `partition` is inspected; `df` and `sensitive_column`
    are accepted so the function fits the `is_valid` callback signature.

    :param               df: The dataframe on which to check the partition.
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column
    :param                k: The desired k
    :returns               : True if the partition holds at least k rows, False otherwise.
    """
    too_small = len(partition) < k
    return not too_small

def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    """
    Greedily split the dataframe into partitions that all satisfy `is_valid`.

    Starting from the whole index, each pending partition is split along the
    column with the largest (scaled) span for which both halves are valid.
    A partition for which no column yields two valid halves is final.

    :param               df: The dataframe to be partitioned.
    :param  feature_columns: A list of column names along which to partition the dataset.
    :param sensitive_column: The name of the sensitive column (to be passed on to the `is_valid` function)
    :param            scale: The column spans as generated before.
    :param         is_valid: A function called as `is_valid(df, partition, sensitive_column)` that
                             returns True if the partition is valid.
    :returns               : A list of valid partitions that cover the entire dataframe.
    """
    from collections import deque

    finished_partitions = []
    # FIFO work queue: deque pops from the left in O(1), list.pop(0) is O(n)
    pending = deque([df.index])
    while pending:
        partition = pending.popleft()
        spans = get_spans(df[feature_columns], partition, scale)
        # try the columns from widest to narrowest span
        for column, _ in sorted(spans.items(), key=lambda item: -item[1]):
            lp, rp = split(df, partition, column)
            if is_valid(df, lp, sensitive_column) and is_valid(df, rp, sensitive_column):
                pending.extend((lp, rp))
                break
        else:
            # no column produced two valid halves: keep the partition as it is
            finished_partitions.append(partition)
    return finished_partitions
def agg_categorical_column(series):
    """
    Collapse a categorical column into one comma-separated string.

    The distinct values are sorted before joining so the result is the same
    on every run (joining a bare set follows the set's arbitrary order).

    :param series: iterable of string values
    :returns     : a one-element list holding the joined string
    """
    return [','.join(sorted(set(series)))]

def agg_numerical_column(series):
    """
    Collapse a numerical column into its mean.

    :param series: object exposing a `mean()` method (e.g. a pandas Series)
    :returns     : a one-element list holding the mean
    """
    average = series.mean()
    return [average]
def build_anonymized_dataset(df, partitions, feature_columns, sensitive_column, max_partitions=None):
    """
    Build the anonymized dataframe from a list of partitions.

    Every partition is aggregated per feature column (joined values for
    columns in the module-level `categorical` set, mean otherwise). The
    aggregated record is emitted once per sensitive value that occurs in the
    partition, together with the number of rows carrying that value.

    :param               df: The original dataframe.
    :param       partitions: The partitions (row labels) to aggregate.
    :param  feature_columns: The columns to aggregate.
    :param sensitive_column: The name of the sensitive column.
    :param   max_partitions: If given, at most this many partitions are processed.
    :returns               : A dataframe with one row per aggregated record and
                             sensitive value with a non-zero count.
    """
    aggregations = {
        column: agg_categorical_column if column in categorical else agg_numerical_column
        for column in feature_columns
    }
    rows = []
    for i, partition in enumerate(partitions):
        if i % 100 == 1:
            print("Finished {} partitions...".format(i))
        # i partitions are already done here; the former `i > max_partitions`
        # check let one partition too many through
        if max_partitions is not None and i >= max_partitions:
            break
        part = df.loc[partition]
        # NOTE(review): `squeeze=False` is passed through to pandas unchanged;
        # confirm the installed pandas version accepts it.
        grouped_columns = part.agg(aggregations, squeeze=False)
        sensitive_counts = part.groupby(sensitive_column).agg({sensitive_column: 'count'})

        # agg may return a list of frames or a single object; normalise both
        # shapes to a list of plain dicts
        if isinstance(grouped_columns, list):
            records = [group_df.iloc[0].to_dict() for group_df in grouped_columns]
        else:
            records = [grouped_columns.to_dict()]

        for values in records:
            for sensitive_value, count in sensitive_counts[sensitive_column].items():
                if count == 0:
                    continue
                rows.append({**values, sensitive_column: sensitive_value, 'count': count})

    return pd.DataFrame(rows)
def diversity(df, partition, column):
    """Return the number of distinct values of `column` among the rows of `partition`."""
    values = df[column][partition]
    distinct = values.unique()
    return len(distinct)

def is_l_diverse(df, partition, sensitive_column, l=2):
    """
    Check whether a partition holds at least `l` distinct sensitive values.

    :param               df: The dataframe for which to check l-diversity
    :param        partition: The partition of the dataframe on which to check l-diversity
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum required diversity of sensitive attribute values in the partition
    :returns               : True if `diversity` reports at least `l` distinct values
    """
    distinct_values = diversity(df, partition, sensitive_column)
    return distinct_values >= l

def t_closeness(df, partition, column, global_freqs):
    """
    Return the largest absolute gap between the relative frequency of a value
    of `column` inside `partition` and its global frequency.

    Values listed in `global_freqs` that do not occur in the partition count
    with a local frequency of 0; previously they were skipped whenever the
    groupby did not report them, which understated the distance.

    :param           df: The dataframe holding the data
    :param    partition: The row labels of the partition
    :param       column: The column whose distribution is compared
    :param global_freqs: Mapping value -> relative frequency over the whole dataset
    :returns           : The maximum distance, or None if no value is known at all
    :raises KeyError         : if the partition holds a value missing from `global_freqs`
    :raises ZeroDivisionError: if the partition is empty while values are known
    """
    total_count = float(len(partition))
    local_counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    # union of globally known values and values seen locally, without duplicates
    all_values = dict.fromkeys([*global_freqs, *local_counts])
    distances = [
        abs(local_counts.get(value, 0) / total_count - global_freqs[value])
        for value in all_values
    ]
    return max(distances, default=None)

def is_t_close(df, partition, sensitive_column, global_freqs, p=0.2):
    """
    Check whether the sensitive value distribution of a partition stays within
    distance `p` of the global distribution.

    :param               df: The dataframe for which to check t-closeness
    :param        partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param     global_freqs: The global frequencies of the sensitive attribute values
    :param                p: The maximum allowed distance as computed by `t_closeness`
    :raises ValueError     : if `sensitive_column` is not in the module-level `categorical` set
    """
    if sensitive_column not in categorical:
        raise ValueError("this method only works for categorical values")
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    return distance <= p

**For Adult Dataset**

In [None]:

import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
# column names of the dataset, in file order (the file itself has no header row)
names = (
    'age',
    'workclass',  # Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
)

# columns holding categories rather than numbers; they get special treatment
categorical = {
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
    'native-country',
    'race',
    'income',
}
path = "/content/drive/My Drive/Data Privacy Assignment/Data Priv_dataset/Adult_test.csv"
# load the data with pandas, supplying the column names ourselves
df = pd.read_csv(
    path,
    header=None,
    names=names,
    engine='python',
)
# store the categorical columns with the pandas "category" dtype
for column_name in categorical:
    df[column_name] = df[column_name].astype("category")
df

Mounted at /content/drive


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


K=3

In [None]:
def is_k_anonymous(df, partition, sensitive_column, k=3):
    """
    Check the k-anonymity criterion for one partition.

    Only the size of `partition` is inspected; `df` and `sensitive_column`
    are accepted so the function fits the `is_valid` callback signature.

    :param               df: The dataframe on which to check the partition.
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column
    :param                k: The desired k
    :returns               : True if the partition holds at least k rows, False otherwise.
    """
    too_small = len(partition) < k
    return not too_small
# spans of all columns over the complete dataset, used as the scale below
full_spans = get_spans(df, partition=df.index)
# partition along four quasi-identifier columns, with "occupation" as the sensitive attribute
feature_columns = ['age', 'education', 'marital-status', 'race']
sensitive_column = 'occupation'
finished_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    scale=full_spans,
    is_valid=is_k_anonymous,
)
#dfn_adult = build_anonymized_dataset(df, finished_partitions, feature_columns, sensitive_column)
#dfn_adult

In [None]:
def count_anonymity(
    df, partitions, feature_columns, sensitive_column, max_partitions=None
):
    """
    Aggregate every partition and list its sensitive value counts.

    Feature columns of dtype "category" are aggregated with
    `agg_categorical_column`, all others with `agg_numerical_column`. The
    aggregated record is emitted once per sensitive value that occurs in the
    partition, with the number of rows carrying it under the key 'count'.

    :param               df: The original dataframe.
    :param       partitions: The partitions (row labels) to aggregate.
    :param  feature_columns: The columns to aggregate.
    :param sensitive_column: The name of the sensitive column.
    :param   max_partitions: If given, at most this many partitions are processed.
    :returns               : A list of dicts, one per aggregated record and
                             sensitive value with a non-zero count.
    """
    def unwrap(cell):
        # the aggregators wrap their result in a one-element list
        return cell[0] if isinstance(cell, list) and len(cell) > 0 else cell

    aggregations = {
        column: (
            agg_categorical_column
            if df[column].dtype.name == "category"
            else agg_numerical_column
        )
        for column in feature_columns
    }
    aggregations[sensitive_column] = "count"
    rows = []
    for i, partition in enumerate(partitions):
        # i partitions are already done here; the former `i > max_partitions`
        # check let one partition too many through
        if max_partitions is not None and i >= max_partitions:
            break
        part = df.loc[partition]
        # NOTE(review): `squeeze=False` is passed through to pandas unchanged;
        # confirm the installed pandas version accepts it.
        grouped_columns = part.agg(aggregations, squeeze=False)
        sensitive_counts = part.groupby(sensitive_column).agg({sensitive_column: 'count'})

        # agg may return a list of frames or a single object; treat both alike
        records = grouped_columns if isinstance(grouped_columns, list) else [grouped_columns]
        for record in records:
            values = record.apply(unwrap).to_dict()
            for sensitive_value, count in sensitive_counts[sensitive_column].items():
                if count == 0:
                    continue
                rows.append({**values, sensitive_column: sensitive_value, 'count': count})

    return rows
dfn_ad_count = count_anonymity(
    df, finished_partitions, feature_columns, sensitive_column
)
# order the aggregated records by their age value
sort_dfn_cnt = list(dfn_ad_count)
sort_dfn_cnt.sort(key=lambda record: record['age'])
sort_dfn_cnt[:5]

[{'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': '?',
  'count': 29},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Adm-clerical',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Craft-repair',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Farming-fishing',
  'count': 4},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Handlers-cleaners',
  'count': 8}]

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KDTree
#!pip install category_encoders
import category_encoders as ce

class RecordLinkage:
    """
    Nearest-neighbour linkage between a released table and attacker knowledge.

    Both frames are one-hot encoded with one shared encoder that is fitted on
    their concatenation; `execute` then queries a KD-tree built on the encoded
    released table with the encoded knowledge records.
    """

    def __init__(self, df, knowledge):
        """
        :param        df: the released (anonymized) dataframe
        :param knowledge: the attacker's dataframe with the same columns
        """
        self.df = df
        self.knowledge = knowledge

        # Only object-dtype columns are encoded. The former
        # `(df.dtypes == "object").keys()` returned every column, because
        # keys() yields the index of the boolean Series regardless of its
        # values, so numeric columns were one-hot encoded as well.
        categories = self._object_columns(df)
        self.enc = ce.OneHotEncoder(cols=categories, drop_invariant=False)
        df_concat = pd.concat([self.df, self.knowledge], ignore_index=True)
        self.enc.fit(df_concat)

    @staticmethod
    def _object_columns(df):
        """Return the names of the columns of `df` whose dtype is object."""
        is_object = df.dtypes == "object"
        return [column for column, flag in is_object.items() if flag]

    def execute(self, k=3):
        """
        Find the k nearest released records for every knowledge record.

        :param k: number of neighbours to return per knowledge record
        :returns: tuple (dist, index) as returned by `KDTree.query`
        """
        enc_df = self.enc.transform(self.df).astype("float64").values
        enc_knowledge = self.enc.transform(self.knowledge).astype("float64").values

        tree = KDTree(enc_df)
        dist, index = tree.query(enc_knowledge, k=k)
        return dist, index


def attack(df, knowledge, k=3):
    """
    Link every knowledge record to its k nearest records in `df`.

    Knowledge records whose first returned neighbour lies farther away than
    the median of those first distances are treated as unlinked and get -1 in
    every column.

    :param        df: the released (anonymized) dataframe
    :param knowledge: the attacker's dataframe with the same columns
    :param         k: number of neighbours per knowledge record (default 3)
    :returns        : tuple (matches, [matches]); `matches` is an integer
                      dataframe with one row per knowledge record and k
                      columns of row positions in `df`
    """
    linkage = RecordLinkage(df, knowledge)
    dist, index = linkage.execute(k)

    # columns 0..k-1 hold the neighbour positions, columns k..2k-1 their distances
    di = pd.DataFrame(np.hstack((index, dist)))

    # column k is the distance of the first neighbour; it was hard-coded as 3,
    # which is only correct for k == 3
    first_distance = di[k]
    di.loc[first_distance > first_distance.median(), :] = -1

    matches = di.iloc[:, 0:k].astype(int)
    return matches, [matches.copy()]

path1 = "/content/drive/My Drive/Data Privacy Assignment/adult_attack2.csv"
dfa_attack = pd.read_csv(path1)

# Anonymized data, restricted to the quasi-identifier columns
dfad_cnt_k3 = pd.DataFrame(sort_dfn_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dfad_cnt_k3, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dfad_cnt_k3.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3
        0      1      2
0      -1     -1     -1
1    1204   1203   1208
2    2015   2013   2014
3   10969  10967  10968
4    6718   6716   6717
5   11055  11053  11054
6     239    237    238
7      -1     -1     -1
8   11015  11013  11014
9     138    135    136
10    108    110    109
27


K=3;L=2

In [None]:
def diversity(df, partition, column):
    """Return the number of distinct values of `column` among the rows of `partition`."""
    values = df[column][partition]
    distinct = values.unique()
    return len(distinct)

def is_l_diverse(df, partition, sensitive_column, l=2):
    """
    Check whether a partition holds at least `l` distinct sensitive values.

    :param               df: The dataframe for which to check l-diversity
    :param        partition: The partition of the dataframe on which to check l-diversity
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum required diversity of sensitive attribute values in the partition
    :returns               : True if `diversity` reports at least `l` distinct values
    """
    distinct_values = diversity(df, partition, sensitive_column)
    return distinct_values >= l
# partition again; a split is accepted only if both halves are k-anonymous and l-diverse
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
# order the aggregated records by their age value
sort_dfl_cnt = list(dfl_ad_count)
sort_dfl_cnt.sort(key=lambda record: record['age'])

In [None]:
# Anonymized data, restricted to the quasi-identifier columns
dflad_cnt_k3l2 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k3l2, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dflad_cnt_k3l2.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

        0      1      2
0      -1     -1     -1
1    1207   1216   1203
2    2007   2006   2008
3   10913  10911  10912
4    6690   6688   6689
5   10997  10995  10996
6     239    237    238
7      -1     -1     -1
8   10958  10956  10957
9     138    135    136
10    110    108    109
27


K=3;L=4

In [None]:
def diversity(df, partition, column):
    """Return the number of distinct values of `column` among the rows of `partition`."""
    values = df[column][partition]
    distinct = values.unique()
    return len(distinct)

def is_l_diverse(df, partition, sensitive_column, l=4):
    """
    Check whether a partition holds at least `l` distinct sensitive values.

    :param               df: The dataframe for which to check l-diversity
    :param        partition: The partition of the dataframe on which to check l-diversity
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum required diversity of sensitive attribute values in the partition
    :returns               : True if `diversity` reports at least `l` distinct values
    """
    distinct_values = diversity(df, partition, sensitive_column)
    return distinct_values >= l
# partition again; a split is accepted only if both halves are k-anonymous and l-diverse
finished_l_diverse_partitions = partition_dataset(df, feature_columns, sensitive_column, full_spans, lambda *args: is_k_anonymous(*args) and is_l_diverse(*args))
dfl_ad_count = count_anonymity(df, finished_l_diverse_partitions, feature_columns, sensitive_column)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda x: x['age'])

# Anonymized data, restricted to the quasi-identifier columns
dflad_cnt_k3l4 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k3l4, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dflad_cnt_k3l4.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

        0      1      2
0      -1     -1     -1
1    1127   1128   1118
2    1908   1906   1909
3      -1     -1     -1
4    6263   6265   6262
5      -1     -1     -1
6     269    265    270
7      -1     -1     -1
8   10140  10137  10139
9      88    133     87
10    109    104    103
21


K=3;L=12

In [None]:
def diversity(df, partition, column):
    """Return the number of distinct values of `column` among the rows of `partition`."""
    values = df[column][partition]
    distinct = values.unique()
    return len(distinct)

def is_l_diverse(df, partition, sensitive_column, l=12):
    """
    Check whether a partition holds at least `l` distinct sensitive values.

    :param               df: The dataframe for which to check l-diversity
    :param        partition: The partition of the dataframe on which to check l-diversity
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum required diversity of sensitive attribute values in the partition
    :returns               : True if `diversity` reports at least `l` distinct values
    """
    distinct_values = diversity(df, partition, sensitive_column)
    return distinct_values >= l
# partition again; a split is accepted only if both halves are k-anonymous and l-diverse
finished_l_diverse_partitions = partition_dataset(df, feature_columns, sensitive_column, full_spans, lambda *args: is_k_anonymous(*args) and is_l_diverse(*args))
dfl_ad_count = count_anonymity(df, finished_l_diverse_partitions, feature_columns, sensitive_column)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda x: x['age'])

# Anonymized data, restricted to the quasi-identifier columns
# NOTE(review): the name says "l4" although this cell runs with l=12; kept
# because code outside this view may refer to it.
dflad_cnt_k3l4 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k3l4, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dflad_cnt_k3l4.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

      0    1    2
0    98  115   99
1   403  449  402
2   827  824  825
3    98  115   99
4    98  115   99
5    98  115   99
6   235  234  233
7    98  115   99
8    98  115   99
9    79   77   78
10   98  115   99
12


K=3;P=0.15

In [None]:
# relative frequency of every sensitive value over the whole dataset
total_count = float(len(df))
group_counts = df.groupby(sensitive_column)[sensitive_column].agg('count')
global_freqs = {
    value: count / total_count
    for value, count in group_counts.to_dict().items()
}

def t_closeness(df, partition, column, global_freqs):
    """
    Return the largest absolute gap between the relative frequency of a value
    of `column` inside `partition` and its global frequency.

    Values listed in `global_freqs` that do not occur in the partition count
    with a local frequency of 0; previously they were skipped whenever the
    groupby did not report them, which understated the distance.

    :param           df: The dataframe holding the data
    :param    partition: The row labels of the partition
    :param       column: The column whose distribution is compared
    :param global_freqs: Mapping value -> relative frequency over the whole dataset
    :returns           : The maximum distance, or None if no value is known at all
    :raises KeyError         : if the partition holds a value missing from `global_freqs`
    :raises ZeroDivisionError: if the partition is empty while values are known
    """
    total_count = float(len(partition))
    local_counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    # union of globally known values and values seen locally, without duplicates
    all_values = dict.fromkeys([*global_freqs, *local_counts])
    distances = [
        abs(local_counts.get(value, 0) / total_count - global_freqs[value])
        for value in all_values
    ]
    return max(distances, default=None)

def is_t_close(df, partition, sensitive_column, global_freqs, p=0.15):
    """
    Check whether the sensitive value distribution of a partition stays within
    distance `p` of the global distribution.

    :param               df: The dataframe for which to check t-closeness
    :param        partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param     global_freqs: The global frequencies of the sensitive attribute values
    :param                p: The maximum allowed distance as computed by `t_closeness`
    :raises ValueError     : if `sensitive_column` is not in the module-level `categorical` set
    """
    if sensitive_column not in categorical:
        raise ValueError("this method only works for categorical values")
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    return distance <= p

# partition again; a split is accepted only if both halves are k-anonymous and t-close
finished_t_close_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_t_close(*args, global_freqs),
)
dfp_ad_count = count_anonymity(
    df, finished_t_close_partitions, feature_columns, sensitive_column
)
# order the aggregated records by their age value
sort_dfp_cnt = list(dfp_ad_count)
sort_dfp_cnt.sort(key=lambda record: record['age'])

In [None]:
# Anonymized data, restricted to the quasi-identifier columns
# NOTE(review): the name says "p02" although is_t_close in this section
# defaults to p=0.15; kept because code outside this view may refer to it.
dfpad_cnt_k3p02 = pd.DataFrame(sort_dfp_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dfpad_cnt_k3p02, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dfpad_cnt_k3p02.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     82   100    87
1    357   356   355
2    699   708   703
3     82   100    87
4   2570  2569  2568
5     82   100    87
6    341   344   342
7     82   100    87
8     82   100    87
9     52    45    53
10    82   100    87
15


K=10

In [None]:
def is_k_anonymous(df, partition, sensitive_column, k=10):
    """
    Check the k-anonymity criterion for one partition.

    Only the size of `partition` is inspected; `df` and `sensitive_column`
    are accepted so the function fits the `is_valid` callback signature.

    :param               df: The dataframe on which to check the partition.
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column
    :param                k: The desired k
    :returns               : True if the partition holds at least k rows, False otherwise.
    """
    too_small = len(partition) < k
    return not too_small
# partition again with the k-anonymity check defined in this cell
finished_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    is_k_anonymous,
)
dfn_ad_count = count_anonymity(
    df, finished_partitions, feature_columns, sensitive_column
)
# order the aggregated records by their age value
sort_dfn_cnt = list(dfn_ad_count)
sort_dfn_cnt.sort(key=lambda record: record['age'])
sort_dfn_cnt[:5]

[{'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': '?',
  'count': 29},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Adm-clerical',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Craft-repair',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Farming-fishing',
  'count': 4},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Handlers-cleaners',
  'count': 8}]

In [None]:
# Anonymized data, restricted to the quasi-identifier columns
dfad_cnt_k10 = pd.DataFrame(sort_dfn_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dfad_cnt_k10, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dfad_cnt_k10.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    968   970   967
2   1685  1686  1678
3     -1    -1    -1
4   5521  5535  5520
5     -1    -1    -1
6    371   369   370
7     -1    -1    -1
8     -1    -1    -1
9    120   130   118
10    96    98    97
18


L=2;K=10

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=2):
    """
    Check whether a partition holds at least `l` distinct sensitive values.

    :param               df: The dataframe for which to check l-diversity
    :param        partition: The partition of the dataframe on which to check l-diversity
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum required diversity of sensitive attribute values in the partition
    :returns               : True if `diversity` reports at least `l` distinct values
    """
    distinct_values = diversity(df, partition, sensitive_column)
    return distinct_values >= l
# partition again; a split is accepted only if both halves are k-anonymous and l-diverse
finished_l_diverse_partitions = partition_dataset(df, feature_columns, sensitive_column, full_spans, lambda *args: is_k_anonymous(*args) and is_l_diverse(*args))
dfl_ad_count = count_anonymity(df, finished_l_diverse_partitions, feature_columns, sensitive_column)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda x: x['age'])
# Anonymized data, restricted to the quasi-identifier columns
dflad_cnt_k10l2 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k10l2, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dflad_cnt_k10l2.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    968   974   967
2   1686  1678  1682
3     -1    -1    -1
4   5518  5531  5517
5     -1    -1    -1
6    375   377   376
7     -1    -1    -1
8     -1    -1    -1
9    118   129   122
10   214   213   212
18


L=4,k=10

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=4):
    """
    Check whether a partition holds at least `l` distinct sensitive values.

    :param               df: The dataframe for which to check l-diversity
    :param        partition: The partition of the dataframe on which to check l-diversity
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum required diversity of sensitive attribute values in the partition
    :returns               : True if `diversity` reports at least `l` distinct values
    """
    distinct_values = diversity(df, partition, sensitive_column)
    return distinct_values >= l
# partition again; a split is accepted only if both halves are k-anonymous and l-diverse
finished_l_diverse_partitions = partition_dataset(df, feature_columns, sensitive_column, full_spans, lambda *args: is_k_anonymous(*args) and is_l_diverse(*args))
dfl_ad_count = count_anonymity(df, finished_l_diverse_partitions, feature_columns, sensitive_column)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda x: x['age'])
# Anonymized data, restricted to the quasi-identifier columns
dflad_cnt_k10l4 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]

# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k10l4, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dflad_cnt_k10l4.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    972   974   973
2   1681  1690  1680
3     -1    -1    -1
4   5520  5544  5519
5     -1    -1    -1
6    376   375   374
7     -1    -1    -1
8     -1    -1    -1
9    121   136   122
10   213   212   216
18


K=10;L=12

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=12):
    """
    Check whether a partition holds at least `l` distinct sensitive values.

    :param               df: The dataframe for which to check l-diversity
    :param        partition: The partition of the dataframe on which to check l-diversity
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum required diversity of sensitive attribute values in the partition
    :returns               : True if `diversity` reports at least `l` distinct values
    """
    distinct_values = diversity(df, partition, sensitive_column)
    return distinct_values >= l
# partition again; a split is accepted only if both halves are k-anonymous and l-diverse
finished_l_diverse_partitions = partition_dataset(df, feature_columns, sensitive_column, full_spans, lambda *args: is_k_anonymous(*args) and is_l_diverse(*args))
dfl_ad_count = count_anonymity(df, finished_l_diverse_partitions, feature_columns, sensitive_column)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda x: x['age'])
# Anonymized data, restricted to the quasi-identifier columns
# NOTE(review): the name says "k20l4" although this cell runs with k=10 and
# l=12; kept because code outside this view may refer to it.
dflad_cnt_k20l4 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
#print(dflad_cnt_k20l4.head(50))
# this is the attacker's knowledge on the same columns
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k20l4, knowledge)
print(rl)

# Count attribute-level agreements between every knowledge record and the
# anonymized records linked to it. Loop bounds are taken from the data rather
# than hard-coded (11 records, 3 neighbours, 4 attributes).
candidates = ab[0]
link_cnt = 0
for i in range(len(knowledge)):
    for j in candidates.columns:
        x = candidates[j][i]
        if x == -1:
            # attack() marked this knowledge record as unlinked
            continue
        for column in feature_columns:
            if dflad_cnt_k20l4.loc[x][column] == knowledge.loc[i][column]:
                link_cnt += 1

print(link_cnt)

      0    1    2
0    98  115   99
1   403  449  402
2   827  824  825
3    98  115   99
4    98  115   99
5    98  115   99
6   235  234  233
7    98  115   99
8    98  115   99
9    79   77   78
10   98  115   99
12


k=10;p=0.1

In [None]:
# relative frequency of every sensitive value over the whole dataset
total_count = float(len(df))
group_counts = df.groupby(sensitive_column)[sensitive_column].agg('count')
global_freqs = {
    value: count / total_count
    for value, count in group_counts.to_dict().items()
}

def t_closeness(df, partition, column, global_freqs):
    """
    Return the largest absolute gap between a value's relative frequency
    inside the partition and its global frequency.

    Every value listed in `global_freqs` takes part in the comparison: a value
    that does not occur in the partition has a partition frequency of 0, so
    its gap equals its global frequency.

    :param           df: The dataframe that contains the partition
    :param    partition: The row labels (as accepted by `df.loc`) of the partition
    :param       column: The name of the column whose distribution is compared
    :param global_freqs: Mapping from each value of `column` to its global frequency
    :returns           : The maximum absolute frequency gap, or None if there is nothing to compare
    :raises    KeyError: If the partition holds a value that is missing from `global_freqs`
    """
    total_count = float(len(partition))
    if total_count == 0:
        # An empty partition has no distribution to compare.
        return None
    group_counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    # The union keeps the KeyError for partition values unknown to
    # global_freqs while also covering values absent from the partition.
    values = set(group_counts) | set(global_freqs)
    return max(
        (abs(group_counts.get(value, 0) / total_count - global_freqs[value])
         for value in values),
        default=None,
    )

def is_t_close(df, partition, sensitive_column, global_freqs, p=0.15):
    """
    Tell whether a partition's sensitive-value distribution stays within
    distance `p` of the global distribution.

    :param               df: The dataframe for which to check t-closeness
    :param        partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param     global_freqs: The global frequencies of the sensitive attribute values
    :param                p: The maximum allowed distance, as measured by `t_closeness`
    :returns               : True if the `t_closeness` distance is at most `p`, False otherwise.
    :raises      ValueError: If `sensitive_column` is not listed in `categorical`
    """
    if sensitive_column not in categorical:
        raise ValueError("this method only works for categorical values")
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    return distance <= p

# Partition the data so that every partition passes both the k-anonymity and
# the t-closeness check that are currently defined.
finished_t_close_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_t_close(*args, global_freqs),
)
dfp_ad_count = count_anonymity(
    df, finished_t_close_partitions, feature_columns, sensitive_column
)
sort_dfp_cnt = sorted(dfp_ad_count, key=lambda row: row['age'])
# Anonymized data
dfpad_cnt_k10p02 = pd.DataFrame(sort_dfp_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dfpad_cnt_k10p02, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dfpad_cnt_k10p02.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     85   122    91
1    355   357   356
2    699   700   697
3     85   122    91
4   2547  2545  2546
5     85   122    91
6    352   354   353
7     85   122    91
8     85   122    91
9     51    53    52
10    85   122    91
15


K=20

In [None]:
def is_k_anonymous(df, partition, sensitive_column, k=20):
    """
    Tell whether a partition is large enough to be k-anonymous.

    `df` and `sensitive_column` are not read; they are accepted so the
    function can be called with the same arguments as the other checks.

    :param               df: The dataframe on which to check the partition.
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column
    :param                k: The desired k
    :returns               : True if the partition holds at least `k` entries, False otherwise.
    """
    return len(partition) >= k
# Partition the data using only the k-anonymity check, then list the
# per-partition counts ordered by age.
finished_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    is_k_anonymous,
)
dfn_ad_count = count_anonymity(
    df, finished_partitions, feature_columns, sensitive_column
)
sort_dfn_cnt = sorted(dfn_ad_count, key=lambda row: row['age'])
sort_dfn_cnt[:5]

[{'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': '?',
  'count': 29},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Adm-clerical',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Craft-repair',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Farming-fishing',
  'count': 4},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Handlers-cleaners',
  'count': 8}]

In [None]:
# Anonymized data
dfad_cnt_k20 = pd.DataFrame(sort_dfn_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dfad_cnt_k20, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dfad_cnt_k20.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    758   756   757
2   1311  1308  1306
3     -1    -1    -1
4   4309  4307  4308
5     -1    -1    -1
6    449   441   440
7     -1    -1    -1
8     -1    -1    -1
9    168   169   171
10   184   186   183
18


K=20;L=2

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=2):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data
dflad_cnt_k20l2 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k20l2, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k20l2.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    758   756   757
2   1306  1307  1310
3     -1    -1    -1
4   4308  4306  4307
5     -1    -1    -1
6    443   450   449
7     -1    -1    -1
8     -1    -1    -1
9    174   172   171
10   182   185   184
18


k=20;L=4

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=4):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data
dflad_cnt_k20l4 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k20l4, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k20l4.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    759   756   757
2   1308  1311  1310
3     -1    -1    -1
4   4307  4305  4306
5     -1    -1    -1
6    442   446   443
7     -1    -1    -1
8     -1    -1    -1
9    171   168   169
10   183   185   184
18


K=20;L=12

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=12):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data. Stored under its own name (K=20, L=12): writing it to
# `dflad_cnt_k20l4`, as this cell used to, overwrote the K=20, L=4 table.
dflad_cnt_k20l12 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k20l12, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k20l12.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

      0    1    2
0    98  115   99
1   403  449  402
2   827  824  825
3    98  115   99
4    98  115   99
5    98  115   99
6   235  234  233
7    98  115   99
8    98  115   99
9    79   77   78
10   98  115   99
12


K=20;p=0.15

In [None]:
# Relative frequency of every sensitive-column value over the whole dataframe.
total_count = float(len(df))
group_counts = df.groupby(sensitive_column)[sensitive_column].agg('count')
global_freqs = {
    value: count / total_count
    for value, count in group_counts.to_dict().items()
}

def t_closeness(df, partition, column, global_freqs):
    """
    Return the largest absolute gap between a value's relative frequency
    inside the partition and its global frequency.

    Every value listed in `global_freqs` takes part in the comparison: a value
    that does not occur in the partition has a partition frequency of 0, so
    its gap equals its global frequency.

    :param           df: The dataframe that contains the partition
    :param    partition: The row labels (as accepted by `df.loc`) of the partition
    :param       column: The name of the column whose distribution is compared
    :param global_freqs: Mapping from each value of `column` to its global frequency
    :returns           : The maximum absolute frequency gap, or None if there is nothing to compare
    :raises    KeyError: If the partition holds a value that is missing from `global_freqs`
    """
    total_count = float(len(partition))
    if total_count == 0:
        # An empty partition has no distribution to compare.
        return None
    group_counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    # The union keeps the KeyError for partition values unknown to
    # global_freqs while also covering values absent from the partition.
    values = set(group_counts) | set(global_freqs)
    return max(
        (abs(group_counts.get(value, 0) / total_count - global_freqs[value])
         for value in values),
        default=None,
    )

def is_t_close(df, partition, sensitive_column, global_freqs, p=0.15):
    """
    Tell whether a partition's sensitive-value distribution stays within
    distance `p` of the global distribution.

    :param               df: The dataframe for which to check t-closeness
    :param        partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param     global_freqs: The global frequencies of the sensitive attribute values
    :param                p: The maximum allowed distance, as measured by `t_closeness`
    :returns               : True if the `t_closeness` distance is at most `p`, False otherwise.
    :raises      ValueError: If `sensitive_column` is not listed in `categorical`
    """
    if sensitive_column not in categorical:
        raise ValueError("this method only works for categorical values")
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    return distance <= p

# Partition the data so that every partition passes both the k-anonymity and
# the t-closeness check that are currently defined.
finished_t_close_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_t_close(*args, global_freqs),
)
dfp_ad_count = count_anonymity(
    df, finished_t_close_partitions, feature_columns, sensitive_column
)
sort_dfp_cnt = sorted(dfp_ad_count, key=lambda row: row['age'])
# Anonymized data
dfpad_cnt_k20p02 = pd.DataFrame(sort_dfp_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]

# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]
rl, ab = attack(dfpad_cnt_k20p02, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dfpad_cnt_k20p02.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     83    93    92
1    352   349   351
2    666   669   667
3     83    93    92
4   2393  2395  2394
5     83    93    92
6    337   339   345
7     83    93    92
8     83    93    92
9     43    41    42
10    83    93    92
15


K=50

In [None]:
def is_k_anonymous(df, partition, sensitive_column, k=50):
    """
    Tell whether a partition is large enough to be k-anonymous.

    `df` and `sensitive_column` are not read; they are accepted so the
    function can be called with the same arguments as the other checks.

    :param               df: The dataframe on which to check the partition.
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column
    :param                k: The desired k
    :returns               : True if the partition holds at least `k` entries, False otherwise.
    """
    return len(partition) >= k
# Partition the data using only the k-anonymity check, then list the
# per-partition counts ordered by age.
finished_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    is_k_anonymous,
)
dfn_ad_count = count_anonymity(
    df, finished_partitions, feature_columns, sensitive_column
)
sort_dfn_cnt = sorted(dfn_ad_count, key=lambda row: row['age'])
sort_dfn_cnt[:5]

[{'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': '?',
  'count': 29},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Adm-clerical',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Craft-repair',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Farming-fishing',
  'count': 4},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Handlers-cleaners',
  'count': 8}]

In [None]:
# Anonymized data
dfad_cnt_k50 = pd.DataFrame(sort_dfn_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dfad_cnt_k50, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dfad_cnt_k50.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    451   452   450
2    886   889   888
3     -1    -1    -1
4   2756  2754  2755
5     -1    -1    -1
6    429   416   428
7     -1    -1    -1
8     -1    -1    -1
9    114   132   108
10  3461  3459  3460
18


K=50;l=2

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=2):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data
dflad_cnt_k50l2 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k50l2, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k50l2.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    451   452   450
2    886   889   888
3     -1    -1    -1
4   2756  2754  2755
5     -1    -1    -1
6    429   416   428
7     -1    -1    -1
8     -1    -1    -1
9    114   132   108
10  3461  3459  3460
18


K=50;L=4

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=4):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data
dflad_cnt_k50l4 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k50l4, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k50l4.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     -1    -1    -1
1    452   463   451
2    888   894   889
3     -1    -1    -1
4   2763  2761  2762
5     -1    -1    -1
6    410   407   406
7     -1    -1    -1
8     -1    -1    -1
9    112   138   111
10  3468  3466  3467
18


k=50;L=12

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=12):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data
dflad_cnt_k50l12 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k50l12, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k50l12.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

      0    1    2
0    91   98   77
1   405  418  394
2   757  751  752
3    91   98   77
4    91   98   77
5    91   98   77
6   234  238  233
7    91   98   77
8    91   98   77
9    82   90   77
10   91   98   77
12


k=50;p=0.15

In [None]:
# Relative frequency of every sensitive-column value over the whole dataframe.
total_count = float(len(df))
group_counts = df.groupby(sensitive_column)[sensitive_column].agg('count')
global_freqs = {
    value: count / total_count
    for value, count in group_counts.to_dict().items()
}

def t_closeness(df, partition, column, global_freqs):
    """
    Return the largest absolute gap between a value's relative frequency
    inside the partition and its global frequency.

    Every value listed in `global_freqs` takes part in the comparison: a value
    that does not occur in the partition has a partition frequency of 0, so
    its gap equals its global frequency.

    :param           df: The dataframe that contains the partition
    :param    partition: The row labels (as accepted by `df.loc`) of the partition
    :param       column: The name of the column whose distribution is compared
    :param global_freqs: Mapping from each value of `column` to its global frequency
    :returns           : The maximum absolute frequency gap, or None if there is nothing to compare
    :raises    KeyError: If the partition holds a value that is missing from `global_freqs`
    """
    total_count = float(len(partition))
    if total_count == 0:
        # An empty partition has no distribution to compare.
        return None
    group_counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    # The union keeps the KeyError for partition values unknown to
    # global_freqs while also covering values absent from the partition.
    values = set(group_counts) | set(global_freqs)
    return max(
        (abs(group_counts.get(value, 0) / total_count - global_freqs[value])
         for value in values),
        default=None,
    )

def is_t_close(df, partition, sensitive_column, global_freqs, p=0.15):
    """
    Tell whether a partition's sensitive-value distribution stays within
    distance `p` of the global distribution.

    :param               df: The dataframe for which to check t-closeness
    :param        partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param     global_freqs: The global frequencies of the sensitive attribute values
    :param                p: The maximum allowed distance, as measured by `t_closeness`
    :returns               : True if the `t_closeness` distance is at most `p`, False otherwise.
    :raises      ValueError: If `sensitive_column` is not listed in `categorical`
    """
    if sensitive_column not in categorical:
        raise ValueError("this method only works for categorical values")
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    return distance <= p

# Partition the data so that every partition passes both the k-anonymity and
# the t-closeness check that are currently defined.
finished_t_close_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_t_close(*args, global_freqs),
)
dfp_ad_count = count_anonymity(
    df, finished_t_close_partitions, feature_columns, sensitive_column
)
sort_dfp_cnt = sorted(dfp_ad_count, key=lambda row: row['age'])
# Anonymized data
dfpad_cnt_k50p02 = pd.DataFrame(sort_dfp_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]

# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]
rl, ab = attack(dfpad_cnt_k50p02, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dfpad_cnt_k50p02.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0    121   119   120
1    349   310   311
2    586   599   572
3    121   119   120
4   1915  1913  1914
5    121   119   120
6    307   296   297
7    121   119   120
8    121   119   120
9     56    54    55
10   121   119   120
15


K=100

In [None]:
def is_k_anonymous(df, partition, sensitive_column, k=100):
    """
    Tell whether a partition is large enough to be k-anonymous.

    `df` and `sensitive_column` are not read; they are accepted so the
    function can be called with the same arguments as the other checks.

    :param               df: The dataframe on which to check the partition.
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column
    :param                k: The desired k
    :returns               : True if the partition holds at least `k` entries, False otherwise.
    """
    return len(partition) >= k
# Partition the data using only the k-anonymity check, then list the
# per-partition counts ordered by age.
finished_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    is_k_anonymous,
)
dfn_ad_count = count_anonymity(
    df, finished_partitions, feature_columns, sensitive_column
)
sort_dfn_cnt = sorted(dfn_ad_count, key=lambda row: row['age'])
sort_dfn_cnt[:5]

[{'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': '?',
  'count': 29},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Adm-clerical',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Craft-repair',
  'count': 2},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Farming-fishing',
  'count': 4},
 {'age': 17.0,
  'education': '10th',
  'marital-status': 'Never-married',
  'race': 'White',
  'occupation': 'Handlers-cleaners',
  'count': 8}]

In [None]:
# Anonymized data
dfad_cnt_k100 = pd.DataFrame(sort_dfn_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dfad_cnt_k100, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dfad_cnt_k100.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

      0    1    2
0    76  126  114
1   353  355  354
2   593  591  592
3    76  126  114
4    76  126  114
5    76  126  114
6   326  315  320
7    76  126  114
8    76  126  114
9    95   94   93
10    8   10    9
15


k=100;L=2

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=2):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data
dflad_cnt_k100l2 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k100l2, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k100l2.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

      0    1    2
0    76  126  114
1   353  355  354
2   593  591  592
3    76  126  114
4    76  126  114
5    76  126  114
6   326  315  320
7    76  126  114
8    76  126  114
9    95   94   93
10    8   10    9
15


K=100;L=4

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=4):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data
dflad_cnt_k100l4 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k100l4, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k100l4.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

      0    1    2
0    76  126  114
1   353  355  354
2   593  591  592
3    76  126  114
4    76  126  114
5    76  126  114
6   326  315  320
7    76  126  114
8    76  126  114
9    95   94   93
10    8   10    9
15


K=100;L=12

In [None]:
def is_l_diverse(df, partition, sensitive_column, l=12):
    """
    Tell whether a partition reaches the required diversity.

    :param               df: The dataframe that contains the partition
    :param        partition: The partition of the dataframe to check
    :param sensitive_column: The name of the sensitive column
    :param                l: The minimum diversity the partition must reach
    :returns               : True if the value reported by `diversity` is at least `l`, False otherwise.
    """
    observed = diversity(df, partition, sensitive_column)
    return observed >= l
# Partition the data so that every partition passes both the k-anonymity and
# the l-diversity check that are currently defined.
finished_l_diverse_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_l_diverse(*args),
)
dfl_ad_count = count_anonymity(
    df, finished_l_diverse_partitions, feature_columns, sensitive_column
)
sort_dfl_cnt = sorted(dfl_ad_count, key=lambda row: row['age'])
# Anonymized data
dflad_cnt_k100l12 = pd.DataFrame(sort_dfl_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]
# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]

rl, ab = attack(dflad_cnt_k100l12, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dflad_cnt_k100l12.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

      0    1    2
0    96  122   66
1   335  337  336
2   562  563  561
3    96  122   66
4    96  122   66
5    96  122   66
6   313  317  315
7    96  122   66
8    96  122   66
9    79   77   78
10   96  122   66
12


K=100;p=0.15

In [None]:
# Relative frequency of every sensitive-column value over the whole dataframe.
total_count = float(len(df))
group_counts = df.groupby(sensitive_column)[sensitive_column].agg('count')
global_freqs = {
    value: count / total_count
    for value, count in group_counts.to_dict().items()
}

def t_closeness(df, partition, column, global_freqs):
    """
    Return the largest absolute gap between a value's relative frequency
    inside the partition and its global frequency.

    Every value listed in `global_freqs` takes part in the comparison: a value
    that does not occur in the partition has a partition frequency of 0, so
    its gap equals its global frequency.

    :param           df: The dataframe that contains the partition
    :param    partition: The row labels (as accepted by `df.loc`) of the partition
    :param       column: The name of the column whose distribution is compared
    :param global_freqs: Mapping from each value of `column` to its global frequency
    :returns           : The maximum absolute frequency gap, or None if there is nothing to compare
    :raises    KeyError: If the partition holds a value that is missing from `global_freqs`
    """
    total_count = float(len(partition))
    if total_count == 0:
        # An empty partition has no distribution to compare.
        return None
    group_counts = df.loc[partition].groupby(column)[column].agg('count').to_dict()
    # The union keeps the KeyError for partition values unknown to
    # global_freqs while also covering values absent from the partition.
    values = set(group_counts) | set(global_freqs)
    return max(
        (abs(group_counts.get(value, 0) / total_count - global_freqs[value])
         for value in values),
        default=None,
    )

def is_t_close(df, partition, sensitive_column, global_freqs, p=0.15):
    """
    Tell whether a partition's sensitive-value distribution stays within
    distance `p` of the global distribution.

    :param               df: The dataframe for which to check t-closeness
    :param        partition: The partition of the dataframe on which to check t-closeness
    :param sensitive_column: The name of the sensitive column
    :param     global_freqs: The global frequencies of the sensitive attribute values
    :param                p: The maximum allowed distance, as measured by `t_closeness`
    :returns               : True if the `t_closeness` distance is at most `p`, False otherwise.
    :raises      ValueError: If `sensitive_column` is not listed in `categorical`
    """
    if sensitive_column not in categorical:
        raise ValueError("this method only works for categorical values")
    distance = t_closeness(df, partition, sensitive_column, global_freqs)
    return distance <= p

# Partition the data so that every partition passes both the k-anonymity and
# the t-closeness check that are currently defined.
finished_t_close_partitions = partition_dataset(
    df,
    feature_columns,
    sensitive_column,
    full_spans,
    lambda *args: is_k_anonymous(*args) and is_t_close(*args, global_freqs),
)
dfp_ad_count = count_anonymity(
    df, finished_t_close_partitions, feature_columns, sensitive_column
)
sort_dfp_cnt = sorted(dfp_ad_count, key=lambda row: row['age'])
# Anonymized data. Stored under its own name (K=100): writing it to
# `dfpad_cnt_k50p02`, as this cell used to, overwrote the K=50 table.
dfpad_cnt_k100p02 = pd.DataFrame(sort_dfp_cnt).loc[:, ['age', 'education', 'marital-status', 'race']]

# this is attackers knowledge
knowledge = dfa_attack.loc[:, ['age', 'education', 'marital-status', 'race']]
rl, ab = attack(dfpad_cnt_k100p02, knowledge)
print(rl)

# Tally column-level agreements: for each of the 11 knowledge rows and each of
# the 3 entries ab[0][rank][person], compare the first four feature columns of
# the referenced anonymized row with the knowledge row.
link_cnt = 0
for person in range(11):
    for rank in range(3):
        match = ab[0][rank][person]
        if match == -1:
            # NOTE(review): -1 presumably means "no candidate row" -- confirm
            # against attack().
            continue
        for col_idx in range(4):
            column = feature_columns[col_idx]
            if dfpad_cnt_k100p02.loc[match][column] == knowledge.loc[person][column]:
                link_cnt += 1

print(link_cnt)

       0     1     2
0     72    97    87
1    254   245   246
2    441   434   446
3     72    97    87
4   1374  1372  1373
5     72    97    87
6    235   232   231
7     72    97    87
8     72    97    87
9     47    46    50
10    72    97    87
15
