# Anonymize data with tabular anonymizer

Import libraries.

In [None]:
from tabular_anonymizer import DataFrameAnonymizer
import pandas as pd

Read the data. The dataset is the Adult Census data. The label is income information, where 0 corresponds to an income of < 50K $ per year, and 1 corresponds to >= 50K $ per year.

In [None]:
# Load the Adult Census CSV; the first CSV column is consumed as the index.
file1 = "./data/adult.csv"
df = pd.read_csv(file1, sep=",", index_col=0)
# Replace the loaded index with a fresh 0..n-1 index. reset_index() returns a
# new dataframe, so the result has to be assigned back (calling it without
# assignment has no effect); drop=True discards the old index values.
df = df.reset_index(drop=True)
df.head()


For the purposes of this demonstration, we drop some columns and rows.

In [None]:
# Restrict the demo to the rows labelled 0 through 999
# (.loc label slices include both end points, so this is 1000 rows)
df = df.loc[:999, :]
df.shape

In [None]:
# Remove the columns that are not used in this demonstration
df = df.drop(
    ["fnlwgt", "education-num", "occupation", "race", "native-country"],
    axis="columns",
)
df.head()

In [None]:
# Sensitive attributes: columns you don't want the anonymizer to alter
sensitive_columns = ["label"]
# Columns that are generalized with an average instead of an interval
avg_columns = ["capital-gain", "capital-loss"]

# The k used for k-anonymity
k = 10

Run the anonymizer.

In [None]:
# Echo the configuration that is about to be used for the run.
print("Run tabular_anonymizer. Sensitive columns: ", sensitive_columns, ", k=", k)
# Build the anonymizer from the dataframe, the sensitive columns and the
# columns to be generalized with an average. Nothing is anonymized yet;
# the anonymization itself is triggered separately through `p`.
p = DataFrameAnonymizer(df, sensitive_columns, avg_columns=avg_columns)

In [None]:
# Run k-anonymization with the configured k; the result is kept as a new
# dataframe (`df` itself is still used afterwards as the original data).
df_anonymized = p.anonymize_k_anonymity(k=k)
# Show the first 20 anonymized rows.
df_anonymized.head(20)

Write anonymized dataframe to file.

If you want to open the files in Excel, use a separator other than ",", for example ";" (pass `sep=";"` to `to_csv`).

In [None]:
# Save the reduced original data (row index left out of the file)
df.to_csv(path_or_buf="./data/adult-small.csv", index=False)
# Save the anonymized data the same way; add sep=";" to the call
# if the file should be opened in Excel
df_anonymized.to_csv(path_or_buf="./data/adult-anonymized.csv", index=False)

## Example: Generating semi-synthetic tidy data from the Mondrian k-anonymized data

You may have noticed that the Mondrian k-anonymized data is no longer tidy (in tidy data, each row contains an observation and each cell contains a single value).
Untidy data is difficult to utilize for analysis and ML.
Let's generate tidy samples from the anonymized dataset by bootstrapping.
Note that this data is now synthetic: the data points no longer link to individuals. However, the dataset should have similar statistical properties to the original one.

In [None]:
# Generate tidy, semi-synthetic samples from the anonymized rows by bootstrapping.
# Each anonymized row is expanded into int(label_count * bootstrap_c) new rows.
import numpy as np  # needed for the random sampling below

bootstrap_c = 1  # constant for tuning bootstrapping
# Columns whose cells hold a value interval to sample from
value_range = ['age', 'hours-per-week']
# Columns whose values are copied as-is
numeric = ['capital-gain', 'capital-loss', 'label']
# Columns whose cells hold a list of alternatives to sample from
category_list = ['workclass', 'education', 'marital-status', 'relationship', 'sex']
colnames = df_anonymized.columns
df_gen = []  # generated rows; converted to a dataframe after the loop

# iterate through anonymized data
# just 10 rows for example, you can try increasing the generated data
for _, row in df_anonymized.head(10).iterrows():
    # generate new samples from each row
    # tune bootstrap_c for bootstrapping
    n_samples = int(row['label_count'] * bootstrap_c)
    for _ in range(n_samples):
        new_row = []  # new sample
        for colname in colnames:  # go through columns
            if colname in numeric:  # just take the value as-is
                value = row[colname]
            elif colname in value_range:  # sample from value range
                # assumes the cell's first element is a string like "25-30"
                # (that is how it is indexed and split here) - TODO confirm.
                # The bounds get their own name so that the `value_range`
                # column list is not overwritten while it is still needed.
                bounds = [int(part) for part in row[colname][0].split('-')]
                # bounds[-1] also covers a cell holding a single number
                low, high = bounds[0], bounds[-1]
                value = np.random.uniform(low, high)
            elif colname in category_list:  # sample from list of alternatives
                # assumes the first element is a comma-separated string - TODO confirm
                value = np.random.choice(row[colname][0].split(','))
            else:
                # No strategy defined for this column (define alternative
                # strategies here). Store an explicit missing value instead
                # of silently repeating the previous column's value.
                value = None
            new_row.append(value)
        df_gen.append(new_row)
df_gen = pd.DataFrame(df_gen, columns=colnames)
# Shuffle so that samples generated from the same anonymized row are not adjacent
df_gen = df_gen.sample(frac=1)

# Exercise:
# You can try plotting the old and new data
# what happens to the data quality?


## Exercises: Anonymization and data quality

1. Try plotting the data in a representative way before (df) and after anonymization & bootstrapping (df_gen). What happens?

2. Try fitting some ML model to the data or part of it before and after anonymization & bootstrapping. What happens?

3. Can you explain how anonymization can affect data & model quality?
