# Anonymize data with tabular anonymizer

Import libraries.

In [None]:
from tabular_anonymizer import DataFrameAnonymizer
import pandas as pd
!pip install matplotlib
import matplotlib.pyplot as plt

Read data. The dataset is the Adult Census data. The label is income information, where 0 corresponds to < 50K $ per year and 1 corresponds to >= 50K $ per year.

In [None]:
file1 = "./data/adult.csv"
# The first CSV column is read as the index (index_col=0) ...
df = pd.read_csv(file1, sep=",", index_col=0)
# ... and then replaced by a clean 0..N-1 index. reset_index returns a new
# frame instead of modifying df in place, so its result has to be assigned.
df = df.reset_index(drop=True)
df.head()


For the purposes of this demonstration, we drop some columns and rows.

In [None]:
# Restrict the demo to 1000 rows: .loc slices by label and includes the end
# label, so labels 0..999 are kept.
last_label = 999
df = df.loc[0:last_label]
df.shape

In [None]:
# Remove the columns that are left out of this demonstration
unused_columns = ["fnlwgt", "education-num", "occupation", "race", "native-country"]
df = df.drop(unused_columns, axis="columns")
df.head(10)

In [None]:
# Sensitive attributes: columns you do not want the anonymizer to alter
sensitive_columns = ["label"]

# Columns that are generalized with an average instead of an interval
avg_columns = ["age"]

# The k passed to anonymize_k_anonymity
k = 10

Run the anonymizer.

In [None]:
# Report the configuration, then build the anonymizer for the reduced dataframe
banner_parts = ("Run tabular_anonymizer. Sensitive columns: ", sensitive_columns, ", k=", k)
print(*banner_parts)
p = DataFrameAnonymizer(df, sensitive_columns, avg_columns=avg_columns)

In [None]:
# Run the anonymization with the chosen k. The result is a new anonymized
# dataframe; later cells read a `label_count` column from it.
df_anonymized = p.anonymize_k_anonymity(k=k)
# Preview the first rows of the anonymized data
df_anonymized.head()

Write anonymized dataframe to file.

If you want to open the files in Excel, use a separator other than ",", for example ";".

In [None]:
# Write both frames without the index column:
#   - the reduced original data
#   - the anonymized data
# (for Excel, a different separator can be passed, e.g. sep=";")
csv_outputs = (
    (df, "./data/adult-small.csv"),
    (df_anonymized, "./data/adult-anonymized.csv"),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)

## Example: Generating semi-synthetic tidy data from the Mondrian k-anonymized data

You may have noticed that the Mondrian k-anonymized data is no longer tidy (in tidy data, each row contains one observation and each cell contains a single value).
Untidy data is difficult to utilize for analysis and ML.
Let's generate tidy samples from the anonymized dataset by bootstrapping.
Note that this data is now synthetic: the data points no longer link to individuals. However, the dataset should have statistical properties similar to those of the original one.

In [None]:
import numpy as np

# Generate tidy, synthetic rows from the anonymized dataframe.
# Every anonymized row yields int(label_count * bootstrap_c) new rows, so the
# amount of generated data is roughly bootstrap_c * N.
bootstrap_c = 1  # constant for tuning bootstrapping
replace_nan_with = 0  # fallback for cells that cannot be interpreted

# How each column of the original dataframe is turned back into a single value
float_range = []  # "low-high" cells, sampled as a float
int_range = ['capital-gain', 'capital-loss', 'hours-per-week']  # "low-high" cells, sampled as an integer
numeric = ['label', 'age']  # taken over as-is
category_list = ['workclass', 'education', 'marital-status', 'relationship', 'sex']  # "a,b,c" cells
colnames = df.columns

generated_rows = []
# iterate through anonymized data
for _, row in df_anonymized.iterrows():
    # generate new samples from each row; tune bootstrap_c for bootstrapping
    for _ in range(int(row['label_count'] * bootstrap_c)):
        new_row = {}  # new sample
        for colname in colnames:
            value = replace_nan_with
            if colname in numeric:
                # just take the value as-is
                value = row[colname]
            elif colname in float_range:
                # Parse as float so non-integer bounds are accepted too
                bounds = np.array(row[colname][0].split('-')).astype(float)
                if bounds.shape[0] == 2:
                    value = np.random.uniform(bounds[0], bounds[1])
                elif bounds.shape[0] == 1:
                    # A single number instead of "low-high": use it directly
                    # (presumably a group where all values coincide - confirm
                    # against the anonymizer's output format)
                    value = bounds[0]
            elif colname in int_range:
                bounds = np.array(row[colname][0].split('-')).astype(int)
                if bounds.shape[0] == 2:
                    # randint excludes its upper argument, hence +1: the upper
                    # bound of the interval can be sampled, and an interval
                    # with low == high no longer fails on an empty range
                    value = np.random.randint(bounds[0], bounds[1] + 1)
                elif bounds.shape[0] == 1:
                    # A single number instead of "low-high": use it directly
                    # rather than replacing it with the fallback
                    value = bounds[0]
            elif colname in category_list:
                # sample from the list of alternatives; str.split always
                # returns at least one element, so no emptiness check is needed
                value = np.random.choice(row[colname][0].split(','))
            new_row[colname] = value
        generated_rows.append(new_row)

# Build the frame with the anonymized column layout; the helper column
# 'label_count' has no generated values and is dropped again.
df_gen = pd.DataFrame(generated_rows, columns=df_anonymized.columns).drop('label_count', axis=1)
# Shuffle the generated rows
df_gen = df_gen.sample(frac=1)

# Exercise:
# You can try plotting the old and new data
# what happens to the data quality?


In [None]:
df_gen

## Exercises: Anonymization and data quality

1. Try plotting the data in a representative way before (df) and after anonymization & bootstrapping (df_gen). Try changing the setup: N, k, bootstrap_c, numerical feature treatment (avg / range). What happens?

2. Try fitting some ML model to the data or part of it before and after anonymization & bootstrapping. What happens?

3. Can you explain how anonymization can affect data & model quality?


### 1. Plotting the data (some examples)

In [None]:
def plot_sorted_line(ori, gen, column_name, ax):
    """
    Line plot of one column's sorted values: original data vs. data
    generated from the anonymized data.

    ori: dataframe with the original data
    gen: dataframe with the generated data
    column_name: column to plot; also used as the legend title
    ax: axes to draw on

    Returns the axes that were passed in.
    """
    # Plot the frames given as arguments (not the notebook-level globals), so
    # the function works for any pair of dataframes.
    # original
    ori[column_name].sort_values().reset_index(drop=True).plot(ax=ax, label='original')
    # anonymized
    gen[column_name].sort_values().reset_index(drop=True).plot(ax=ax, label='anonymized')
    ax.legend(title=column_name)
    return ax

def plot_scatter(ori, gen, xcol, ycol, ax):
    """
    Scatter plot of two columns: original data vs. data generated from the
    anonymized data.

    ori: dataframe with the original data (blue circles)
    gen: dataframe with the generated data (red crosses)
    xcol: column shown on the x axis
    ycol: column shown on the y axis
    ax: axes to draw on

    Returns the axes that were passed in.
    """
    # Plot the frames given as arguments (not the notebook-level globals), so
    # the function works for any pair of dataframes.
    # original
    ori[[xcol, ycol]].plot(ax=ax, x=xcol, y=ycol, kind='scatter', label='original',
                           color='b', marker='o', alpha=0.5)
    # anonymized
    gen[[xcol, ycol]].plot(ax=ax, x=xcol, y=ycol, kind='scatter', label='anonymized',
                           color='r', marker='x', alpha=0.5)
    ax.legend(title=xcol + '-' + ycol)
    return ax

In [None]:
# 2 x 2 figure with one sorted-value panel per numeric column
nrow, ncol = 2, 2
fig, axs = plt.subplots(nrow, ncol, figsize=(ncol * 8, nrow * 5), constrained_layout=True)
(ax_loss, ax_gain), (ax_hours, ax_age) = axs

# top row: capital-loss and capital-gain
plot_sorted_line(df, df_gen, 'capital-loss', ax=ax_loss)
plot_sorted_line(df, df_gen, 'capital-gain', ax=ax_gain)

# bottom row: hours-per-week and age
plot_sorted_line(df, df_gen, 'hours-per-week', ax=ax_hours)
plot_sorted_line(df, df_gen, 'age', ax=ax_age)

In [None]:
# 4 x 4 grid of pairwise scatter plots; the diagonal (a column against
# itself) is left empty.
nrow = ncol = 4
fig, axs = plt.subplots(nrow, ncol, figsize=(ncol * 4, nrow * 4), constrained_layout=True)
plot_by = ['age', 'hours-per-week', 'capital-loss', 'capital-gain']

for grid_row, x_name in enumerate(plot_by):
    for grid_col, y_name in enumerate(plot_by):
        if grid_row != grid_col:
            plot_scatter(df, df_gen, x_name, y_name, ax=axs[grid_row, grid_col])

