In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential

from minisom import MiniSom

import time
import pprint

In [2]:
class Data:
    # Load Adult dataset and seperate to features(X) and target(y)
    def __init__(self, path='data/adult.csv'):
        df = shuffle(pd.read_csv(path))
        df = self.clean(df)

        self.y = df.pop('income')
        self.X = df
        
        # Label encode y
        self.y_encoder = LabelEncoder()
        self.y = self.y_encoder.fit_transform(self.y)
        
        # One Hot encode X
        self.X = pd.get_dummies(self.X)
        
        for name in self.X.columns:
            if self.X[name].dtype == 'object':
                self.X[name] = self.X[name].astype('category')
    
    def clean(self, df):
        return df.replace('?', np.nan).dropna().drop('fnlwgt', axis=1)


    def train_test_split(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2)
        y_train = pd.Series(y_train, index=X_train.index)
        y_test = pd.Series(y_test, index=X_test.index)
        return (X_train, X_test, y_train, y_test)


In [61]:
class TrainingModel:
    def __init__(self, input_shape):
        self.model = Sequential()
        self.model.add(Dense(64, activation='relu', input_shape=input_shape))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(128, activation='relu'))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(128, activation='relu'))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

    def fit(self, data, label):
        self.model.fit(data, label, epochs=1, batch_size=128, verbose=0)

    def predict(self, data):
        return self.model.predict_classes(data)
    
    def evaluate(self, X_test, y_test, print_report=True):
        y_predicted = self.predict(X_test)
        y_predicted_probs = self.model.predict_proba(X_test)
        if print_report:
            self.print_report(y_test, y_predicted, y_predicted_probs)
        else:
            accuracy = accuracy_score(y_test, y_predicted)
            report = classification_report(y_test, y_predicted, output_dict=True)
            auc_score = roc_auc_score(y_test, y_predicted_probs)
            matrix = confusion_matrix(y_test, y_predicted)

            return {
                'accuracy': accuracy,
                'auc_score': auc_score,
                **report['weighted avg'],
            }

    def print_report(self, test, predicted, predicted_probs):
        accuracy = accuracy_score(test, predicted)
        report = classification_report(test, predicted)
        matrix = confusion_matrix(test, predicted)

        print('Accuracy score: {:.5f}'.format(accuracy))
        print('-' * 20)
        print('Confusion Matrix:')
        print(matrix)
        print('-' * 20)
        print(report)
        print('-' * 20)
        print('AUC score: {:.5f}'.format(roc_auc_score(test, predicted_probs)))


In [22]:
class Perturbator:
    def perturbate(self):
        pass

In [55]:
class TrainRunner:
    def __init__(self, name, perturbators=[]):
        self.perturbators = perturbators
        self.model = None
        self.name = name

    def __str__(self):
        return self.name

    def preprocess(self, X, y):
        for perturbator in self.perturbators:
            X = perturbator.perturbate(X)
        X, y = X.align(y, join='inner', axis=0)
        return (X, y)

    def fit(self, X, y):
        shape = (X.shape[1], )
        self.model = TrainingModel(shape)
        self.model.fit(X, y)

    def evaluate(self, X, y, print_report=False):
        if self.model is not None:
            report = self.model.evaluate(X, y, print_report=print_report)
            return report
        else:
            raise Exception('Must call fit before evaluate a model.')

In [44]:
def timeit(func, *args, **kwargs):
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    return (result, end_time - start_time)

In [63]:
def profile(runners, X_train, y_train, X_test, y_test):
    runners_profile = {}
    for runner in runners:
        result, preprocess_time = timeit(runner.preprocess, X_train, y_train)
        X_preprocessed, y_preprocessed = result
        runner.fit(X_preprocessed, y_preprocessed)
        report = runner.evaluate(X_test, y_test)
        runner_profile = {
            **report,
            'preprocessed_time': preprocess_time
        }
        runners_profile[str(runner)] = runner_profile
    return runners_profile

In [24]:
class Mondrian(Perturbator):
    def __init__(self, quasi_identifiers):
        self.quasi_identifiers = quasi_identifiers
        
    def is_categorical(self, df, column):
        return str(df[column].dtype) == 'category'

    def partite(self, df, partition, column):
        """partite the df into two partitions.

            returns: A tuple of df index.
        """
        df_partition = df[column][partition]
        if self.is_categorical(df, column):
            values = list(df_partition.unique())
            left = df_partition.isin(values[:len(values) // 2])
            right = df_partition.isin(values[len(values) // 2:])
            return (df_partition[left].index, df_partition[right].index)
        else:
            median = df_partition.median()
            return (df_partition[df_partition < median].index, df_partition[df_partition >= median].index)

    def get_spans(self, df, partition):
        """get each column's span
        """
        span = {}
        for column in self.quasi_identifiers:
            df_partition = df[column][partition]
            if self.is_categorical(df, column):
                span[column] = len(df_partition.unique())
            else:
                span[column] = df_partition.max() - df_partition.min()
        return sorted(span.items(), key=lambda x: x[1], reverse=True)

    def validate(self, df):
        pass

    def split(self, df):
        wip_partitions = [df.index]
        finished_partitions = []

        while len(wip_partitions) > 0:
            partition = wip_partitions.pop(0)
            for column, _ in self.get_spans(df, partition):
                lp, rp = self.partite(df, partition, column)

                # If either left part or right part cannot satisfied the K-anonymous condition
                # cancel the partion and try next column.
                if not self.validate(df.loc[lp]) or not self.validate(df.loc[rp]):
                    continue

                # If the partition is valid, continue to try next partition.
                wip_partitions.append(lp)
                wip_partitions.append(rp)
                break
            else:
                # If the partition cannot be partited anymore, put it into finished_partitions array.
                finished_partitions.append(partition)
        return finished_partitions

    def build_dataset(self, df, partitions):
        dfs = []
        for partition in partitions:
            dfp = df.loc[partition]
            for column in self.quasi_identifiers:
                if dfp[column].dtype == 'int64':
                    dfp[column] = dfp[column].mean()
                if str(dfp[column].dtype) == 'category':
                    dfp[column] = ','.join(list(dfp[column].unique()))
            dfs.append(dfp)
        return pd.concat(dfs)
    
    def perturbate(self, df):
        partitions = self.split(df)
        return self.build_dataset(df, partitions)

In [25]:
class K_Anonymity(Mondrian):
    def __init__(self, quasi_identifiers, k):
        self.k_anonymity = k
        super().__init__(quasi_identifiers)

    def is_k_anonymous(self, partition):
        return not (partition.shape[0] < self.k_anonymity)

    def validate(self, df):
        return self.is_k_anonymous(df)
    
    def __str__(self):
        return 'K Anonymity - {}'.format(self.k_anonymity)

In [8]:
data = Data()

In [9]:
X_train_origin, X_test_origin, y_train_origin, y_test_origin = data.train_test_split()

In [168]:
runners = []

# Benchmark before perturbation

In [169]:
original_runner = TrainRunner('Original')

In [170]:
runners.append(original_runner)

# K Anonymous(KD Tree)

In [171]:
quasi_identifiers = ['age', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [172]:
sizes = [5, 10]

In [173]:
for size in sizes:
    runner = TrainRunner('K Anonymous(k={})'.format(size), [K_Anonymity(quasi_identifiers, size)])
    runners.append(runner)

# KD Tree + SOM

In [174]:
class SOM_K_Anonymity(K_Anonymity):
    def __init__(self, quasi_identifiers, k, som_size=(150, 150)):
        self.som_columns = quasi_identifiers
        self.som_size = som_size
        super().__init__(['x axis', 'y axis'], k)

    def perturbate(self, df):
        df_som = df[self.som_columns]
        som = MiniSom(self.som_size[0], self.som_size[1], df_som.shape[1])
        som.train_random(df_som.values, 10)
        coordinates = [som.winner(series.to_numpy()) for index, series in df_som.iterrows()]
        df_coordinates = pd.DataFrame(coordinates, index=df.index, columns=['x axis', 'y axis'])
        df = pd.concat([df, df_coordinates], axis=1)
        df = super().perturbate(df)
        df.drop(['x axis', 'y axis'], axis=1, inplace=True)
        return df

In [175]:
for size in sizes:
    runner = TrainRunner('SOM KDTree(k={})'.format(size), [SOM_K_Anonymity(quasi_identifiers, size)])
    runners.append(runner)

# Run All

In [176]:
pf = profile(runners, X_train_origin, y_train_origin, X_test_origin, y_test_origin)



  'precision', 'predicted', average, warn_for)




In [177]:
pd.DataFrame.from_dict(pf)

Unnamed: 0,Original,K Anonymous(k=5),K Anonymous(k=10),SOM KDTree(k=5),SOM KDTree(k=10)
accuracy,0.777667,0.773245,0.748038,0.772029,0.800995
auc_score,0.625664,0.600217,0.364495,0.822698,0.837554
f1-score,0.759078,0.736292,0.640215,0.743902,0.793218
precision,0.7573,0.748139,0.55956,0.746755,0.79032
preprocessed_time,0.002001,31.787972,22.445858,24.051003,24.318004
recall,0.777667,0.773245,0.748038,0.772029,0.800995
support,9045.0,9045.0,9045.0,9045.0,9045.0
