In [2]:
import sys
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

In [3]:
from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

In [4]:
import numbers
import json
#'pip install git+git://github.com/clintval/gender_predictor.git'
from gender_predictor import GenderPredictor
import itertools

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download +/- 800 MB of data into the `features/` directory.

In [5]:
# Fetch the raw/preprocessed data and the files needed for feature extraction.
# Both calls download large files (sizes are listed in the section text above).
helpers.download_data()
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Downloading data directory.
Downloading 1-g0zbKFAXz7zKZc0Dnh74uDBpZCv4YqU into ../data/data.zip... 
3.6 GiB iB                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Done.
Unzipping...Done.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.np

# Read input dataset

In [11]:
# Load the input dataset; the path is relative to the notebook's working directory.
# read_csv treats the first line as the header by default.
tmp_samples = pd.read_csv('adult.csv')

In [13]:
tmp_samples

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


# Pandas Data Profiling

In [47]:
# Build a pandas-profiling report object for the loaded dataset.
profile = ProfileReport(tmp_samples, title='Dataset Report', explorative=True)

In [49]:
# Export the profiling report as a JSON file next to the notebook.
profile.to_file("dataset_report.json")

HBox(children=(HTML(value='Render JSON'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




# Test whether the input dataset contains a header

In [17]:
def identify_header(path, n=5, th=0.9):
    """Guess whether the CSV file at ``path`` starts with a header row.

    The first rows are parsed twice: once with the first line treated as a
    header and once with it treated as data.  The share of columns whose
    inferred dtype is identical in both parses is compared with ``th``; a
    low share is taken as evidence that the first line is a header.

    Parameters
    ----------
    path : str
        Path of the CSV file (it is opened twice by ``pd.read_csv``).
    n : int, default 5
        Number of rows read for each parse.
    th : float, default 0.9
        Threshold on the share of matching dtypes; below it the file is
        considered to have a header.

    Returns
    -------
    bool
        ``True`` if a header row is detected, ``False`` otherwise.  When both
        parses infer the same dtype for every column (for example, a file
        containing only text columns) the result is ``False``.
    """
    with_header = pd.read_csv(path, header='infer', nrows=n)
    without_header = pd.read_csv(path, header=None, nrows=n)
    sim = (with_header.dtypes.values == without_header.dtypes.values).mean()
    # Always return a real bool; the earlier version returned True or None.
    return bool(sim < th)

In [18]:
# Truthy when the first line of the CSV looks like a header row.
header = identify_header('adult.csv', n=5, th=0.9)

In [19]:
# Header names (lower-cased) that are treated as sensitive attributes.
sensitive = ['age','area','nation','nationality','sex','gender','ethnicity','race']
# Column names are only meaningful when a header row was detected;
# otherwise start with an empty candidate list.
candidate = (
    [column for column in tmp_samples.columns if column.lower() in sensitive]
    if header
    else []
)

# Generate Header

In [20]:
# Convert input dataset to the required form:
# one row per column of tmp_samples. df_value will hold that column's values
# (as a string) and df_label its name; both frames start empty and are
# filled in by the next cell.
index_range = range(len(tmp_samples.columns))
df_value = pd.DataFrame(columns = ['value'],index = index_range)
df_label = pd.DataFrame(columns = ['label'],index = index_range)

In [21]:
# For every column store its distinct non-missing values (as one string)
# together with the column name.
for idx, column in enumerate(tmp_samples.columns):
    distinct = [v for v in tmp_samples[column].unique() if str(v) != 'nan']
    if not distinct:
        # No value in any cell of this attribute: fall back to the column name.
        distinct = [column]
    df_value.at[idx, 'value'] = str(distinct)
    df_label.at[idx, 'label'] = column

In [22]:
# Load Sherlock architecture and weights from files.
# The context manager closes the JSON file even if reading it fails.
with open('../models/sherlock_model.json', 'r') as model_file:
    sherlock_file = model_file.read()
sherlock = tf.keras.models.model_from_json(sherlock_file)

sherlock.load_weights('../models/sherlock_weights.h5')
# NOTE(review): the compiled `sherlock` model is not referenced by the later
# cells visible in this notebook; predict_sherlock() is called with nn_id only.
sherlock.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['categorical_accuracy'])

In [23]:
# Parse the stringified value lists in df_value['value'] and pair them with
# df_label['label'] (project helper; presumably returns the parsed samples and
# their labels in that order; confirm in sherlock.features.preprocessing).
test_samples_converted, y_test = convert_string_lists_to_lists(df_value, df_label,'value','label')

100%|██████████| 15/15 [00:00<00:00, 325.85it/s]


In [24]:
# Compute Sherlock input features for the converted columns
# (presumably one feature row per dataset column; TODO confirm).
X_test = extract_features(test_samples_converted)

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


In [25]:
# Predict a semantic type label for each column; nn_id names the stored
# model to use (presumably the files under ../models; TODO confirm).
predicted_labels = predict_sherlock(X_test, nn_id='sherlock')

In [26]:
# If there is no header, assign the generated header to the dataset
# NOTE(review): tmp_samples was loaded with read_csv's default header handling,
# so in the no-header case the first data row has already been consumed as
# column names; consider re-reading with header=None before renaming.
if not header:
    tmp_samples.columns = predicted_labels

In [27]:
# Filter out sensitive attributes:
# add every column whose *predicted* label is a sensitive type, even when the
# header name itself did not match. The earlier loop compared the whole
# (column, label) tuple against the label list, so it never matched anything.
for column, label in zip(tmp_samples.columns, predicted_labels):
    if label in ['country','sex','age'] and column not in candidate:
        candidate.append(column)

# Test whether the attribute is categorical

In [28]:
def get_var_category(series):
    """Classify a column by its dtype and cardinality.

    Parameters
    ----------
    series : pandas.Series
        The column to classify.

    Returns
    -------
    str
        'Numerical' for numeric dtypes, 'Date' for datetime64 dtypes
        (timezone-naive or timezone-aware), 'Text (Unique)' when every entry
        is distinct (a missing value counts as one distinct entry), and
        'Categorical' otherwise.
    """
    if pd.api.types.is_numeric_dtype(series):
        return 'Numerical'
    # is_datetime64_any_dtype also recognises timezone-aware columns, which
    # is_datetime64_dtype rejects.
    if pd.api.types.is_datetime64_any_dtype(series):
        return 'Date'
    # The unique count is only needed for non-numeric, non-date columns.
    if series.nunique(dropna=False) == len(series):
        return 'Text (Unique)'
    return 'Categorical'

In [33]:
def filter_categories(df):
    """Print the category of each column of ``df`` and register categorical ones.

    Every column classified as 'Categorical' by ``get_var_category`` is
    appended to the module-level ``candidate`` list unless it is already
    there.  One line per column is printed in the form ``name :  category``.
    """
    for column_name in df.columns:
        category = get_var_category(df[column_name])
        is_new_categorical = category == 'Categorical' and column_name not in candidate
        if is_new_categorical:
            candidate.append(column_name)
        print(column_name, ": ", category)

In [34]:
# Print each column's category and add the categorical ones to `candidate`.
filter_categories(tmp_samples)

age :  Numerical
workclass :  Categorical
fnlwgt :  Numerical
education :  Categorical
education.num :  Numerical
marital.status :  Categorical
occupation :  Categorical
relationship :  Categorical
race :  Categorical
sex :  Categorical
capital.gain :  Numerical
capital.loss :  Numerical
hours.per.week :  Numerical
native.country :  Categorical
income :  Categorical


In [36]:
def summary_age(x):
    """Count ages per bracket.

    Parameters
    ----------
    x : iterable of numbers
        The ages to bucket.

    Returns
    -------
    dict
        Counts keyed by '<16', '16-24', '25-34', '35-44', '45-54', '55-64'
        and '>64', in that order.  Upper bounds are inclusive; a value that
        satisfies none of the comparisons (e.g. NaN) is counted under '>64'.
    """
    # (inclusive upper bound, bracket label), checked in ascending order.
    upper_bounds = [
        (24, '16-24'),
        (34, '25-34'),
        (44, '35-44'),
        (54, '45-54'),
        (64, '55-64'),
    ]
    counts = {'<16': 0}
    counts.update((label, 0) for _, label in upper_bounds)
    counts['>64'] = 0

    for value in x:
        if value < 16:
            bracket = '<16'
        else:
            bracket = next(
                (label for bound, label in upper_bounds if value <= bound),
                '>64',
            )
        counts[bracket] += 1
    return counts

In [37]:
# Summarise every candidate (sensitive) column: ages are bucketed into
# brackets, all other columns get their value counts.
data_summary = []
for column in tmp_samples.columns:
    if column not in candidate:
        # Skip non-candidate columns. The append used to sit outside the
        # `if`, so these columns re-appended the previous summary (or raised
        # NameError when the first column was not a candidate).
        continue
    if column.lower() in ['age']:
        summary = summary_age(tmp_samples[column])
    else:
        summary = tmp_samples[column].value_counts()
    data_summary.append(summary)

# Next, check which additional sensitive information can be derived

##### First, predict gender from first names when the dataset has a name attribute but no gender attribute

In [39]:
# Train the first-name gender classifier; train_and_test() also prints the
# size of its name lists and the resulting accuracy (see the cell output).
gp = GenderPredictor()
gp.train_and_test()

import complete
32,031 male names
56,347 female names
classifier accuracy: 96.81%


In [41]:
# Detect whether the dataset has a name column and/or a gender column, using
# both the header names and the predicted labels.
name_labels = ['name','first_name','first name']
get_gender = get_name = False
name = None
for column, label in zip(tmp_samples.columns, predicted_labels):
    if column.lower() in name_labels or label.lower() in name_labels:
        get_name = True
        name = column
    if column.lower() in ['sex','gender'] or label.lower() in ['sex']:
        get_gender = True

# Only infer gender when there are names but no explicit gender attribute.
if get_name and (not get_gender):
    male = female = 0
    # Iterate over the detected name column. The earlier code indexed with
    # the leftover (column, label) tuple from the loop above.
    for first_name in tmp_samples[name]:
        if gp.classify(first_name) == 'M':
            male += 1
        else:
            female += 1
    data_summary.append({'Male':male,'Female':female})

# Extend the summary with combinations of sensitive attributes

In [42]:
# Append the group sizes for every combination of two or more candidate
# attributes.
# NOTE(review): the number of combinations grows exponentially with
# len(candidate), so this cell can become very slow for many candidates.
for size in range(2, len(candidate) + 1):
    data_summary.extend(
        tmp_samples.groupby(list(combo)).size()
        for combo in itertools.combinations(candidate, size)
    )

# Save as JSON

In [58]:
# As a string
#json_string = json.dumps(data_summary)