In [140]:
import sys
import numbers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
'pip install git+git://github.com/clintval/gender_predictor.git'
from gender_predictor import GenderPredictor
import itertools

In [120]:
from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

# Read input dataset

In [121]:
# Load the input CSV into a DataFrame using pandas' default parsing options
# (no explicit header/dtype arguments are passed here).
tmp_samples = pd.read_csv('adult.csv')

# Test whether the input dataset contains a header row

In [122]:
def identify_header(path, n=5, th=0.9):
    """Guess whether the CSV at ``path`` starts with a header row.

    The first ``n`` rows are parsed twice: once treating the first line as a
    header and once treating it as data. If the first line really is a header,
    reading it as data tends to change the inferred column dtypes, so the two
    dtype vectors disagree.

    Parameters
    ----------
    path : str
        Location of the CSV file to inspect.
    n : int, default 5
        Number of rows passed to ``pd.read_csv`` as ``nrows`` for each parse.
    th : float, default 0.9
        Threshold on the fraction of columns whose dtype is the same in both
        parses.

    Returns
    -------
    str or None
        ``'infer'`` when the matching fraction is below ``th`` (header
        detected), otherwise ``None``.
    """
    with_header = pd.read_csv(path, header='infer', nrows=n)
    without_header = pd.read_csv(path, header=None, nrows=n)
    dtype_matches = with_header.dtypes.values == without_header.dtypes.values
    similarity = dtype_matches.mean()
    if similarity < th:
        return 'infer'
    return None

In [123]:
# True when the dtype heuristic reports that the first CSV row is a header.
header = identify_header('adult.csv', n=5, th=0.9) == 'infer'

In [124]:
# Column names (compared case-insensitively) that are treated as sensitive.
sensitive = ['age','area','nation','nationality','sex','gender','ethnicity','race']

# Check the matched sensitive attributes; without a header the column names
# carry no meaning, so nothing can be matched by name.
candidate = (
    [column for column in tmp_samples.columns if column.lower() in sensitive]
    if header
    else []
)

# Generate Header

In [125]:
# Convert input dataset to the required form:
# two single-column frames with one (initially empty) row per input column.
index_range = range(tmp_samples.shape[1])
df_value = pd.DataFrame(index=index_range, columns=['value'])
df_label = pd.DataFrame(index=index_range, columns=['label'])

In [126]:
# For every input column, store the stringified list of its distinct
# non-missing values in df_value and the column name in df_label.
for idx, column in enumerate(tmp_samples.columns):
    # Series.tolist() yields plain Python scalars, so str() below produces an
    # ordinary list literal such as "[39, 50]". Wrapping the NumPy array from
    # .unique() in list() keeps NumPy scalars, whose repr under NumPy >= 2 is
    # "np.int64(39)" and no longer reads as a plain list of values.
    unique_val = tmp_samples[column].drop_duplicates().tolist()
    val_no_nan = [x for x in unique_val if str(x) != 'nan']
    if len(val_no_nan) == 0:
        # No value in any cell of this attribute: fall back to the column name
        val_no_nan = [column]
    df_value.at[idx, 'value'] = str(val_no_nan)
    df_label.at[idx, 'label'] = column

In [127]:
# Load Sherlock architecture and weights from files.
# The context manager closes the JSON file even if reading it fails, and
# before the (potentially failing) model reconstruction starts.
with open('../models/sherlock_model.json', 'r') as file:
    sherlock_file = file.read()
sherlock = tf.keras.models.model_from_json(sherlock_file)

sherlock.load_weights('../models/sherlock_weights.h5')
sherlock.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['categorical_accuracy'])

In [128]:
# Hand the per-column value strings (df_value['value']) and column names
# (df_label['label']) to Sherlock's converter.
# NOTE(review): presumably this parses each stringified list back into a real
# list of values -- confirm against sherlock.features.preprocessing.
test_samples_converted, y_test = convert_string_lists_to_lists(df_value, df_label,'value','label')

100%|██████████| 15/15 [00:00<00:00, 337.65it/s]


In [129]:
# Build the Sherlock feature matrix from the converted samples. The captured
# output below shows that this step checks for the GloVe and paragraph-vector
# embedding files before extracting features.
X_test = extract_features(test_samples_converted)

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


In [130]:
# Predict one label per row of X_test (i.e. per input column) with the model
# selected by nn_id='sherlock'. The labels are used later as generated column
# names and to detect sensitive attributes.
predicted_labels = predict_sherlock(X_test, nn_id='sherlock')

In [131]:
# If there is no header, assign the generated header to the dataset
if not header:
    # The dataset was first loaded with pandas' default header inference,
    # which turns the first line of the file into column names. For a
    # headerless file that line is a real record, so re-read the file with
    # header=None to recover it before applying the predicted labels.
    tmp_samples = pd.read_csv('adult.csv', header=None)
    tmp_samples.columns = predicted_labels

In [132]:
# Filter out sensitive attributes:
# add every column whose predicted label is a sensitive type, unless the
# column was already selected by name. The (column, label) pair is unpacked
# explicitly; comparing the whole tuple against lists of strings can never
# match, so no column would ever be added.
for column, label in zip(tmp_samples.columns, predicted_labels):
    if column not in candidate and label in ['country','sex','age']:
        candidate.append(column)

# Test whether the attribute is categorical

In [133]:
def summary_age(x):
    """Count how many ages fall into each fixed age band.

    Parameters
    ----------
    x : iterable of numbers
        Ages to summarise (e.g. a pandas Series). Missing values (NaN/None)
        are skipped; otherwise every comparison against NaN is False and the
        value would be miscounted in the '>64' band.

    Returns
    -------
    dict
        Mapping from band label ('<16', '16-24', '25-34', '35-44', '45-54',
        '55-64', '>64') to the number of ages in that band.
    """
    age = {'<16':0,'16-24':0,'25-34':0,
          '35-44':0,'45-54':0,'55-64':0,'>64':0}
    # Inclusive upper bound of each band between the two open-ended ones.
    bands = [('16-24', 24), ('25-34', 34), ('35-44', 44),
             ('45-54', 54), ('55-64', 64)]
    for value in x:
        if pd.isna(value):
            continue
        if value < 16:
            age['<16'] += 1
            continue
        for label, upper in bands:
            if value <= upper:
                age[label] += 1
                break
        else:
            # Larger than every bounded band.
            age['>64'] += 1
    return age

In [134]:
# Build one summary per column that is either a sensitive candidate or looks
# categorical. Columns that are neither are skipped, so a summary computed
# for an earlier column is never appended again under a different column.
data_summary = []
n_rows = tmp_samples.shape[0]
for column in tmp_samples.columns:
    values = tmp_samples[column]
    summary = None
    if column in candidate:
        # It must be categorical
        if column.lower() in ['age']:
            summary = summary_age(values)
        else:
            summary = values.value_counts()
    else:
        # check whether it is categorical or not:
        # few distinct values relative to the row count, and not an
        # integer/float column. The pandas dtype helpers replace the
        # np.int / np.float aliases, which no longer exist in current NumPy.
        low_cardinality = n_rows > 0 and len(values.unique()) / n_rows < 0.2
        is_number = (pd.api.types.is_integer_dtype(values)
                     or pd.api.types.is_float_dtype(values))
        if low_cardinality and not is_number:
            # we determine it is categorical
            summary = values.value_counts()
    if summary is not None:
        data_summary.append(summary)

  if sys.path[0] == '':
  del sys.path[0]


# Further, check what we can extend

##### First, predict gender from first names when the dataset has a name attribute but no gender attribute

In [135]:
# Create the name-based gender classifier and train/evaluate it. The captured
# output below reports the number of male/female names and the classifier
# accuracy.
gp = GenderPredictor()
gp.train_and_test()

import complete
32,031 male names
56,347 female names
classifier accuracy: 96.81%


In [136]:
# Detect whether the dataset has a name-like column and/or a gender column,
# looking at both the actual column name and the predicted label.
name_keys = ['name','first_name','first','last','last_name','first name','last name']
get_gender = get_name = False
name = None
for column, label in zip(tmp_samples.columns, predicted_labels):
    if column.lower() in name_keys or label.lower() in name_keys:
        get_name = True
        name = column  # the last matching column is the one used below
    if column.lower() in ['sex','gender'] or label.lower() in ['sex']:
        get_gender = True

# Only infer gender from names when no gender attribute is already present.
if get_name and (not get_gender):
    male = female = 0
    # Iterate over the detected name column (not the leftover loop variable
    # of the detection loop); missing names are skipped rather than passed
    # to the classifier.
    for person in tmp_samples[name].dropna():
        if gp.classify(person) == 'M':
            male += 1
        else:
            female += 1
    data_summary.append({'Male':male,'Female':female})

# Extend the summary with combinations of sensitive attributes

In [141]:
# Display the sensitive attributes collected so far.
candidate

['age', 'race', 'sex']

In [160]:
# For every combination of two or more sensitive attributes, append the
# number of rows in each group of that combination.
for combo_size in range(2, len(candidate) + 1):
    data_summary.extend(
        tmp_samples.groupby(list(combo)).size()
        for combo in itertools.combinations(candidate, combo_size)
    )