In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

In [2]:
from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download approximately 800 MB of data into the `features/` directory.

In [3]:
# Fetch the raw/preprocessed data archive, then the files needed for extracting
# word and paragraph embeddings.
helpers.download_data()
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


In [4]:
# Load Sherlock architecture and weights from files.
# The context manager guarantees the JSON file is closed even if reading fails.
with open('../models/sherlock_model.json', 'r') as model_json_file:
    sherlock_file = model_json_file.read()

# Rebuild the network from its JSON description, then restore the weights.
sherlock = tf.keras.models.model_from_json(sherlock_file)
sherlock.load_weights('../models/sherlock_weights.h5')

# Compile the restored model with the optimizer, loss and metric named below.
sherlock.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['categorical_accuracy'])

W1114 20:08:39.666452  2128 deprecation.py:506] From c:\users\tiany\anaconda3\envs\cs576\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1114 20:08:39.667453  2128 deprecation.py:506] From c:\users\tiany\anaconda3\envs\cs576\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1114 20:08:39.668954  2128 deprecation.py:506] From c:\users\tiany\anaconda3\envs\cs576\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with 

In [4]:
# Load the dataset whose columns are analysed in the rest of the notebook.
# Alternative input (disabled):
#tmp_samples = pd.read_csv('police_killings.csv',encoding = "ISO-8859-1")
tmp_samples = pd.read_csv('adult.csv')
# Disabled: fill missing cells with the column means, round-tripped through
# tmp_samples.csv.
#pd.DataFrame(tmp_samples.mean()).transpose().to_csv('tmp_samples.csv')
#tmp_samples_means = pd.read_csv('tmp_samples.csv', index_col=0)
#tmp_samples.fillna(tmp_samples_means.iloc[0], inplace=True)

In [6]:
# Convert input dataset to the required form
index_range = range(len(tmp_samples.columns))
df_value = pd.DataFrame(columns = ['value'],index = index_range)
df_label = pd.DataFrame(columns = ['label'],index = index_range)

In [7]:
# Collapse each column into one row: the stringified list of its distinct
# values (entries whose string form is 'nan' removed) paired with the column name.
for row_pos, column_name in enumerate(tmp_samples.columns):
    distinct_values = [
        value
        for value in tmp_samples[column_name].unique()
        if str(value) != 'nan'
    ]
    # No value in any cell of this attribute: fall back to the column name itself.
    df_value.at[row_pos, 'value'] = str(distinct_values or [column_name])
    df_label.at[row_pos, 'label'] = column_name

In [8]:
# NOTE(review): presumably parses each stringified list in 'value' back into a
# list and returns the 'label' entries alongside — confirm against
# sherlock.features.preprocessing.
test_samples_converted, y_test = convert_string_lists_to_lists(df_value, df_label,'value','label')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 154.51it/s]


In [9]:
# Build the model input from the converted columns. The log printed by this
# call refers to the word and paragraph embedding files.
X_test = extract_features(test_samples_converted)

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


  regex = re.compile(pat, flags=flags)


In [10]:
# Predict one label per input column.
# NOTE(review): nn_id presumably selects the stored 'sherlock' model — confirm.
predicted_labels = predict_sherlock(X_test, nn_id='sherlock')

In [11]:
# Column names of the input, for comparison with the predicted labels.
tmp_samples.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [12]:
# Predicted label for each column (the fairness cell pairs these with
# tmp_samples.columns by position).
predicted_labels

array(['day', 'owner', 'address', 'day', 'position', 'notes', 'address',
       'family', 'album', 'sex', 'symbol', 'year', 'age', 'country',
       'address'], dtype=object)

Predicted labels treated as sensitive attributes in the fairness test below: 'country', 'sex', 'age'.

## Predict Gender based on Name

In [31]:
# Install the dependency once, outside this cell, if it is missing:
#   pip install git+https://github.com/clintval/gender_predictor.git
# (https:// is used because GitHub no longer serves the unauthenticated git:// protocol.)
from gender_predictor import GenderPredictor

In [5]:
# Instantiate the predictor. Per the captured output, the first run downloads
# names.zip and saves names.pickle.
gp = GenderPredictor()

names.pickle does not exist... creating
names.zip does not exist... downloading
names.pickle saved
32,031 male names
56,347 female names


In [6]:
# Train and test the name classifier; prints the classifier accuracy.
gp.train_and_test()

classifier accuracy: 97.00%


In [12]:
# Classify a single name; the result is a gender code such as 'M'.
gp.classify('Gang')

'M'

## Age Summarization

In [20]:
def summary_age(x):
    """Count how many ages in ``x`` fall into each age bracket.

    Parameters
    ----------
    x : iterable of numbers
        Ages to bucket.

    Returns
    -------
    dict
        Maps each bracket label ('<16', '16-24', '25-34', '35-44', '45-54',
        '55-64', '>64') to the number of values in it, in that key order.
        Values below 16 go to '<16'; otherwise a value goes to the first
        bracket whose inclusive upper bound it does not exceed, and anything
        that matches no bound is counted under '>64'.
    """
    # Inclusive upper bounds of the bounded brackets, in ascending order.
    upper_bounds = (
        ('16-24', 24),
        ('25-34', 34),
        ('35-44', 44),
        ('45-54', 54),
        ('55-64', 64),
    )

    counts = {'<16': 0}
    counts.update({label: 0 for label, _ in upper_bounds})
    counts['>64'] = 0

    for value in x:
        if value < 16:
            bracket = '<16'
        else:
            bracket = next(
                (label for label, limit in upper_bounds if value <= limit),
                '>64',
            )
        counts[bracket] += 1
    return counts

In [23]:
# Bracket counts for the 'age' column of the loaded dataset.
summary_age(tmp_samples['age'])

{'<16': 0,
 '16-24': 5570,
 '25-34': 8479,
 '35-44': 8151,
 '45-54': 5853,
 '55-64': 3172,
 '>64': 1336}

## Test whether an attribute is categorical

In [19]:
# Heuristic: treat the column as categorical when its distinct values make up
# less than 20% of the row count.
workclass_distinct = tmp_samples['workclass'].unique()
distinct_ratio = len(workclass_distinct) / tmp_samples.shape[0]
if distinct_ratio < 0.2:
    print("Yes, it is categorical.")

Yes, it is categorical.


## Test whether there is a header

In [23]:
def identify_header(path, n=5, th=0.9):
    """Guess whether the CSV at ``path`` starts with a header row.

    The first ``n`` rows are read twice: once letting pandas infer the header
    and once treating every line as data. If the first line is a header,
    reading it as data tends to change the column dtypes, so a low share of
    matching dtypes between the two reads indicates a header.

    Parameters
    ----------
    path : str
        Location of the CSV file (read twice).
    n : int, default 5
        Number of rows to read in each pass.
    th : float, default 0.9
        Threshold on the fraction of columns whose dtype matches in both reads.

    Returns
    -------
    'infer' or None
        'infer' when the matching fraction is below ``th`` (header assumed),
        otherwise None. Both are valid ``header`` arguments for ``pd.read_csv``.
    """
    with_header, without_header = (
        pd.read_csv(path, header=header_option, nrows=n)
        for header_option in ('infer', None)
    )
    dtype_matches = with_header.dtypes.values == without_header.dtypes.values
    if dtype_matches.mean() < th:
        return 'infer'
    return None

## Fairness Test

In [13]:
from scipy.stats import chi2_contingency
import itertools

In [14]:
# Get matched sensitive attributes
bias = [] # Format: sensitive, target
unbias = []
candidate = []
target = []
for i in zip(tmp_samples.columns,predicted_labels):
    if i[1] in ['country','sex','age']:
        candidate.append(i[0])
    else:
        target.append(i[0])

In [None]:
# For every target column and every non-empty combination of candidate
# (sensitive) columns, run a chi-square test of independence on their
# contingency table. Pairs with p < significance_level are appended to `bias`,
# all others to `unbias`, as [attribute list, target].
significance_level = 0.05
for tag in target:
    for L in range(1, len(candidate)+1):
        for subset in itertools.combinations(candidate, L):
            attr_list = list(subset)

            # pd.crosstab counts the co-occurrences in one vectorized pass.
            # It replaces the per-row iterrows() counting, the separate
            # integer/non-integer branches, and the row index built from a
            # 2-D `.values` array.
            # NOTE(review): rows with a missing value in any involved column
            # are presumably left out of the table — confirm that is acceptable.
            test = pd.crosstab([tmp_samples[attr] for attr in attr_list],
                               tmp_samples[tag])

            # Element [1] of the chi2_contingency result is the p-value.
            if chi2_contingency(test)[1] < significance_level:
                bias.append([attr_list, tag])
            else:
                unbias.append([attr_list, tag])

  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Distinct (sex, hours.per.week) combinations, as a 2-D array.
tmp_samples[['sex', 'hours.per.week']].drop_duplicates().values

In [None]:
# Row count for each observed (sex, hours.per.week, income) combination; the
# count column is renamed from 0 to an empty string.
tmp_samples.groupby(['sex', 'hours.per.week','income']).size().reset_index().rename(columns={0:''})