In [2]:

import numpy as np
import pandas as pd
from tqdm import tqdm

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions\
        import load_preproc_data_adult
from aif360.explainers import MetricTextExplainer, MetricJSONExplainer

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import tree
from sklearn import metrics
from collections import Counter
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
import seaborn as sns
from aif360.sklearn import metrics
import graphviz
#from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

#from common_utils import compute_metrics

# Display names of fairness metrics (not referenced by the other visible cells).
all_metrics = [
    "Statistical parity difference",
    "Average odds difference",
    "Equal opportunity difference",
]

# Load the data.
# The CSV location can be overridden with the ADULT_CSV_PATH environment
# variable so the notebook also runs on other machines; when the variable is
# unset, the original local path is used as a fallback.
import os

ADULT_CSV_PATH = os.environ.get(
    'ADULT_CSV_PATH',
    'C:/Users/thano/Υπολογιστής/ΠΤΥΧΙΑΚΗ/DATASETS/adult.csv',
)
adult_data = pd.read_csv(ADULT_CSV_PATH, sep=',', header=0)

In [3]:
# Basic size summary of the freshly loaded frame.
n_rows, n_cols = adult_data.shape
print("Dataset Lenght:: ", len(adult_data))
print("Dataset Shape:: ", adult_data.shape)
#adult_data.sample(50)

# The null mask has the same shape as the frame itself.
print("Rows     : ", n_rows)
print("Columns  : ", n_cols)

# '?' placeholders are only converted to NaN in a later cell, so they are not
# counted as missing here.
null_mask = adult_data.isnull()
print("\nMissing values :  ", null_mask.sum().values.sum())



Dataset Lenght::  32561
Dataset Shape::  (32561, 15)
Rows     :  32561
Columns  :  15

Missing values :   0


In [4]:
# Turn the '?' placeholders into real NaN values so pandas treats them as missing.
adult_data = adult_data.replace(to_replace='?', value=np.nan)
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
# Remove column 'fnlwgt' because it's not strongly connected to protected attributes.
adult_data = adult_data.drop(columns=["fnlwgt"])
#adult_data = adult_data.drop("capital.gain", axis=1)

In [None]:
adult_data.head()

In [6]:
# Collapse the listed non-White race values into the single value 'Other',
# leaving 'race' with just 'White' and 'Other' for later use.
non_white_labels = ['Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo']
adult_data['race'] = adult_data['race'].replace(non_white_labels, 'Other')
adult_data.head(20)

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,Some-college,10,Widowed,,Unmarried,Other,Female,0,4356,40,United-States,<=50K
3,54,Private,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,,>50K


In [None]:
# Plot the category counts of each protected attribute, one panel per attribute.
# The axes are created once by plt.subplots and passed explicitly to seaborn,
# so no extra full-size Axes is left behind the two panels.
lis2 = ['race', 'sex']
fig, axes = plt.subplots(1, len(lis2), figsize=(10, 5))

for ax, col in zip(axes, lis2):
    counts = adult_data[col].value_counts()
    sns.barplot(x=list(counts.index), y=counts.values, ax=ax)
    ax.set_title(col)
    ax.set_ylabel('count')
    # Rotate the tick labels after plotting so the rotation applies to the final ticks.
    ax.tick_params(axis='x', rotation=90)

In [None]:
# Plot the distributions of a few columns.
# The names must match the loaded CSV header ('education.num',
# 'hours.per.week', 'income'); the previous names did not exist in the frame.
lis = ['education.num', 'hours.per.week', 'income']
fig, axes = plt.subplots(1, len(lis), figsize=(15, 5))

for ax, col in zip(axes, lis):
    # A KDE overlay is only drawn for numeric columns; 'income' is still a
    # string label at this point and is shown as plain category counts.
    is_numeric = pd.api.types.is_numeric_dtype(adult_data[col])
    sns.histplot(x=adult_data[col], kde=is_numeric, ax=ax)
    ax.set_title(col)

In [None]:
# Count the missing values in every column.
adult_check_missing = adult_data.isna().sum()
adult_check_missing

In [7]:
# Mode imputation: every missing value is replaced by the most frequent value
# of its column (the first row of DataFrame.mode()). No mean imputation is
# performed here.
column_modes = adult_data.mode().iloc[0]
adult_mode = adult_data.fillna(column_modes)
adult_mode.head(15)

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,Some-college,10,Widowed,Prof-specialty,Unmarried,Other,Female,0,4356,40,United-States,<=50K
3,54,Private,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,United-States,>50K


In [9]:
# Keep only the categorical (object-dtype) columns of the imputed frame;
# these are the columns that get label-encoded afterwards.
adult_data_categorical = adult_mode.select_dtypes(include='object')


In [10]:
# Label-encode the categorical values, one column at a time.
# The same encoder object is re-fitted for every column, so after this cell
# `le` only remembers the classes of the last column it saw.
le = preprocessing.LabelEncoder()
adult_data_categorical = adult_data_categorical.apply(
    lambda column: le.fit_transform(column)
)
adult_data_categorical.head()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
0,3,11,6,9,1,1,0,38,0
1,3,11,6,3,1,1,0,38,0
2,3,15,6,9,4,0,0,38,0
3,3,5,0,6,4,1,0,38,0
4,3,15,5,9,3,1,0,38,0


In [12]:
# Swap the original string-valued categorical columns for their label-encoded
# versions: drop them first, then append the encoded columns on the right.
numeric_part = adult_mode.drop(columns=adult_data_categorical.columns)
adult_mode = pd.concat([numeric_part, adult_data_categorical], axis=1)
adult_mode.head(5)


Unnamed: 0,age,education.num,capital.gain,capital.loss,hours.per.week,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
0,90,9,0,4356,40,3,11,6,9,1,1,0,38,0
1,82,9,0,4356,18,3,11,6,3,1,1,0,38,0
2,66,10,0,4356,40,3,15,6,9,4,0,0,38,0
3,54,4,0,3900,40,3,5,0,6,4,1,0,38,0
4,41,10,0,3900,40,3,15,5,9,3,1,0,38,0


In [None]:
# Class balance of the target, which is the last column of the frame.
target = adult_mode.values[:, -1]
counter = Counter(target)
total = len(target)
for label, count in counter.items():
    share = count / total * 100
    print('Class=%d, Count=%d, Percentage=%.3f%%' % (label, count, share))

In [None]:
adult_mode.head()

In [1]:
# Min-max scaling of every column of the encoded frame.
# NOTE(review): the scaler is fitted on the whole frame, before any train/test
# split is made; confirm this is intended.
scaler = MinMaxScaler()
adult = scaler.fit(adult_mode).transform(adult_mode)
#X_test_scaled = scaler.transform(X_test)


NameError: name 'MinMaxScaler' is not defined

In [None]:
# Rebuild a DataFrame from the scaled array, reusing the original column names.
scaled_columns = adult_mode.columns
adult_mode = pd.DataFrame(adult, columns=scaled_columns)
adult_mode.head()

In [None]:
# Cast the target variable income back to 64-bit integers (the scaling step
# returned it as floats).
adult_mode['income'] = adult_mode['income'].astype(np.int64)

In [None]:
# Response / dependent variable -> y
y = adult_mode['income']
# Independent variables / features -> X (every column except the target)
X = adult_mode.drop(columns=['income'])


In [None]:
# Splitting the data into train (80%) and test (20%).
# A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Group definitions used by the aif360 metrics.
#privileged_groups = [{'sex' :1}]
#unprivileged_groups = [{'sex' :0}]
# 'race' was label-encoded as 1 = 'White', 0 = 'Other' (see the encoded table above).
privileged_groups = [{'race': 1.0}]
unprivileged_groups = [{'race': 0.0}]

In [None]:
# Wrap the full encoded frame in an aif360 BinaryLabelDataset so that the
# statistical parity difference (SPD) can be computed on it.
# - income is label-encoded as 0 = '<=50K', 1 = '>50K', so the favorable
#   outcome is 1, consistent with the datasets built later for the test split.
# - the protected attribute must be 'race', because privileged_groups and
#   unprivileged_groups are keyed on 'race'; a dataset protected on another
#   attribute cannot be conditioned on those groups.
binaryLabelDataset = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=adult_mode,
    label_names=['income'],
    protected_attribute_names=['race'])

In [None]:
# SPD without classification: measured directly on the labelled dataset.
metric_orig_train = BinaryLabelDatasetMetric(
    binaryLabelDataset,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups,
)

# Wrap the metric in a text explainer and print its description of the
# statistical parity difference.
text_exp_otr = MetricTextExplainer(metric_orig_train)
spd_explanation = text_exp_otr.statistical_parity_difference()
print(spd_explanation)

In [None]:
# SPD computation: same metric as before, printed as a plain number.
from aif360.metrics import BinaryLabelDatasetMetric
metric_orig_train = BinaryLabelDatasetMetric(
    binaryLabelDataset,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups,
)
display(Markdown("#### Original training dataset"))
spd_value = metric_orig_train.statistical_parity_difference()
print("Staistical Parity Difference between unprivileged and privileged groups = %f" % spd_value)

In [None]:
y_train.head(30)

In [None]:
# Grid-search a decision tree over the split criterion and max_depth (2..19);
# every other hyper-parameter keeps its default value.
from sklearn.model_selection import GridSearchCV
tree_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 20),
}
dt_default = GridSearchCV(DecisionTreeClassifier(), tree_param)
dt_default.fit(X_train, y_train)

print(f'Best score: {dt_default.best_score_}')
print(f'Best parameters: {dt_default.best_params_}')


In [None]:
# Build the classifier from the best parameters found by the grid search
# (dt_default) and fit it to the training data. Reading the parameters from
# the search result keeps this cell in sync with the search instead of
# relying on hard-coded values.
dt_best = DecisionTreeClassifier(**dt_default.best_params_)
dt_best.fit(X_train, y_train)

In [None]:
# making predictions with the fitted decision tree on the held-out test features
y_pred_default = dt_best.predict(X_test)
# one predicted label per test row
y_pred_default.shape

In [None]:
# Print the accuracy of the decision-tree predictions.
# The import rebinds `metrics` to sklearn.metrics; the name was last bound to
# aif360.sklearn.metrics by the import cell.
from sklearn import metrics
accuracy_scorer = metrics.make_scorer(metrics.accuracy_score)
test_accuracy = metrics.accuracy_score(y_test, y_pred_default)
print(f"\nAccuracy Score is: {test_accuracy}")

In [None]:
# Export the fitted tree as DOT source, labelling the nodes with the training
# feature names.
X_col_names = list(X_train.columns)
feature_names = X_col_names
dot_data = export_graphviz(
    dt_best,
    feature_names=feature_names,
    rounded=True,
    filled=True,
)

In [None]:
# Draw graph: parse the DOT source produced by export_graphviz
graph = pydotplus.graph_from_dot_data(dot_data)
# and write the rendered tree to a PNG file in the working directory
graph.write_png("myTree.png")

In [None]:
# Show graph inline by rendering it to PNG bytes and handing them to IPython
Image(graph.create_png())

In [None]:
# True positive rate and true negative rate of the decision-tree predictions.
# Both rates are normalised by the number of ACTUAL positives / negatives;
# dividing by the predicted counts would give precision and negative
# predictive value instead.
y_true_arr = np.asarray(y_test).ravel()
y_pred_arr = np.asarray(y_pred_default).ravel()

tpr = np.sum((y_pred_arr == 1) & (y_true_arr == 1)) / np.sum(y_true_arr == 1)
tnr = np.sum((y_pred_arr == 0) & (y_true_arr == 0)) / np.sum(y_true_arr == 0)

# Printing accuracy
print("Accuracy score=",accuracy_score(y_test,y_pred_default))
print("True positive rate =",tpr)  
print("True negative rate =",tnr)

In [None]:
X_test = pd.concat([X_test,y_test],axis=1)
X_test

In [None]:
binaryLabelDataset = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df= X_test,
    label_names=['income'],
    protected_attribute_names=['race'])

In [None]:
from aif360.metrics import BinaryLabelDatasetMetric

def fair_metrics(binaryLabelDataset, y_pred_default):
    """Compute fairness metrics for a set of predicted labels.

    Parameters
    ----------
    binaryLabelDataset : BinaryLabelDataset
        Dataset holding the features and protected attributes of the rows
        that were predicted. It is copied, not modified.
    y_pred_default : array-like
        Predicted labels, one per dataset row, in the dataset's row order.

    Returns
    -------
    dict
        ``{'statistical_parity_difference': value}`` computed on the predicted
        labels for the first protected attribute of the dataset.

    Raises
    ------
    ValueError
        If the number of predictions does not match the number of rows.
    """
    dataset_pred = binaryLabelDataset.copy()

    # Reshape the predictions to a float column so they have the same layout
    # as the labels already stored on the dataset.
    predicted = np.asarray(y_pred_default, dtype=float).reshape(-1, 1)
    if predicted.shape[0] != dataset_pred.labels.shape[0]:
        raise ValueError(
            "Number of predictions (%d) does not match number of dataset rows (%d)"
            % (predicted.shape[0], dataset_pred.labels.shape[0])
        )
    dataset_pred.labels = predicted

    # Only the first protected attribute is evaluated; its privileged and
    # unprivileged values are read from the dataset itself.
    attr = dataset_pred.protected_attribute_names[0]
    privileged_groups = [{attr: dataset_pred.privileged_protected_attributes[0][0]}]
    unprivileged_groups = [{attr: dataset_pred.unprivileged_protected_attributes[0][0]}]

    metric_pred = BinaryLabelDatasetMetric(
        dataset_pred,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )

    return {'statistical_parity_difference': metric_pred.statistical_parity_difference()}


fair_metrics(binaryLabelDataset, y_pred_default)

In [None]:
from aif360.metrics import BinaryLabelDatasetMetric
# SPD of the true labels in the current binaryLabelDataset.
# NOTE(review): by this point binaryLabelDataset has been rebuilt from the test
# split, while the heading below still says "training"; confirm the wording.
metric_orig_train = BinaryLabelDatasetMetric(
    binaryLabelDataset,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups,
)
display(Markdown("#### Original training dataset"))
spd_value = metric_orig_train.statistical_parity_difference()
print("Staistical Parity Difference between unprivileged and privileged groups = %f" % spd_value)

In [None]:
# Candidate classifiers to compare (five models).
models = [
    LogisticRegression(max_iter=500),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    GaussianNB(),
]

In [None]:
# Compare the accuracy of every classifier in `models` with 10-fold cross validation
def compare_models_cross_validation():
    """Print per-fold and mean cross-validation accuracy for each model.

    Reads the notebook-level names ``models``, ``X`` and ``y``. Each model is
    scored with 10-fold cross validation; the mean accuracy is reported as a
    percentage rounded to two decimals. Nothing is returned.
    """
    for model in models:
        cv_score = cross_val_score(model, X, y, cv=10)

        # Mean over the folds, expressed as a percentage.
        mean_accuracy = round(sum(cv_score) / len(cv_score) * 100, 2)

        print('Cross Validation accuracies for ', model, '=  ', cv_score)
        print('Accuracy % of the ', model, mean_accuracy )
        print('----------------------------------------------')

In [None]:
#Execute the function
compare_models_cross_validation()

In [None]:
#Train with the best classifier based on accuracyrb 
from sklearn import model_selection


In [None]:
from sklearn.model_selection import GridSearchCV

# Search n_neighbors over 1..30 for a KNN classifier.
knn = KNeighborsClassifier()
k_range = list(range(1, 31))
param_grid = {'n_neighbors': k_range}

# 5-fold grid search scored on accuracy.
grid = GridSearchCV(
    knn,
    param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=False,
    verbose=1,
)

# Fit the search on the training split.
grid_search = grid.fit(X_train, y_train)

In [None]:
print(grid_search.best_params_)

In [None]:
# Create the KNN classifier with the neighbourhood size selected by the grid
# search (grid_search) instead of a hard-coded value.
knn = KNeighborsClassifier(**grid_search.best_params_)

# Train the model using the training sets
knn.fit(X_train, y_train)

# Predict the response for the test dataset.
# An earlier cell may have appended the 'income' column to X_test, so the
# frame is restricted to exactly the feature columns the model was trained on.
y_pred = knn.predict(X_test[X_train.columns])

In [None]:
X_test = pd.concat([X_test,y_test],axis=1)
X_test

In [None]:
binaryLabelDataset3 = BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df= X_test,
    label_names=['income'],
    protected_attribute_names=['race'])

In [None]:
from aif360.metrics import BinaryLabelDatasetMetric

# fair_metrics(dataset, predictions) is already defined in an earlier cell with
# identical logic; it is reused here rather than redefined, so the notebook
# has a single definition of the function.
fair_metrics(binaryLabelDataset3, y_pred)