This is my first attempt on Kaggle so I decided to try using the sklearn framework that I am most comfortable with right now. So far, I am able to get the accuracy to about 86.73%.

In [7]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Input data files are available in the "../input" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output."

# set pandas chained_assignment flag = None here
pd.options.mode.chained_assignment = None

In [8]:
from aif360.datasets import StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
import matplotlib.patches as patches
from aif360.algorithms.preprocessing import Reweighing
#from packages import *
#from ml_fairness import *
import matplotlib.pyplot as plt
import seaborn as sns



from IPython.display import Markdown, display

In [9]:
def preprocess_target(dframe, df_column_name):
    col = dframe[[df_column_name]]
    le_col = LabelEncoder()
    le_col.fit(np.ravel(col))
    return le_col.transform(np.ravel(col))

def preprocess_features(dframe):
    for column in dframe:
        enc = LabelEncoder()
        if(column not in ['age','education.num','fnlwgt','capital.gain','capital.loss','hours.per.week']):
            dframe[column] = enc.fit_transform(dframe[column])
    return dframe

In [10]:
# import data and preprocess
df = pd.read_csv('../../Data/adult.csv')

# select and preprocess features
le_data = LabelEncoder()
features = ['age','workclass','education','marital.status','occupation','education.num','race','sex','relationship','capital.gain','capital.loss','native.country','income']
data = df[features]
data = preprocess_features(data)

# select target
data_new = data
target = data['income']
data = data.drop('income', axis=1)

Unnamed: 0,age,workclass,education,marital.status,occupation,education.num,race,sex,relationship,capital.gain,capital.loss,native.country
0,90,0,11,6,0,9,4,0,1,0,4356,39
1,82,4,11,6,4,9,4,0,1,0,4356,39
2,66,0,15,6,0,10,2,0,4,0,4356,39
3,54,4,5,0,7,4,4,0,4,0,3900,39
4,41,4,15,5,10,10,4,0,3,0,3900,39
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,4,15,4,11,10,4,1,1,0,0,39
32557,27,4,7,2,13,12,4,0,5,0,0,39
32558,40,4,11,2,7,9,4,1,0,0,0,39
32559,58,4,11,6,1,9,4,0,4,0,0,39


In [None]:
# split train and test data
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.4, random_state=0)

In [None]:
# select algorithm
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import AdaBoostClassifier
#clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME",n_estimators=200)
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(loss='deviance', n_estimators=100, learning_rate=1.0,max_depth=2, random_state=0)

In [None]:
# fit and predict
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

In [None]:
# display the relative importance of each attribute
relval = clf.feature_importances_

# horizontal bar plot of feature importance
pos = np.arange(12) + 0.5
plt.barh(pos, relval, align='center')
plt.title("Feature Importance")
plt.xlabel("")
plt.ylabel("Features")
plt.yticks(pos, ('Age','Working Class','Education','Marital Status','Occupation','Education Grade','Race','Sex','Relationship Status','Capital Gain','Capital Loss','Native Country'))
plt.grid(True)

In [None]:
# calc metrics
true_negatives = 0
false_negatives = 0
true_positives = 0
false_positives = 0
for prediction, truth in zip(predictions, y_test):
    if prediction == 0 and truth == 0:
        true_negatives += 1
    elif prediction == 0 and truth == 1:
        false_negatives += 1
    elif prediction == 1 and truth == 0:
        false_positives += 1
    elif prediction == 1 and truth == 1:
        true_positives += 1
    else:
        print ("Warning: Found a predicted label not == 0 or 1.")
        print ("All predictions should take value 0 or 1.")
        print ("Evaluating performance for processed predictions:")
        break

In [None]:
try:
    print("Test Dataset (40%):")
    print("true_positives",true_positives)
    print("true_negatives",true_negatives)
    print("false_positives",false_positives)
    print("false_negatives",false_negatives)
    total_predictions = true_negatives + false_negatives + false_positives + true_positives
    print("total predictions:",total_predictions)
    accuracy = 1.0*(true_positives + true_negatives)/total_predictions
    print("accuracy:",accuracy)
    precision = 1.0*true_positives/(true_positives+false_positives)
    print("precision:",precision)
    recall = 1.0*true_positives/(true_positives+false_negatives)
    print("recall",recall)
    f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
    print("f1",f1)
    f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    print("f2",f2)
    print (clf)
    #print (PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
    #print (RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
    print ("")
except:
    print ("Got a divide by zero when trying out:", clf)
    print ("Precision or recall may be undefined due to a lack of true positive predicitons.")

## Fairness

In [11]:
# This DataFrame is created to stock differents models and fair metrics that we produce in this notebook
algo_metrics = pd.DataFrame(columns=['model', 'fair_metrics', 'prediction', 'probs'])

def add_to_df_algo_metrics(algo_metrics, model, fair_metrics, preds, probs, name):
    return algo_metrics.append(pd.DataFrame(data=[[model, fair_metrics, preds, probs]], columns=['model', 'fair_metrics', 'prediction', 'probs'], index=[name]))

In [12]:
def fair_metrics(dataset, pred, pred_is_dataset=False):
    if pred_is_dataset:
        dataset_pred = pred
    else:
        dataset_pred = dataset.copy()
        dataset_pred.labels = pred
    
    cols = ['statistical_parity_difference', 'equal_opportunity_difference', 'average_abs_odds_difference',  'disparate_impact', 'theil_index']
    obj_fairness = [[0,0,0,1,0]]
    
    fair_metrics = pd.DataFrame(data=obj_fairness, index=['objective'], columns=cols)
    
    for attr in dataset_pred.protected_attribute_names:
        idx = dataset_pred.protected_attribute_names.index(attr)
        privileged_groups =  [{attr:dataset_pred.privileged_protected_attributes[idx][0]}] 
        unprivileged_groups = [{attr:dataset_pred.unprivileged_protected_attributes[idx][0]}] 
        
        classified_metric = ClassificationMetric(dataset, 
                                                     dataset_pred,
                                                     unprivileged_groups=unprivileged_groups,
                                                     privileged_groups=privileged_groups)

        metric_pred = BinaryLabelDatasetMetric(dataset_pred,
                                                     unprivileged_groups=unprivileged_groups,
                                                     privileged_groups=privileged_groups)

        acc = classified_metric.accuracy()

        row = pd.DataFrame([[metric_pred.mean_difference(),
                                classified_metric.equal_opportunity_difference(),
                                classified_metric.average_abs_odds_difference(),
                                metric_pred.disparate_impact(),
                                classified_metric.theil_index()]],
                           columns  = cols,
                           index = [attr]
                          )
        fair_metrics = fair_metrics.append(row)    
    
    fair_metrics = fair_metrics.replace([-np.inf, np.inf], 2)
        
    return fair_metrics

def plot_fair_metrics(fair_metrics):
    fig, ax = plt.subplots(figsize=(20,4), ncols=5, nrows=1)

    plt.subplots_adjust(
        left    =  0.125, 
        bottom  =  0.1, 
        right   =  0.9, 
        top     =  0.9, 
        wspace  =  .5, 
        hspace  =  1.1
    )

    y_title_margin = 1.2

    plt.suptitle("Fairness metrics", y = 1.09, fontsize=20)
    sns.set(style="dark")

    cols = fair_metrics.columns.values
    obj = fair_metrics.loc['objective']
    size_rect = [0.2,0.2,0.2,0.4,0.25]
    rect = [-0.1,-0.1,-0.1,0.8,0]
    bottom = [-1,-1,-1,0,0]
    top = [1,1,1,2,1]
    bound = [[-0.1,0.1],[-0.1,0.1],[-0.1,0.1],[0.8,1.2],[0,0.25]]

    display(Markdown("### Check bias metrics :"))
    display(Markdown("A model can be considered bias if just one of these five metrics show that this model is biased."))
    for attr in fair_metrics.index[1:len(fair_metrics)].values:
        display(Markdown("#### For the %s attribute :"%attr))
        check = [bound[i][0] < fair_metrics.loc[attr][i] < bound[i][1] for i in range(0,5)]
        display(Markdown("With default thresholds, bias against unprivileged group detected in **%d** out of 5 metrics"%(5 - sum(check))))

    for i in range(0,5):
        plt.subplot(1, 5, i+1)
        ax = sns.barplot(x=fair_metrics.index[1:len(fair_metrics)], y=fair_metrics.iloc[1:len(fair_metrics)][cols[i]])
        
        for j in range(0,len(fair_metrics)-1):
            a, val = ax.patches[j], fair_metrics.iloc[j+1][cols[i]]
            marg = -0.2 if val < 0 else 0.1
            ax.text(a.get_x()+a.get_width()/5, a.get_y()+a.get_height()+marg, round(val, 3), fontsize=15,color='black')

        plt.ylim(bottom[i], top[i])
        plt.setp(ax.patches, linewidth=0)
        ax.add_patch(patches.Rectangle((-5,rect[i]), 10, size_rect[i], alpha=0.3, facecolor="green", linewidth=1, linestyle='solid'))
        plt.axhline(obj[i], color='black', alpha=0.3)
        plt.title(cols[i])
        ax.set_ylabel('')    
        ax.set_xlabel('')

In [13]:
def get_fair_metrics_and_plot(data, model, plot=False, model_aif=False):
    pred = model.predict(data).labels if model_aif else model.predict(data.features)
    # fair_metrics function available in the metrics.py file
    fair = fair_metrics(data, pred)

    if plot:
        # plot_fair_metrics function available in the visualisations.py file
        # The visualisation of this function is inspired by the dashboard on the demo of IBM aif360 
        plot_fair_metrics(fair)
        display(fair)
    
    return fair

In [14]:
#print(X)


#combine_final = [train_df, test_df]
#result = pd.concat(combine_final)
#print(result.ifany())
#print(result)
privileged_groups = [{'sex': 1}]
unprivileged_groups = [{'sex': 0}]
dataset_orig = StandardDataset(data_new,
                                  label_name='income',
                                  protected_attribute_names=['sex'],
                                  favorable_classes=[1],
                                  privileged_classes=[[1]])

#metric_orig_train = BinaryLabelDatasetMetric(dataset_orig, 
#                                             unprivileged_groups=unprivileged_groups,
#                                             privileged_groups=privileged_groups)
#display(Markdown("#### Original training dataset"))
#print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())


In [15]:
metric_orig_train = BinaryLabelDatasetMetric(dataset_orig, 
                                             unprivileged_groups=unprivileged_groups,
                                             privileged_groups=privileged_groups)
display(Markdown("#### Original training dataset"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % metric_orig_train.mean_difference())

#### Original training dataset

Difference in mean outcomes between unprivileged and privileged groups = -0.196276


In [16]:
import ipynbname
nb_fname = ipynbname.name()
nb_path = ipynbname.path()

from sklearn.ensemble import GradientBoostingClassifier
import pickle

data_orig_train, data_orig_test = dataset_orig.split([0.7], shuffle=True)
X_train = data_orig_train.features
y_train = data_orig_train.labels.ravel()

X_test = data_orig_test.features
y_test = data_orig_test.labels.ravel()
num_estimators = 100

model = GradientBoostingClassifier(loss='deviance', n_estimators=1, learning_rate=1.0,max_depth=2, random_state=0)

mdl = model.fit(X_train, y_train)
with open('../../Results/GBC/' + nb_fname + '.pkl', 'wb') as f:
        pickle.dump(mdl, f)

with open('../../Results/GBC/' + nb_fname + '_Train' + '.pkl', 'wb') as f:
    pickle.dump(data_orig_train, f) 
    
with open('../../Results/GBC/' + nb_fname + '_Test' + '.pkl', 'wb') as f:
    pickle.dump(data_orig_test, f) 

In [17]:
from csv import writer
from sklearn.metrics import accuracy_score, f1_score

final_metrics = []
accuracy = []
f1= []

for i in range(1,num_estimators+1):
    
    model = GradientBoostingClassifier(n_estimators= i, learning_rate=1.0,max_depth=2, random_state=0, loss='deviance')
    mdl = model.fit(X_train, y_train)
    yy = mdl.predict(X_test)
    accuracy.append(accuracy_score(y_test, yy))
    f1.append(f1_score(y_test, yy))
    fair = get_fair_metrics_and_plot(data_orig_test, mdl)                           
    fair_list = fair.iloc[1].tolist()
    fair_list.insert(0, i)
    final_metrics.append(fair_list)


In [18]:
import numpy as np
final_result = pd.DataFrame(final_metrics)
final_result[4] = np.log(final_result[4])
final_result = final_result.transpose()
final_result.loc[0] = f1  # add f1 and acc to df
acc = pd.DataFrame(accuracy).transpose()
acc = acc.rename(index={0: 'accuracy'})
final_result = pd.concat([acc,final_result])
final_result = final_result.rename(index={0: 'f1', 1: 'statistical_parity_difference', 2: 'equal_opportunity_difference', 3: 'average_abs_odds_difference', 4: 'disparate_impact', 5: 'theil_index'})
final_result.columns = ['T' + str(col) for col in final_result.columns]
final_result.insert(0, "classifier", final_result['T' + str(num_estimators - 1)])   ##Add final metrics add the beginning of the df
final_result.to_csv('../../Results/GBC/' + nb_fname + '.csv')
final_result

Unnamed: 0,classifier,T0,T1,T2,T3,T4,T5,T6,T7,T8,...,T90,T91,T92,T93,T94,T95,T96,T97,T98,T99
accuracy,0.869997,0.821067,0.839595,0.846453,0.843587,0.843075,0.844099,0.845225,0.847067,0.84891,...,0.868871,0.868871,0.868666,0.868871,0.868769,0.868769,0.868564,0.869178,0.86928,0.869997
f1,0.706968,0.523186,0.604792,0.62908,0.611986,0.613367,0.611975,0.617796,0.625564,0.63247,...,0.705043,0.704907,0.7039,0.704907,0.704744,0.704472,0.703875,0.705801,0.705557,0.706968
statistical_parity_difference,-0.183827,-0.166884,-0.14383,-0.150066,-0.182743,-0.185535,-0.17982,-0.16169,-0.15273,-0.153993,...,-0.187507,-0.187657,-0.187339,-0.187657,-0.187811,-0.183986,-0.18551,-0.184453,-0.18429,-0.183827
equal_opportunity_difference,-0.072391,-0.246539,-0.056392,-0.060173,-0.238734,-0.227913,-0.223696,-0.133076,-0.077557,-0.082003,...,-0.084936,-0.087698,-0.091728,-0.087698,-0.087698,-0.076918,-0.085704,-0.072889,-0.071394,-0.072391
average_abs_odds_difference,0.070864,0.160599,0.056297,0.058011,0.154799,0.151742,0.146839,0.097278,0.067588,0.069127,...,0.078537,0.079806,0.081486,0.079806,0.079918,0.073174,0.07785,0.071554,0.070924,0.070864
disparate_impact,-1.199121,-2.142558,-1.135429,-1.125965,-1.727185,-1.7209,-1.701447,-1.366819,-1.211665,-1.19798,...,-1.226642,-1.229974,-1.234349,-1.229974,-1.230556,-1.197055,-1.21361,-1.196169,-1.200879,-1.199121
theil_index,0.106252,0.17003,0.14353,0.135172,0.141621,0.140911,0.14176,0.139657,0.136899,0.134469,...,0.106709,0.106788,0.107254,0.106788,0.106822,0.106981,0.107209,0.106447,0.106651,0.106252
