### Preprocessing

In [45]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk

original_data_train = pd.read_csv(
    "adult.data",
    names=[
        "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "OrigEthn", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"],
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")

original_data_test = pd.read_csv(
    "adult.test",
    names=[
        "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "OrigEthn", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"],
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")


original_data = pd.concat([original_data_test,original_data_train])
original_data.reset_index(inplace = True, drop = True)

original_data.tail()



Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,OrigEthn,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
48838,27,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
48839,40,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
48840,58,Private,151910.0,HS-grad,9.0,Widowed,Adm-clerical,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
48841,22,Private,201490.0,HS-grad,9.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,20.0,United-States,<=50K
48842,52,Self-emp-inc,287927.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024.0,0.0,40.0,United-States,>50K


In [46]:
data=original_data.copy()


data['Child'] = np.where(data['Relationship']=='Own-child', 'ChildYes', 'ChildNo')
data['OrigEthn'] = np.where(data['OrigEthn']=='White', 'CaucYes', 'CaucNo')

data=data.drop(columns=['fnlwgt','Relationship','Country','Education'])

data=data.replace('<=50K.','<=50K')
data=data.replace('>50K.','>50K')

data.tail()


Unnamed: 0,Age,Workclass,Education-Num,Martial Status,Occupation,OrigEthn,Sex,Capital Gain,Capital Loss,Hours per week,Target,Child
48838,27,Private,12.0,Married-civ-spouse,Tech-support,CaucYes,Female,0.0,0.0,38.0,<=50K,ChildNo
48839,40,Private,9.0,Married-civ-spouse,Machine-op-inspct,CaucYes,Male,0.0,0.0,40.0,>50K,ChildNo
48840,58,Private,9.0,Widowed,Adm-clerical,CaucYes,Female,0.0,0.0,40.0,<=50K,ChildNo
48841,22,Private,9.0,Never-married,Adm-clerical,CaucYes,Male,0.0,0.0,20.0,<=50K,ChildYes
48842,52,Self-emp-inc,9.0,Married-civ-spouse,Exec-managerial,CaucYes,Female,15024.0,0.0,40.0,>50K,ChildNo


In [47]:
data_ohe=data.copy()

data_ohe['Target'] = np.where(data_ohe['Target']=='>50K', 1., 0.)
print(' -> In column Target: label >50K gets 1.')

data_ohe['OrigEthn'] = np.where(data_ohe['OrigEthn']=='CaucYes', 1., 0.)
print(' -> In column '+str('OrigEthn')+': label '+str('CaucYes')+' gets 1.')

data_ohe['Sex'] = np.where(data_ohe['Sex']=='Male', 1., 0.)
print(' -> In column '+str('Sex')+': label '+str('Male')+' gets 1.')

for col in ['Workclass', 'Martial Status', 'Occupation', 'Child']:
    if len(set(list(data_ohe[col])))==2:
        LabelThatGets1=data_ohe[col][0]
        data_ohe[col] = np.where(data_ohe[col]==LabelThatGets1, 1., 0.)
        print(' -> In column '+str(col)+': label '+str(LabelThatGets1)+' gets 1.')
    else:
        print(' -> In column '+str(col)+': one-hot encoding conversion with labels '+str(set(list(data_ohe[col]))))
        data_ohe=pd.get_dummies(data_ohe,prefix=[col],columns=[col])

data_ohe.tail()

 -> In column Target: label >50K gets 1.
 -> In column OrigEthn: label CaucYes gets 1.
 -> In column Sex: label Male gets 1.
 -> In column Workclass: one-hot encoding conversion with labels {'State-gov', nan, 'Federal-gov', 'Self-emp-inc', 'Self-emp-not-inc', 'Local-gov', 'Never-worked', 'Private', 'Without-pay', None}
 -> In column Martial Status: one-hot encoding conversion with labels {'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse', None, 'Separated', 'Divorced', 'Never-married', 'Widowed'}
 -> In column Occupation: one-hot encoding conversion with labels {'Priv-house-serv', nan, 'Transport-moving', 'Craft-repair', 'Tech-support', 'Farming-fishing', None, 'Adm-clerical', 'Protective-serv', 'Exec-managerial', 'Machine-op-inspct', 'Other-service', 'Armed-Forces', 'Handlers-cleaners', 'Prof-specialty', 'Sales'}
 -> In column Child: label ChildNo gets 1.


Unnamed: 0,Age,Education-Num,OrigEthn,Sex,Capital Gain,Capital Loss,Hours per week,Target,Child,Workclass_Federal-gov,...,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving
48838,27,12.0,1.0,0.0,0.0,0.0,38.0,0.0,1.0,0,...,0,0,0,0,0,0,0,0,1,0
48839,40,9.0,1.0,1.0,0.0,0.0,40.0,1.0,1.0,0,...,0,0,1,0,0,0,0,0,0,0
48840,58,9.0,1.0,0.0,0.0,0.0,40.0,0.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
48841,22,9.0,1.0,1.0,0.0,0.0,20.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
48842,52,9.0,1.0,0.0,15024.0,0.0,40.0,1.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
data_ohe=data_ohe.iloc[1:]

Finally extract the input and output (X and y) matrices as np.arrays for further analyses using sklearn, and split them into a learning and test sample. 

In [49]:
#extract the X and y np.arrays
y=data_ohe['Target'].values.reshape(-1,1)

data_ohe_wo_target=data_ohe.drop(columns=['Target'])

X_col_names=list(data_ohe_wo_target.columns)
X=data_ohe_wo_target.values


#split the learning and test samples
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

#print the np.array shapes 
print('n_train=',X_train.shape[0])
print('n_test=',X_test.shape[0])
print('p=',X_test.shape[1])

#center-reduce the arrays X_train and X_test to make sure all variables have the same scale
X_train=sk.preprocessing.scale(X_train)
X_test_NoScaling=X_test.copy()
X_test=sk.preprocessing.scale(X_test)

n_train= 32724
n_test= 16118
p= 37



### Classification with decision tree

We now train a <i>Decsion tree</i> on the data, which has the interest to be straightforwardly interpretable.


In [50]:
from sklearn.tree import DecisionTreeClassifier

clf_DT=DecisionTreeClassifier(max_depth=5)
clf_DT.fit(X_train,y_train.ravel())

y_test_pred_DT = clf_DT.predict(X_test)

#*** Uncomment the three raws below to see the decision rules ***
#from sklearn import tree
#dot_data = tree.export_graphviz(DTC_clf)
#print(dot_data)

Let's see now the prediction accuracy

In [51]:
acc=accuracy_score(y_test.ravel(),y_test_pred_DT.ravel())
tpr=np.sum((y_test_pred_DT.ravel()==1)*(y_test.ravel()==1)) / np.sum(y_test_pred_DT.ravel()==1)
tnr=np.sum((y_test_pred_DT.ravel()==0)*(y_test.ravel()==0)) / np.sum(y_test_pred_DT.ravel()==0)

cm = metrics.confusion_matrix(y_test.ravel(),y_test_pred_DT.ravel(),labels=[0,1])

print("\nAccuracy =",acc)
print("True positive rate =",tpr)  #Rem: Equivalent to metrics.precision_score(y_test.ravel(), y_test_pred.ravel(), pos_label=1.)
print("True negative rate =",tnr)
print('\nConfusion matrix =')
print(cm)


Accuracy = 0.8493609628986226
True positive rate = 0.7537207654145995
True negative rate = 0.8696600481347774

Confusion matrix =
[[11563   695]
 [ 1733  2127]]


This is the best average result so far. Remark however that altough the prediction are clearly the best for the obesrvations with y=0, they are clearly poor for y=1, which contains about 24&percnt; of the observations. There are even more false positive than true positive predictions.

### Classfication with positive discrimination

In [52]:
import random
def discriminate(X,y, p):
    n=len(y.ravel())
    nb_changes=int(n*p)
    # Randomly select and change the values
    indices_to_change = random.sample(range(n), nb_changes)
    z=y
    for idx in indices_to_change:
        if X[idx][X_col_names.index('Sex')]<0 and z[idx]==0.0:
            z[idx] = 1 - z[idx]
    return z



In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

# Define the number of splits (k) for cross-validation
num_splits = 10
disc_rate=0.0 #1 for full discrimination, 0 for no discrimination 
known_DI=0.35
p=0
C=1.5
clf_disc=DecisionTreeClassifier(max_depth=5)

S=X_test_NoScaling[:,X_col_names.index('Sex')].ravel()




# Create a KFold object to split the data
kf = KFold(n_splits=num_splits, shuffle=True, random_state=41)

# Split your data and train the model on each split
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train.ravel()[train_index], y_train.ravel()[test_index]


    # Train the model on the current fold
    clf_disc.fit(X_train_fold, discriminate(X_train_fold, y_train_fold,p))
    y_test_pred_fold = clf_disc.predict(X_test_fold)
    acc=accuracy_score(y_test_fold.ravel(),y_test_pred_fold.ravel())
    tpr=np.sum((y_test_pred_fold.ravel()==1)*(y_test_fold.ravel()==1)) / np.sum(y_test_pred_fold.ravel()==1)
    tnr=np.sum((y_test_pred_fold.ravel()==0)*(y_test_fold.ravel()==0)) / np.sum(y_test_pred_fold.ravel()==0)

    cm = metrics.confusion_matrix(y_test_fold.ravel(),y_test_pred_fold.ravel(),labels=[0,1])

    print("\nAccuracy =",acc)
    print("True positive rate =",tpr)  #Rem: Equivalent to metrics.precision_score(y_test.ravel(), y_test_pred.ravel(), pos_label=1.)
    print("True negative rate =",tnr)
    print('\nConfusion matrix =')
    print(cm)

    S=X_test_fold[:,X_col_names.index('Sex')].ravel()
    DI=len([i for i in range(len(S)) if (y_test_pred_fold[i]==1.0 and S[i]<0)])/ len([i for i in range(len(S)) if (y_test_pred_fold[i]==1.0 and S[i]>0)])
    print('DI:',DI)
    if (known_DI-DI)*C>0 and (known_DI-DI)*C<1:
        p=(known_DI-DI)*C
        print(p)
    

    
print('-------------------TOTAL----------------------')

from sklearn.tree import DecisionTreeClassifier
y_test_pred_disc= clf_disc.predict(X_test)

acc=accuracy_score(y_test.ravel(),y_test_pred_disc.ravel())
tpr=np.sum((y_test_pred_disc.ravel()==1)*(y_test.ravel()==1)) / np.sum(y_test_pred_disc.ravel()==1)
tnr=np.sum((y_test_pred_disc.ravel()==0)*(y_test.ravel()==0)) / np.sum(y_test_pred_disc.ravel()==0)

cm = metrics.confusion_matrix(y_test.ravel(),y_test_pred_disc.ravel(),labels=[0,1])

print("\nAccuracy =",acc)
print("True positive rate =",tpr)  #Rem: Equivalent to metrics.precision_score(y_test.ravel(), y_test_pred.ravel(), pos_label=1.)
print("True negative rate =",tnr)
print('\nConfusion matrix =')
print(cm)


S=X_test_NoScaling[:,X_col_names.index('Sex')].ravel()

y_test_pred_disc = clf_disc.predict(X_test).ravel()
DI=len([i for i in range(len(S)) if (y_test_pred_disc[i]==1 and S[i]==0)])/ len([i for i in range(len(S)) if (y_test_pred_disc[i]==1 and S[i]==1)])


print('DI y_test_pred_disc =',DI)



Accuracy = 0.8548732050106935
True positive rate = 0.7545454545454545
True negative rate = 0.8751377157546824

Confusion matrix =
[[2383  135]
 [ 340  415]]
DI: 0.17270788912579957
0.26593816631130063

Accuracy = 0.8463183623586923
True positive rate = 0.7407407407407407
True negative rate = 0.8659420289855072

Confusion matrix =
[[2390  133]
 [ 370  380]]
DI: 0.14
0.31499999999999995

Accuracy = 0.84967919340055
True positive rate = 0.7544483985765125
True negative rate = 0.8694208779048321

Confusion matrix =
[[2357  138]
 [ 354  424]]
DI: 0.1446028513238289
0.3080957230142566

Accuracy = 0.8414298808432631
True positive rate = 0.7152899824253075
True negative rate = 0.8679733727810651

Confusion matrix =
[[2347  162]
 [ 357  407]]
DI: 0.1565040650406504
0.29024390243902437

Accuracy = 0.8533007334963325
True positive rate = 0.7565217391304347
True negative rate = 0.8739340007415647

Confusion matrix =
[[2357  140]
 [ 340  435]]
DI: 0.12524461839530332
0.337133072407045

Accuracy = 