In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch
from sklearn.model_selection import KFold, StratifiedKFold
import math

In [26]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
import pandas as pd
from utils import ConfusionMatrix, Prf1a
classifiers = [
#     KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_jobs=4),
    AdaBoostClassifier()
]

In [3]:
file = 'adult/adult.csv'
kf = KFold(n_splits=10, shuffle=True)

In [4]:
def run(clf, X_train, Y_train, X_test, Y_tes, num_class):
    clf.fit(X_train, y_train)
    train_predictions = clf.predict(X_test)
    if num_class==2:
        sc = Prf1a()
    else:
        sc = ConfusionMatrix(num_class)
    sc.add(torch.FloatTensor(train_predictions), y_test)
    train_predictions = clf.predict_proba(X_test)
    return sc

In [5]:
df = pd.read_csv(file, header=0)
df = df.dropna()
df.shape

(45222, 15)

In [6]:
np.unique(df['class'], return_counts=True)

(array(['<=50K', '>50K'], dtype=object), array([34014, 11208]))

In [7]:
df['class']=df['class'].map({'<=50K': 0, '>50K': 1}).astype(int)

In [8]:
for c in df.columns:
    print(c, ': ', len(df[c].unique()))

age :  5
workclass :  7
fnlwgt :  26741
education :  16
education-num :  16
marital-status :  7
occupation :  14
relationship :  6
race :  5
sex :  2
capitalgain :  5
capitalloss :  5
hoursperweek :  5
native-country :  41
class :  2


### Delete redundant and noisy features

In [9]:
del df["relationship"]
del df['fnlwgt']
del df['education-num']

### Feature engineering

In [10]:
print('Before: ', df.workclass.unique())
df.workclass = df.workclass.replace({'Self-emp-not-inc': 'Self-emp',
                                    'Self-emp-inc': 'Self-emp',
                                    'Local-gov': 'Gov',
                                    'Federal-gov': 'Gov',
                                    'State-gov': 'Gov'})
print('After: ', df.workclass.unique())
le = LabelEncoder()
df['workclass'] = le.fit_transform(df['workclass'])

Before:  ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov'
 'Self-emp-inc' 'Without-pay']
After:  ['Gov' 'Self-emp' 'Private' 'Without-pay']


In [11]:
print('Before: ', df.education.unique())
df.education = df.education.replace({'Preschool': 'No-school',
                                        '1st-4th': 'No-school',
                                        '5th-6th': 'No-school',
                                        '7th-8th': 'No-school',
                                        '9th': 'No-school',
                                        '10th': 'No-school',
                                        '11th': 'No-school',
                                        '12th': 'No-school',
                                        'Some-college': 'College',
                                        'Assoc-voc': 'College',
                                        'Assoc-acdm': 'College'})
print('After: ', df.education.unique())

Before:  ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 '7th-8th' 'Doctorate' 'Assoc-voc' 'Prof-school' '5th-6th' '10th'
 'Preschool' '12th' '1st-4th']
After:  ['Bachelors' 'HS-grad' 'No-school' 'Masters' 'College' 'Doctorate'
 'Prof-school']


In [12]:
print('Before: ', df['marital-status'].unique())
df['marital-status'].replace(['Married-civ-spouse'], 'Married', inplace=True)
df['marital-status'].replace('Never-married', 'Not-married', inplace=True)
df['marital-status'].replace(['Divorced'], 'Separated', inplace=True)
df['marital-status'].replace(['Separated'], 'Separated', inplace=True)
df['marital-status'].replace(['Married-spouse-absent'], 'Not-married', inplace=True)
df['marital-status'].replace(['Married-AF-spouse'], 'Married', inplace=True)
print('After: ', df['marital-status'].unique())

Before:  ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
After:  ['Not-married' 'Married' 'Separated' 'Widowed']


In [13]:
print('Before: ', df['native-country'].unique())
df['native-country'].replace(['United-States'], 'N-America', inplace=True)
df['native-country'].replace(['Mexico'], 'N-America', inplace=True)
df['native-country'].replace(['Philippines'], 'Asia', inplace=True)
df['native-country'].replace(['Germany'], 'Europe', inplace=True)
df['native-country'].replace(['Puerto-Rico'], 'N-America', inplace=True)
df['native-country'].replace(['Canada'], 'N-America', inplace=True)
df['native-country'].replace(['India'], 'Asia', inplace=True)
df['native-country'].replace(['El-Salvador'], 'MS-America', inplace=True)
df['native-country'].replace(['Cuba'], 'MS-America', inplace=True)
df['native-country'].replace(['England'], 'Europe', inplace=True)
df['native-country'].replace(['Jamaica'], 'MS-America', inplace=True)
df['native-country'].replace(['Italy'], 'Europe', inplace=True)

df['native-country'].replace(['China'], 'Asia', inplace=True)
df['native-country'].replace(['Dominican-Republic'], 'MS-America', inplace=True)
df['native-country'].replace(['Vietnam'], 'Asia', inplace=True)
df['native-country'].replace(['Guatemala'], 'MS-America', inplace=True)
df['native-country'].replace(['Japan'], 'Asia', inplace=True)
df['native-country'].replace(['Columbia'], 'MS-America', inplace=True)
df['native-country'].replace(['Poland'], 'Europe', inplace=True)
df['native-country'].replace(['Taiwan'], 'Asia', inplace=True)
df['native-country'].replace(['Haiti'], 'MS-America', inplace=True)
df['native-country'].replace(['Iran'], 'Asia', inplace=True)
df['native-country'].replace(['Portugal'], 'Europe', inplace=True)
df['native-country'].replace(['Nicaragua'], 'MS-America', inplace=True)

df['native-country'].replace(['Peru'], 'MS-America', inplace=True)
df['native-country'].replace(['Greece'], 'Europe', inplace=True)
df['native-country'].replace(['Ecuador'], 'MS-America', inplace=True)
df['native-country'].replace(['France'], 'Europe', inplace=True)
df['native-country'].replace(['Ireland'], 'Europe', inplace=True)
df['native-country'].replace(['Hong'], 'Asia', inplace=True)
df['native-country'].replace(['Trinadad&Tobago'], 'MS-America', inplace=True)
df['native-country'].replace(['Cambodia'], 'Asia', inplace=True)
df['native-country'].replace(['Laos'], 'Asia', inplace=True)
df['native-country'].replace(['Thailand'], 'Asia', inplace=True)
df['native-country'].replace(['Yugoslavia'], 'Europe', inplace=True)
df['native-country'].replace(['Outlying-US(Guam-USVI-etc)'], 'N-America', inplace=True)

df['native-country'].replace(['Hungary'], 'Europe', inplace=True)
df['native-country'].replace(['Honduras'], 'MS-America', inplace=True)
df['native-country'].replace(['Scotland'], 'Europe', inplace=True)
df['native-country'].replace(['Holand-Netherlands'], 'Europe', inplace=True)
print('After: ', df['native-country'].unique())

Before:  ['United-States' 'Cuba' 'Jamaica' 'India' 'Mexico' 'Puerto-Rico'
 'Honduras' 'England' 'Canada' 'Germany' 'Iran' 'Philippines' 'Poland'
 'Columbia' 'Cambodia' 'Thailand' 'Ecuador' 'Laos' 'Taiwan' 'Haiti'
 'Portugal' 'Dominican-Republic' 'El-Salvador' 'France' 'Guatemala'
 'Italy' 'China' 'South' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands']
After:  ['N-America' 'MS-America' 'Asia' 'Europe' 'South']


In [14]:
# for c in df.columns:
#     print(f'{c}: {df[c].unique()}')
#     print(df[c].value_counts())
#     print('------------------------------')

In [15]:
obj_cols = set(df.select_dtypes(['object']).columns)
df_obj = df[obj_cols]
df_obj = pd.get_dummies(df_obj)

In [16]:
df.drop(obj_cols, axis=1, inplace=True)

In [17]:
df = df.join(df_obj)

In [18]:
x_col = set(df.columns)-{'workclass', 'class'}
df_x = df[x_col]
y_class = df['class'].values
y_workclass = df['workclass'].values

### Remove Label correlation to make prediction of attribute 'class' secure

In [19]:
scale = 0
c0, c1 = df['class'].value_counts()
print(c0, c1)

34014 11208


In [29]:
eps = 3
sensitivity = 2
df_secure = df_x.copy()
for i in df_secure.index:
    noise = np.random.laplace(scale=sensitivity/eps, size=df_secure.shape[1])
    df_secure.loc[i] += noise

In [30]:
df_secure

Unnamed: 0,education_Bachelors,marital-status_Married,occupation_Other-service,race_Asian-Pac-Islander,education_Prof-school,occupation_Exec-managerial,native-country_N-America,education_HS-grad,occupation_Priv-house-serv,native-country_South,...,capitalgain,education_Masters,race_Black,occupation_Protective-serv,occupation_Farming-fishing,occupation_Sales,sex_Female,education_College,native-country_Europe,race_Other
0,-0.313635,0.033605,-1.185084,0.529278,0.353263,-0.307092,1.837184,0.156673,0.423879,0.251714,...,1.244420,-0.548422,0.401922,0.802008,-0.048937,-0.293142,0.101156,1.244124,-3.603506,-0.338883
1,0.774608,1.410656,0.573068,-0.369875,-1.919953,0.485994,1.804345,0.862456,-0.954663,-0.085591,...,-1.874139,-1.488267,-0.034608,-0.355510,0.167512,0.084091,0.131262,3.215408,-0.297249,-0.780975
2,0.628342,-0.322836,-0.307314,0.117459,-0.496696,0.381473,2.370519,1.903539,-0.715694,-1.544771,...,-0.129965,-0.036158,-0.042146,-0.175023,0.071308,-0.080041,0.042476,-0.044091,0.440796,-0.362564
3,-0.715146,0.949998,0.588942,-0.016651,1.626161,-0.549686,0.745611,-3.271873,0.259833,-0.686063,...,-0.345936,-0.462371,0.283020,1.365933,-0.950496,0.559075,-0.711752,0.105266,0.026384,-0.578541
4,-0.307209,3.877557,-3.472104,-0.020315,0.180138,1.295997,1.657874,1.691695,0.203747,2.524074,...,-0.560958,3.161730,3.123375,-0.299960,-0.072651,-1.502264,2.032647,1.511763,0.212223,1.908018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,1.895856,0.927891,1.272068,-1.595894,-0.426692,0.112955,1.963607,1.369937,-0.799789,-1.526794,...,0.008449,-2.665657,0.504831,0.901191,2.005519,0.403613,-0.123149,0.063848,1.291549,1.853975
48837,1.308966,1.232278,0.395574,0.151498,0.855282,0.104499,1.269222,-0.024361,0.090931,0.732963,...,-0.234699,0.388488,-0.858319,0.445266,-1.127949,-0.800309,0.900829,-0.581643,0.066518,1.030327
48839,0.774205,1.104395,-0.396411,0.273290,-0.305528,0.695799,2.042440,-0.159683,-0.994668,0.771261,...,0.465283,0.169340,-0.897602,0.251667,1.847485,-0.825958,1.072405,1.981724,-1.296962,0.090104
48840,1.261847,0.057213,-0.450108,-0.316920,1.189147,0.874049,1.250558,-0.491732,1.038334,-0.049434,...,0.632075,-1.707709,0.433547,0.657922,0.293270,-2.148834,-0.935533,1.676973,0.524318,0.166844


# Classification

In [31]:
num_class = 2
X = torch.Tensor(df_secure.values)
Y = torch.Tensor(y_class if num_class==2 else y_workclass)
for clf in classifiers:
    if num_class==2:
        sc = Prf1a()
    else:
         sc = ConfusionMatrix(num_class)
    name = clf.__class__.__name__
    for sp in kf.split(X):
        x_train, y_train = X[sp[0]], Y[sp[0]]
        x_test, y_test = X[sp[1]], Y[sp[1]]
        _sc = run(clf, x_train, y_train, x_test, y_test, num_class)
        sc.accumulate(_sc)
    print(name, [round(i, 4) for i in sc.prfa()])

DecisionTreeClassifier [0.4022, 0.4174, 0.4097, 0.7018]


KeyboardInterrupt: 

### Base scores on classifying attribute 'class'
- KNeighborsClassifier [0.6565, 0.5988, 0.6263, 0.8229]
- DecisionTreeClassifier [0.6994, 0.588, 0.6389, 0.8352]
- RandomForestClassifier [0.7061, 0.6085, 0.6537, 0.8402]
- AdaBoostClassifier [0.7399, 0.6057, 0.6661, 0.8495]

### Base scores on classifying attribute 'workclass'
- KNeighborsClassifier [0.2038, 0.2221, 0.2125, 0.704]
- DecisionTreeClassifier [0.2041, 0.2358, 0.2188, 0.7242]
- RandomForestClassifier [0.2041, 0.251, 0.2251, 0.7429]
- AdaBoostClassifier [0.1858, 0.2619, 0.2174, 0.7481]

### Secure_x Score on classifying attribute 'class'
- KNeighborsClassifier [0.2394, 0.1456, 0.1811, 0.6736]
- DecisionTreeClassifier [0.2454, 0.0865, 0.1279, 0.7076]
- RandomForestClassifier [0.2487, 0.0662, 0.1046, 0.719]
- AdaBoostClassifier [0.0, 0.0, 0.0, 0.7522]

### Secure_x Score on classifying attribute 'workclass'
- KNeighborsClassifier [0.1429, 0.1437, 0.1433, 0.6375]
- DecisionTreeClassifier [0.1425, 0.1417, 0.1421, 0.6675]
- RandomForestClassifier [0.1416, 0.1371, 0.1393, 0.7015]
- AdaBoostClassifier [0.1428, 0.1052, 0.1212, 0.7364]

In [None]:
StratifiedKFold()