In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from copy import deepcopy

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_validate
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

### Read datasets

In [2]:
# Column names for the Adult CSV files, which are read without a header row;
# the last column ("outcome") is the label.
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "outcome",
]

# Both splits are parsed with identical options. String fields keep the
# leading space that follows each comma in the raw files (e.g. " Private"),
# which is why later cells match on values that start with a space.
read_options = dict(names=columns, index_col=False)
orig_train = pd.read_csv("datasets/adult/adult.data", **read_options)
orig_test = pd.read_csv("datasets/adult/adult.test", **read_options)

In [3]:
# Preview the first five raw training rows.
orig_train.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,outcome
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Preview the first five raw test rows.
orig_test.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,outcome
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


### Drop a few columns

In [5]:
# drop columns
# Remove three columns from both splits. `drop` returns new frames, so
# orig_train / orig_test keep the full set of columns.
unused_columns = ["education-num", "fnlwgt", "relationship"]
train = orig_train.drop(columns=unused_columns)
test = orig_test.drop(columns=unused_columns)

### Modify workclass column

In [6]:
# Both splits go through the same recoding; `dfs` is also iterated by the
# occupation and marital-status cells that follow.
dfs = [train, test]

# Collapse the raw workclass categories into coarser groups with a single
# mapping (one pass over the column instead of one pass per category).
# Keys keep the leading space present in the raw CSV fields. Values that are
# not listed (e.g. ' Private') are left unchanged.
workclass_groups = {
    ' Federal-gov': ' Government',
    ' Local-gov': ' Government',
    ' State-gov': ' Government',
    ' Self-emp-inc': ' Self-Employed',
    ' Self-emp-not-inc': ' Self-Employed',
    ' Never-worked': ' Other/Unknown',
    ' Without-pay': ' Other/Unknown',
    ' Other': ' Other/Unknown',
    ' ?': ' Other/Unknown',
}

for data in dfs:
    data["workclass"] = data["workclass"].replace(workclass_groups)

    # Sanity check: remaining categories and their counts.
    print(np.unique(data['workclass'], return_counts=True))

(array([' Government', ' Other/Unknown', ' Private', ' Self-Employed'],
      dtype=object), array([ 4351,  1857, 22696,  3657]))
(array([' Government', ' Other/Unknown', ' Private', ' Self-Employed'],
      dtype=object), array([ 2198,   973, 11210,  1900]))


### Modify occupation class

In [7]:
# Collapse the raw occupation categories into coarser groups with a single
# mapping (one pass over the column instead of one pass per category).
# Keys keep the leading space present in the raw CSV fields. Values that are
# not listed (e.g. ' Sales') are left unchanged.
occupation_groups = {
    ' Adm-clerical': ' White-Collar',
    ' Exec-managerial': ' White-Collar',
    ' Craft-repair': ' Blue-Collar',
    ' Farming-fishing': ' Blue-Collar',
    ' Handlers-cleaners': ' Blue-Collar',
    ' Machine-op-inspct': ' Blue-Collar',
    ' Transport-moving': ' Blue-Collar',
    ' Other-service': ' Service',
    ' Priv-house-serv': ' Service',
    ' Protective-serv': ' Service',
    ' Tech-support': ' Service',
    ' Prof-specialty': ' Professional',
    ' ?': ' Other/Unknown',
    ' Armed-Forces': ' Other/Unknown',
}

for data in dfs:
    data["occupation"] = data["occupation"].replace(occupation_groups)

    # Sanity check: remaining categories and their counts.
    print(np.unique(data["occupation"], return_counts=True))

(array([' Blue-Collar', ' Other/Unknown', ' Professional', ' Sales',
       ' Service', ' White-Collar'], dtype=object), array([10062,  1852,  4140,  3650,  5021,  7836]))
(array([' Blue-Collar', ' Other/Unknown', ' Professional', ' Sales',
       ' Service', ' White-Collar'], dtype=object), array([4989,  972, 2032, 1854, 2573, 3861]))


In [8]:
# Merge the three "married" variants and rename ' Never-married' to ' Single'
# with a single mapping. Keys keep the leading space present in the raw CSV
# fields; unlisted values (' Divorced', ' Separated', ' Widowed') are unchanged.
marital_status_groups = {
    ' Married-AF-spouse': ' Married',
    ' Married-civ-spouse': ' Married',
    ' Married-spouse-absent': ' Married',
    ' Never-married': ' Single',
}

for data in dfs:
    data["marital-status"] = data["marital-status"].replace(marital_status_groups)

    # Sanity check: remaining categories and their counts.
    print(np.unique(data["marital-status"], return_counts=True))

(array([' Divorced', ' Married', ' Separated', ' Single', ' Widowed'],
      dtype=object), array([ 4443, 15417,  1025, 10683,   993]))
(array([' Divorced', ' Married', ' Separated', ' Single', ' Widowed'],
      dtype=object), array([2190, 7627,  505, 5434,  525]))


### Drop a few more columns

In [9]:
# Drop three more columns from both splits. `drop` returns new frames, so
# after this cell `train` / `test` are rebound while the `dfs` list still
# refers to the frames from before the drop.
extra_unused_columns = ["capital-gain", "capital-loss", "native-country"]
train = train.drop(columns=extra_unused_columns)
test = test.drop(columns=extra_unused_columns)

In [10]:
# Preview the recoded training rows.
train.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,39,Government,Bachelors,Single,White-Collar,White,Male,40,<=50K
1,50,Self-Employed,Bachelors,Married,White-Collar,White,Male,13,<=50K
2,38,Private,HS-grad,Divorced,Blue-Collar,White,Male,40,<=50K
3,53,Private,11th,Married,Blue-Collar,Black,Male,40,<=50K
4,28,Private,Bachelors,Married,Professional,Black,Female,40,<=50K


In [11]:
# Preview the recoded test rows.
test.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,25,Private,11th,Single,Blue-Collar,Black,Male,40,<=50K.
1,38,Private,HS-grad,Married,Blue-Collar,White,Male,50,<=50K.
2,28,Government,Assoc-acdm,Married,Service,White,Male,40,>50K.
3,44,Private,Some-college,Married,Blue-Collar,Black,Male,40,>50K.
4,18,Other/Unknown,Some-college,Single,Other/Unknown,White,Female,30,<=50K.


### Discretize continuous columns

In [12]:
def _interval_labels(kbin, codes):
    """Turn ordinal bin codes into readable "lower-upper" strings.

    kbin  -- a fitted single-feature KBinsDiscretizer.
    codes -- 1-D integer array of bin indices produced by `kbin.transform`.

    Bin edges are truncated to integers for display, so e.g. an edge of 31.6
    is shown as "31".
    """
    edges = kbin.bin_edges_[0].astype(int)
    lower, upper = edges[codes], edges[codes + 1]
    return np.array([str(lo) + '-' + str(hi) for lo, hi in zip(lower, upper)])


disc_train = train.copy()
disc_test = test.copy()
cont_columns = ['age', 'hours-per-week']
kbins = {}  # fitted discretizer per continuous column

for col in cont_columns:
    # Learn the bin edges from the training split only, so that no
    # information from the test split leaks into preprocessing.
    # NOTE(review): test values outside the training range are expected to
    # fall into the outermost bins -- confirm for the installed scikit-learn.
    kbin = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    kbin.fit(disc_train[col].to_numpy().reshape(-1, 1))
    kbins[col] = kbin

    # Apply the same edges to both splits and store the readable interval.
    for frame in (disc_train, disc_test):
        codes = kbin.transform(frame[col].to_numpy().reshape(-1, 1)).astype(int).ravel()
        frame[col] = _interval_labels(kbin, codes)

In [13]:
# List the distinct interval labels per continuous column, for each split.
for col in cont_columns:
    for split_name, frame in (("Train", disc_train), ("Test", disc_test)):
        print(split_name + " " + col, np.unique(frame[col]))

Train age ['17-31' '31-46' '46-60' '60-75' '75-90']
Test age ['17-31' '31-46' '46-60' '60-75' '75-90']
Train hours-per-week ['1-20' '20-40' '40-59' '59-79' '79-99']
Test hours-per-week ['1-20' '20-40' '40-59' '59-79' '79-99']


In [14]:
# Persist the discretized training split (row index omitted), then preview it.
disc_train.to_csv("datasets/adult/train.data", index=False)
disc_train.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,31-46,Government,Bachelors,Single,White-Collar,White,Male,20-40,<=50K
1,46-60,Self-Employed,Bachelors,Married,White-Collar,White,Male,1-20,<=50K
2,31-46,Private,HS-grad,Divorced,Blue-Collar,White,Male,20-40,<=50K
3,46-60,Private,11th,Married,Blue-Collar,Black,Male,20-40,<=50K
4,17-31,Private,Bachelors,Married,Professional,Black,Female,20-40,<=50K


In [15]:
# Persist the discretized test split (row index omitted), then preview it.
disc_test.to_csv("datasets/adult/test.data", index=False)
disc_test.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,race,sex,hours-per-week,outcome
0,17-31,Private,11th,Single,Blue-Collar,Black,Male,20-40,<=50K.
1,31-46,Private,HS-grad,Married,Blue-Collar,White,Male,40-59,<=50K.
2,17-31,Government,Assoc-acdm,Married,Service,White,Male,20-40,>50K.
3,31-46,Private,Some-college,Married,Blue-Collar,Black,Male,20-40,>50K.
4,17-31,Other/Unknown,Some-college,Single,Other/Unknown,White,Female,20-40,<=50K.


### Train a classifier (linear SVM)

In [16]:
# transform categorialca variables into dummy variables
categorical_columns = ["age", "workclass", "marital-status", "occupation", "race", "sex", 
                       'education', "hours-per-week"]
ord_enc = OrdinalEncoder()

disc_train.loc[:, categorical_columns] = ord_enc.fit_transform(disc_train[categorical_columns]).astype(int)
disc_train = pd.get_dummies(disc_train, prefix=categorical_columns, columns=categorical_columns)
disc_train.loc[:, "outcome"] =  (disc_train["outcome"] == " >50K").astype(np.int)
disc_train.head()

Unnamed: 0,outcome,age_0,age_1,age_2,age_3,age_4,workclass_0,workclass_1,workclass_2,workclass_3,...,education_11,education_12,education_13,education_14,education_15,hours-per-week_0,hours-per-week_1,hours-per-week_2,hours-per-week_3,hours-per-week_4
0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [20]:
# separate features from outcomes
features_train = disc_train.drop(columns=["outcome"], axis=1)
outcome_train = disc_train["outcome"]
X_train, y_train = features_train.to_numpy(), outcome_train.to_numpy()

# train classifier
# clf = LogisticRegression(solver='lbfgs', max_iter=10000)
# clf = MLPClassifier(hidden_layer_sizes=64, max_iter=10000)
# clf = SVC(kernel='rbf', gamma='auto', probability=True)
clf = SVC(kernel='linear', probability=True)
scores = cross_validate(clf, X_train, y_train, cv=5, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
scores

{'fit_time': array([318.73951316, 315.56030774, 327.08639455, 291.24728608,
        289.77257085]),
 'score_time': array([ 6.77953267,  7.45691776,  6.01614189, 10.85619903,  9.76600099]),
 'test_accuracy': array([0.8180562 , 0.82693489, 0.82539926, 0.82939189, 0.82324939]),
 'test_roc_auc': array([0.87077879, 0.86693705, 0.87527708, 0.87899668, 0.87610653])}

kernel = 'rbf'
{'fit_time': array([345.36684418, 320.6666255 , 335.77289724, 346.85523701,
        339.51656628]),
 'score_time': array([10.94340038, 11.82032514, 11.35443139, 10.89360309, 11.17367387]),
 'test_accuracy': array([0.8172885 , 0.82570639, 0.82693489, 0.83077396, 0.82708845]),
 'test_roc_auc': array([0.87193643, 0.86836375, 0.87636671, 0.88104178, 0.87681265])}

### Calibration ????

In [18]:
# disc_test.loc[:, categorical_columns] = ord_enc.transform(disc_test[categorical_columns]).astype(int)
# disc_test = pd.get_dummies(disc_test, prefix=categorical_columns, columns=categorical_columns)
# disc_test.loc[:, "outcome"] =  (disc_test["outcome"] == " >50K.").astype(np.int)
# disc_test.head()

In [19]:
# features_test = disc_test.drop(columns=["outcome"], axis=1)
# outcome_test = disc_test["outcome"]

# X_test, y_test = features_test.to_numpy(), outcome_test.to_numpy()

# clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None)
# clf.fit(X_train, y_train)
# calib_clf = CalibratedClassifierCV(clf, method='sigmoid', cv='prefit').fit(X_test, y_test)

# prob_pos = calib_clf.predict_proba(X_train)[:, 1]
# # prob_pos = clf.predict_proba(X_train)[:, 1]
# fraction_of_positives, mean_predicted_value = calibration_curve(y_train, prob_pos, n_bins=10)

# fig = plt.figure(0, figsize=(10, 10))
# ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
# ax2 = plt.subplot2grid((3, 1), (2, 0))
# ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
# ax1.plot(mean_predicted_value, fraction_of_positives, "s-")
# ax2.hist(prob_pos, range=(0, 1), bins=10, histtype="step", lw=2)
# plt.tight_layout()