In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential

In [63]:
class Data:
    """Load the Adult dataset and separate it into features (X) and target (y).

    Attributes set by ``__init__``:
        X: one-hot encoded feature ``DataFrame`` with every column as float32.
        y: the ``income`` column, label-encoded to integers.
        y_encoder: the fitted ``LabelEncoder`` used to produce ``y``.
        random_state: the seed passed to ``__init__`` (``None`` = unseeded).
    """

    def __init__(self, path='data/adult.csv', random_state=None):
        """Read, shuffle, clean and encode the dataset.

        Args:
            path: location of the CSV file to load.
            random_state: optional seed for the row shuffle; it is also the
                default seed for ``train_test_split``. ``None`` keeps the
                original non-deterministic behaviour.
        """
        self.random_state = random_state

        df = shuffle(pd.read_csv(path), random_state=random_state)
        df = self.clean(df)

        # Separate the target column from the features.
        self.y = df.pop('income')
        self.X = df

        # Label encode y
        self.y_encoder = LabelEncoder()
        self.y = self.y_encoder.fit_transform(self.y)

        # One Hot encode X. get_dummies already converts every object /
        # category column, so no per-column dtype fix-up is needed afterwards.
        # Cast to float32 so the frame has one numeric dtype: newer pandas
        # emits boolean dummy columns, and a frame mixing bool and integer
        # columns does not convert to a single numeric array for Keras.
        self.X = pd.get_dummies(self.X).astype('float32')

    def clean(self, df):
        """Return a copy of ``df`` without missing rows and without ``fnlwgt``.

        Cells equal to ``'?'`` are treated as missing values; any row that
        contains a missing value is dropped. The input frame is not modified.
        """
        without_missing = df.replace('?', np.nan).dropna()
        return without_missing.drop('fnlwgt', axis=1)

    def train_test_split(self, test_size=0.15, random_state=None):
        """Split ``X`` / ``y`` into train and test parts.

        Args:
            test_size: fraction of rows held out for testing.
            random_state: seed for the split; falls back to the seed given to
                ``__init__`` when left as ``None``.

        Returns:
            ``X_train, X_test, y_train, y_test`` as returned by scikit-learn's
            ``train_test_split``.
        """
        if random_state is None:
            random_state = self.random_state
        return train_test_split(self.X, self.y,
                                test_size=test_size,
                                random_state=random_state)


In [55]:
class TrainingModel:
    """Feed-forward binary classifier built with Keras, plus reporting helpers."""

    def __init__(self, input_shape):
        """Build and compile the network.

        Args:
            input_shape: shape tuple of a single sample, e.g. ``(n_features,)``.
        """
        self.model = Sequential()
        self.model.add(Dense(64, activation='relu', input_shape=input_shape))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(128, activation='relu'))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(128, activation='relu'))
        # Single sigmoid unit: the output is the probability of class 1.
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(optimizer='adam',
                           loss='binary_crossentropy',
                           metrics=['accuracy'])

    def train(self, data, label, epochs=10, batch_size=128):
        """Fit the model on ``data`` / ``label``.

        Args:
            data: training features.
            label: training targets (0/1).
            epochs: number of passes over the training data.
            batch_size: number of samples per gradient update.
        """
        self.model.fit(data, label, epochs=epochs, batch_size=batch_size)

    def predict_proba(self, data):
        """Return the sigmoid outputs of the model for ``data``."""
        return self.model.predict(data)

    @staticmethod
    def _probs_to_classes(probs):
        """Threshold sigmoid outputs at 0.5 into int32 class labels."""
        return (probs > 0.5).astype('int32')

    def predict(self, data):
        """Return predicted class labels (0/1) for ``data``.

        ``Sequential.predict_classes`` no longer exists in current TensorFlow
        releases; thresholding ``predict`` at 0.5 is what it did for a
        single-unit sigmoid output, so the result keeps the same shape/dtype.
        """
        return self._probs_to_classes(self.predict_proba(data))

    def evalute_and_print_report(self, X_test, y_test):
        """Predict on ``X_test`` and print the metrics report against ``y_test``.

        Works on this instance (``self``) rather than on a notebook-level
        variable, so it is valid for any ``TrainingModel``.
        """
        predicted_probs = self.predict_proba(X_test)
        predicted = self._probs_to_classes(predicted_probs)
        self.report(y_test, predicted, predicted_probs)

    # Correctly spelled alias; the original name is kept for existing callers.
    evaluate_and_print_report = evalute_and_print_report

    def report(self, test, predicted, predicted_probs):
        """Print accuracy, confusion matrix, classification report and AUC.

        Args:
            test: true labels.
            predicted: predicted class labels.
            predicted_probs: predicted scores/probabilities used for the AUC.
        """
        print('Accuracy score: {:.5f}'.format(accuracy_score(test, predicted)))
        print('-' * 20)
        print('Confusion Matrix:')
        print(confusion_matrix(test, predicted))
        print('-' * 20)
        print(classification_report(test, predicted))
        print('-' * 20)
        print('AUC score: {:.5f}'.format(roc_auc_score(test, predicted_probs)))

In [64]:
# Load, clean and encode the dataset from its default location.
data = Data(path='data/adult.csv')

In [65]:
# Hold out a test partition; the remaining rows are used for training.
(X_train_origin, X_test_origin,
 y_train_origin, y_test_origin) = data.train_test_split()

# Train a base model

In [58]:
# Size the input layer from the training frame instead of hard-coding the
# one-hot column count, which changes whenever the cleaning/encoding changes.
base_model = TrainingModel((X_train_origin.shape[1], ))

In [59]:
# Fit the baseline network on the training partition.
base_model.train(data=X_train_origin, label=y_train_origin)

Train on 38438 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [60]:
# Score the baseline on the held-out partition and print the metrics.
base_model.evalute_and_print_report(X_test=X_test_origin, y_test=y_test_origin)

Accuracy score: 0.80793
--------------------
Confusion Matrix:
[[4474  605]
 [ 698 1007]]
--------------------
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      5079
           1       0.62      0.59      0.61      1705

    accuracy                           0.81      6784
   macro avg       0.74      0.74      0.74      6784
weighted avg       0.80      0.81      0.81      6784

--------------------
AUC score: 0.79326


In [62]:
# Sanity check: show the name of the first column of the training features.
X_train_origin.columns[0]

'age'