In [1]:
#Importando pacotes

import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup
import math

In [2]:
#Preparação dos dados

#Labels das colunas
CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

#Importe dados de treino
# train_data_url = (
#     "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# )

train_data_url = 'dados/train_data.csv'

train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

#Importe dados de teste
# test_data_url = (
#     "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
# )

test_data_url = 'dados/test_data.csv'

test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)

print(f"Dataset de treino shape: {train_data.shape}")
print(f"Dataset de teste shape: {test_data.shape}")

Dataset de treino shape: (32561, 15)
Dataset de teste shape: (16281, 15)


In [3]:
train_data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
#Removendo primeira observação, dados inválido
test_data = test_data[1:]

#Removendo ponto da coluna income_bracket
train_data.income_bracket = train_data.income_bracket.apply(lambda x: x.replace(".", ""))

test_data.income_bracket = train_data.income_bracket.apply(lambda x: x.replace(".", ""))

#Gerando arquivo csv local

train_data_file = "dados/train_data.csv"
test_data_file = "dados/test_data.csv"

train_data.to_csv(train_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)

In [5]:
# Lista de nome das variáveis numéricas

NUMERIC_FEATURE_NAMES = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

In [6]:
# Dicionário das variáveis categóricas

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "workclass": sorted(list(train_data["workclass"].unique())),
    "education": sorted(list(train_data["education"].unique())),
    "marital_status": sorted(list(train_data["marital_status"].unique())),
    "occupation": sorted(list(train_data["occupation"].unique())),
    "relationship": sorted(list(train_data["relationship"].unique())),
    "race": sorted(list(train_data["race"].unique())),
    "gender": sorted(list(train_data["gender"].unique())),
    "native_country": sorted(list(train_data["native_country"].unique())),
}

CATEGORICAL_FEATURES_WITH_VOCABULARY

{'workclass': [' ?',
  ' Federal-gov',
  ' Local-gov',
  ' Never-worked',
  ' Private',
  ' Self-emp-inc',
  ' Self-emp-not-inc',
  ' State-gov',
  ' Without-pay'],
 'education': [' 10th',
  ' 11th',
  ' 12th',
  ' 1st-4th',
  ' 5th-6th',
  ' 7th-8th',
  ' 9th',
  ' Assoc-acdm',
  ' Assoc-voc',
  ' Bachelors',
  ' Doctorate',
  ' HS-grad',
  ' Masters',
  ' Preschool',
  ' Prof-school',
  ' Some-college'],
 'marital_status': [' Divorced',
  ' Married-AF-spouse',
  ' Married-civ-spouse',
  ' Married-spouse-absent',
  ' Never-married',
  ' Separated',
  ' Widowed'],
 'occupation': [' ?',
  ' Adm-clerical',
  ' Armed-Forces',
  ' Craft-repair',
  ' Exec-managerial',
  ' Farming-fishing',
  ' Handlers-cleaners',
  ' Machine-op-inspct',
  ' Other-service',
  ' Priv-house-serv',
  ' Prof-specialty',
  ' Protective-serv',
  ' Sales',
  ' Tech-support',
  ' Transport-moving'],
 'relationship': [' Husband',
  ' Not-in-family',
  ' Other-relative',
  ' Own-child',
  ' Unmarried',
  ' Wife'],
 'r

In [7]:
# Lista de colunas ignoradas no dataset

IGNORE_COLUMN_NAMES = ["fnlwgt"]

In [8]:
# Lista de nome das variáveis categóricas
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

CATEGORICAL_FEATURE_NAMES

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'native_country']

In [9]:
# Lista de todas as variáveis do dataset

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

In [10]:
# Atribuindo valores default para as colunas

COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else ["NA"]
    for feature_name in CSV_HEADER
]

In [11]:
# Definindo variável target
TARGET_FEATURE_NAME = "income_bracket"

# Lista de labes da variável target
TARGET_LABELS = [" <=50K", " >50K"]

In [12]:
# Função para carregar dataset

#Converte string em númericos (0,1)
target_label_lookup = StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(lambda features, target: (features, target_label_lookup(target)))
    
    return dataset.cache()

In [13]:
# Função para converter os tipos de dados

def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

In [14]:
# Função para codificar os dados de entrada

def encode_inputs(inputs):
    
    #Cria lista vazia
    encoded_features = []
    
    #Loop para converter valores strings em índices inteiros
    for feature_name in inputs:
        
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            
            #Coleta os valores
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            
            #Cria objeto Lookup
            lookup = StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
            )
            
            # Converte entradas strings em índices inteiros
            value_index = lookup(inputs[feature_name])
            embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
            
            # Cria uma camada oculta com dimensão especificada
            embedding = layers.Embedding(
                input_dim=lookup.vocabulary_size(), output_dim=embedding_dims
            )
            
            #Converte os valores dos indices para as camadas ocultas
            encoded_feature = embedding(value_index)
        
        else:
            
            # Utilização dos valores númericos das variáveis
            encoded_feature = inputs[feature_name]
            if inputs[feature_name].shape[-1] is None:
                encoded_feature = tf.expand_dims(encoded_feature, -1)
                
        
        #Anexa os indices codificados
        encoded_features.append(encoded_feature)
    
    #Cria as camadas concatenadas
    encoded_features = layers.concatenate(encoded_features)
    
    return encoded_features

In [15]:
# Deep Neural decision tree

class NeuralDecisionTree(keras.Model):
    
    def __init__(self, depth, num_features, used_features_rate, num_classes):
        
        super(NeuralDecisionTree, self).__init__()
        self.depth = depth
        self.num_leaves = 2 ** depth
        self.num_classes = num_classes

        # Cria a máscara para a seleção das features aletoriamente
        num_used_features = int(num_features * used_features_rate)
        one_hot = np.eye(num_features)
        sampled_feature_indicies = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        self.used_features_mask = one_hot[sampled_feature_indicies]

        # Inicializa o peso das classes
        self.pi = tf.Variable(
            initial_value=tf.random_normal_initializer()(
                shape=[self.num_leaves, self.num_classes]
            ),
            dtype="float32",
            trainable=True,
        )

        # Define a função de ativação
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        batch_size = tf.shape(features)[0]

        # Aplica a máscara das features ao dados de entrada
        features = tf.matmul(
            features, self.used_features_mask, transpose_b=True
        )
        
        # Calcula as probabilidades
        decisions = tf.expand_dims(
            self.decision_fn(features), axis=2
        )
        
        # Concatena as probabilidades de roteamento com seus complementos
        decisions = layers.concatenate(
            [decisions, 1 - decisions], axis=2
        )

        mu = tf.ones([batch_size, 1, 1])
        
        #Inicializa indices
        begin_idx = 1
        end_idx = 2
        
        # Cria a árvore de decisão
        for level in range(self.depth):
            mu = tf.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2 ** level, 1]
            mu = tf.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
            level_decisions = decisions[
                :, begin_idx:end_idx, :
            ]  # [batch_size, 2 ** level, 2]
            mu = mu * level_decisions  # [batch_size, 2**level, 2]
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)

        mu = tf.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
        
        probabilities = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
        
        outputs = tf.matmul(mu, probabilities)  # [batch_size, num_classes]
        
        return outputs

In [16]:
# Deep Neural decision forest

# Modelo neural decision foresst consiste em um conjunto de árvores de decisão treinados simultaneamente.
# A acurácia do modelo é a acurácia média de todas as árvores

class NeuralDecisionForest(keras.Model):
    
    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        
        super(NeuralDecisionForest, self).__init__()
        self.ensemble = []
        
        # Inicializa o ensemble pela adição das instâncias NeuralDecisionTree
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        
        # inicializa as saídas: a [batch_size, num_classes] matriz de zeros.
        batch_size = tf.shape(inputs)[0]
        outputs = tf.zeros([batch_size, num_classes])

        # Afrega a saída das árvores no ensemble.
        for tree in self.ensemble:
            outputs += tree(inputs)
            
        # Média da saída de todas as árvores
        outputs /= len(self.ensemble)
        
        return outputs

In [27]:
# Treinamento e avaliação do modelo

#Parâmentros de treino
learning_rate = 0.01
batch_size = 265
num_epochs = 50
hidden_units = [64, 64]


def run_train(model):

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    print("Iniciando treinamento do modelo...")
    train_dataset = get_dataset_from_csv(
        train_data_file, shuffle=True, batch_size=batch_size
    )

    model.fit(train_dataset, epochs=num_epochs)
    print("Treinamento do modelo finalizado")

    print("Avaliação do modelo nos dados de teste...")
    test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size)

    _, accuracy = model.evaluate(test_dataset)
    print(f"Acurácia: {round(accuracy * 100, 2)}%")

In [28]:
# Treinamento decision tree model

# Parâmentros da árvore
num_trees = 10
depth = 5
used_features_rate = 1.0
num_classes = len(TARGET_LABELS)


def create_tree_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)

    outputs = tree(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [29]:
%%time
# Inicia treinamento
tree_model = create_tree_model()
run_train(tree_model)

Iniciando treinamento do modelo...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Treinamento do modelo finalizado
Avaliação do modelo nos dados de teste...
Acurácia: 64.82%
Wall time: 1min 18s


In [22]:
# Treinamento forest model

num_trees = 25
depth = 5
used_features_rate = 0.5


def create_forest_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    forest_model = NeuralDecisionForest(
        num_trees, depth, num_features, used_features_rate, num_classes
    )

    outputs = forest_model(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [23]:
%%time
# Inicia treinamento
forest_model = create_forest_model()

run_train(tree_model)

Iniciando treinamento do modelo...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Treinamento do modelo finalizado
Avaliação do modelo nos dados de teste...
Acurácia: 64.99%
Wall time: 18.2 s
