## Import libraries

In [1]:
"""
Notebook that runs a neural network on the adult and student performance datasets without anonymisation.
Intended as a benchmark for the anonymised data sets.
"""
# import libraries
import pandas as pd
import numpy as np
import keras
import os

# import modules
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.models import Sequential
from keras.layers import Dense

## Preprocess data

In [2]:
"""
Please note that the data has been cleaned (=no missing/NaN values) in advance.
"""

# load data
# The directory holding the CSV files can be overridden with the ADULT_DATA_DIR
# environment variable, so the notebook also runs on machines other than the
# author's; when the variable is unset the original location is used.
DATA_DIR = os.environ.get('ADULT_DATA_DIR', '/Users/pepijndereus/Desktop/Thesis/Data/Adult')
adult_train = pd.read_csv(os.path.join(DATA_DIR, 'Adult_train.csv'))
adult_val = pd.read_csv(os.path.join(DATA_DIR, 'Adult_val.csv'))

# columns that are min-max scaled / one-hot encoded by preprocess_dataset
CONTINUOUS_FEATURES = ['age', 'capital_gain', 'capital_loss', 'hr_per_week']
CATEGORICAL_FEATURES = ['type_employer', 'education', 'marital', 'occupation',
                        'relationship', 'race', 'sex', 'country']


def preprocess_dataset(adult, scaler=None):
    """
    Turn a raw adult DataFrame into model features and binary labels.

    The input frame is left untouched; all work happens on new objects.

    Parameters
    ----------
    adult : pandas.DataFrame
        Must contain an 'income' column holding the strings '<=50K' / '>50K',
        the columns in CONTINUOUS_FEATURES and the columns in
        CATEGORICAL_FEATURES.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler (e.g. a MinMaxScaler fitted on the training
        split's CONTINUOUS_FEATURES). When given, it is only applied, so a
        validation split is scaled with the training statistics. When omitted,
        a new MinMaxScaler is fitted on ``adult`` itself.

    Returns
    -------
    features : pandas.DataFrame
        ``adult`` without 'income', with the continuous columns scaled and the
        categorical columns one-hot encoded.
    labels : numpy.ndarray
        Integer array with 0 for '<=50K' and 1 for '>50K'.

    Raises
    ------
    ValueError
        If an income value does not convert to an integer after replacement.
    """
    # make binary labels for income column; the replacements are literal
    # (regex=False) and produce new Series, so the caller's frame is not changed
    income = adult['income'].str.replace('<=50K', '0', regex=False)
    income = income.str.replace('>50K', '1', regex=False)
    labels = np.array(income.astype(int))

    # remove labels from the features; drop() returns a new DataFrame
    features = adult.drop(['income'], axis=1)

    # use Min-max scaling for continuous features
    if scaler is None:
        scaled = MinMaxScaler().fit_transform(features[CONTINUOUS_FEATURES])
    else:
        scaled = scaler.transform(features[CONTINUOUS_FEATURES])
    features[CONTINUOUS_FEATURES] = scaled

    # use One-hot encoding for categorical features; the dtype is pinned because
    # newer pandas versions default to bool, which mixes badly with the float
    # columns when the frame is converted to a single numeric array
    features = pd.get_dummies(features, columns=CATEGORICAL_FEATURES, dtype=np.uint8)

    return features, labels

# apply preprocessing to training and validation set
adult_train, labels_train = preprocess_dataset(adult_train)
adult_val, labels_val = preprocess_dataset(adult_val)

# Each split is one-hot encoded on its own, so a category that occurs in only
# one split (e.g. 'country_Holand-Netherlands') yields a column the other split
# lacks. Record which columns differ, for inspection ...
mismatched_columns = set(adult_train.columns).symmetric_difference(adult_val.columns)

# ... and keep only the columns both splits share, in the training order, so
# both frames have the same layout regardless of which categories are rare.
shared_columns = [col for col in adult_train.columns if col in adult_val.columns]
adult_train = adult_train[shared_columns]
adult_val = adult_val[shared_columns]

## Run the neural network

In [3]:
"""
Now that the data is preprocessed for the neural network, we run the neural net.
"""

# fix the random seeds (Python, NumPy and the Keras backend) so that weight
# initialisation and the shuffling done by fit() are repeatable between runs
keras.utils.set_random_seed(42)

# hand Keras plain float32 arrays; a DataFrame with mixed column dtypes
# (e.g. bool dummies next to float features) would otherwise become an
# object array that cannot be converted to a tensor
train_features = adult_train.to_numpy(dtype='float32')
val_features = adult_val.to_numpy(dtype='float32')

# create neural network with keras sequential model
NeuralNet = Sequential()

# add 3 Dense layers: two layers of 6 ReLU units followed by a single sigmoid
# output unit (the input size is taken from the data on the first fit call)
NeuralNet.add(Dense(6, activation = 'relu'))
NeuralNet.add(Dense(6, activation = 'relu'))
NeuralNet.add(Dense(1, activation = 'sigmoid'))
NeuralNet.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# train the network with training set and training labels
print("Training the neural network..\n")
NeuralNet.fit(train_features, labels_train, batch_size = 10, epochs = 20, verbose=0)

# validate the network with the validation set and labels
print("\nPrediction accuracy:\n")
# raw sigmoid outputs for the validation set; not used further in this cell
predication = NeuralNet.predict(val_features)

# print the accuracy
loss, accuracy = NeuralNet.evaluate(val_features, labels_val)
print(f"\nThe loss is {loss}, and the accuracy is {accuracy}")

Training the neural network..


Prediction accuracy:


The loss is 0.32756349444389343, and the accuracy is 0.8458831310272217


## References
The code above was created with the help of the following sources:

* https://www.kaggle.com/code/ritvikkhanna09/simple-neural-networks-using-keras/notebook#Training-the-Neural-Network , accessed on 13-04-2022

* https://github.com/DeepakGunturu/Adult-Dataset-Classification/blob/main/classifiers.py , accessed on 11-04-2022