### Imports

In [121]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.utils import np_utils

from sklearn import cross_validation
from sklearn.metrics import accuracy_score

### Get Data

In [29]:
# Load the Adult Census Income dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Adult_Census_Income_Binary_Classification_dataset.csv")

### Fix Dataset

In [30]:
# Positional indices of the columns that are later cast to category and integer-coded.
CATEGORICAL_POSITIONS = [1, 2, 4, 5, 6, 7, 8, 12, 13]
cat_columns = [df.columns[position] for position in CATEGORICAL_POSITIONS]

#### handle missing data

In [86]:
# Unknown values are matched as the literal string " ?" (note the leading space).
# Map them to NaN (the generic missing-value marker; NaT is meant for datetimes)
# and drop every row that has at least one missing value. Rebinding `df` instead
# of using inplace=True keeps the step a single expression.
df = df.replace(" ?", np.nan).dropna()

In [87]:
# Give every categorical column the pandas `category` dtype.
for column_name in cat_columns:
    df[column_name] = pd.Categorical(df[column_name])

#### handle categorical features

In [88]:
# Replace each categorical column with its integer category codes.
for column_name in cat_columns:
    df[column_name] = df[column_name].astype('category').cat.codes

In [89]:
df.tail()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
32556,27,3,7,12,2,12,5,4,0,0,0,38,38,0
32557,40,3,11,9,2,6,0,4,1,0,0,40,38,1
32558,58,3,11,9,6,0,4,4,0,0,0,40,38,0
32559,22,3,11,9,4,0,3,4,1,0,0,20,38,0
32560,52,4,11,9,2,3,5,4,0,15024,0,40,38,1


In [90]:
# One row per feature column (every column except the last), one column per sample.
# The row count is derived from the frame so it always matches the loop that fills X.
X = np.zeros((len(df.columns) - 1, len(df)))

In [91]:
# Copy each column except the last one into its own row of X.
for row, column_name in enumerate(df.columns[:-1]):
    X[row] = df[column_name].values

In [93]:
# Swap the two axes of X. Run this cell exactly once: a second run swaps them back.
X = X.T

In [103]:
# Target vector: the column at position 13 of the frame.
y = np.array(df.iloc[:, 13])

In [203]:
# Hold out 20% of the samples for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
# NOTE(review): `sklearn.cross_validation` is a legacy module; newer scikit-learn
# releases expose train_test_split from `sklearn.model_selection` instead.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=42)

## k-nearest neighbors

In [204]:
# Baseline classifier; n_neighbors=5 is the library default, spelled out for the reader.
knn = KNeighborsClassifier(n_neighbors=5)

### Train model

In [205]:
# Fit the baseline KNN on the training split.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

### Accuracy

In [206]:
# Score the fitted KNN on the held-out split.
knn.score(X_test, y_test)

0.8386304314448026

In [207]:
# Cross-check knn.score() by computing the accuracy from explicit predictions.
y_predict = knn.predict(X_test)
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_predict)

0.8386304314448026

In [208]:
# Module-level state updated by best_model(): the best accuracy seen so far ...
best_acc = 0.0
# ... and the classifier associated with it (an unfitted placeholder until best_model runs).
best_knn = KNeighborsClassifier()

In [209]:
def best_model(trials=5):
    """Train a KNN classifier on several random splits and keep the best one.

    Each trial draws a fresh random 80/20 split of the global ``X`` / ``y``,
    fits a new ``KNeighborsClassifier`` on the training part and scores it on
    the held-out part. Whenever a trial beats the global ``best_acc``, both
    ``best_acc`` and ``best_knn`` are updated to that trial's accuracy and
    fitted model, and the new best accuracy is printed.

    Parameters
    ----------
    trials : int, default 5
        Number of random splits to try.

    Returns
    -------
    tuple
        ``(best_knn, best_acc)`` -- the best fitted model seen so far
        (across all calls, since the state is global) and its accuracy.
    """
    global best_acc
    global best_knn
    for _ in range(trials):
        # Train into a local candidate so an unsuccessful trial never
        # overwrites the best model found so far.
        candidate = KNeighborsClassifier()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
        candidate.fit(X_train, y_train)
        acc = candidate.score(X_test, y_test)
        if best_acc < acc:
            best_acc = acc
            # Keep the model that actually achieved this score.
            best_knn = candidate
            print(best_acc)
    return best_knn, best_acc

In [211]:
#best_model(100)

## Logistic Regression

In [212]:
# Training hyperparameters shared by the Keras models below.
batch_size = 88        # passed to model.fit as batch_size
output_classes = 2     # number of columns in the one-hot encoded labels
epochs = 100           # passed to model.fit as epochs

In [213]:
# One-hot encode the label vectors for the softmax models.
# This rebinds y_train / y_test, so run the cell once per train/test split.
y_train, y_test = (
    np_utils.to_categorical(y_train, output_classes),
    np_utils.to_categorical(y_test, output_classes),
)

In [214]:
y_train

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [215]:
#### Model

In [225]:
# Multinomial logistic regression: a single softmax layer on the raw features.
# Layer sizes come from the data and from `output_classes` rather than literals,
# so the model stays in sync with the feature matrix and the one-hot labels.
model = Sequential()
model.add(Dense(output_classes, input_dim=X_train.shape[1], kernel_initializer='normal', activation='softmax'))
model.compile(optimizer=SGD(lr=0.05), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 2)                 28        
Total params: 28
Trainable params: 28
Non-trainable params: 0
_________________________________________________________________


In [226]:
# Train the softmax model; the value returned by fit() is kept in `history`.
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

Epoch 1/1


In [227]:
# Evaluate on the held-out split; element 1 of the result is displayed in the next cell
# (presumably the accuracy metric requested at compile time).
evaluation = model.evaluate(X_test, y_test, verbose=1)



In [228]:
evaluation[1]

0.7764471057884231

## Neural Network

In [229]:
# Layer widths for the feed-forward network below.
n_layer1 = 13   # input_dim of the first Dense layer
n_layer2 = 20   # units in the first Dense layer
n_layer3 = 20   # units in the second Dense layer

In [230]:
# Start a new, empty Sequential model (rebinds `model`, replacing the previous one).
model = Sequential()

In [234]:
# Build the network from scratch inside this cell. Creating the Sequential here
# makes the cell idempotent: re-running it no longer stacks three more layers
# onto a model that already has them.
model = Sequential()
model.add(Dense(n_layer2, input_dim=n_layer1, activation='relu'))
model.add(Dense(n_layer3, activation='relu'))
model.add(Dense(output_classes, activation='softmax'))

In [235]:
# The output layer is a softmax over `output_classes` one-hot targets, so use
# categorical cross-entropy -- the same loss as the logistic-regression model above,
# and one that stays correct if output_classes ever grows beyond 2.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [236]:
# Train the network on the one-hot labels; the return value of fit() is not stored here.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1b2971ef588>

In [237]:
# Evaluate the network on the held-out split; element 1 of the result is displayed in the next cell
# (presumably the accuracy metric requested at compile time).
evaluation = model.evaluate(X_test, y_test, verbose=1)



In [238]:
evaluation[1]

0.7953324121263341