In [14]:
from sklearn import preprocessing
import csv
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def get_data(path):
    """Read a comma-separated file into a list of rows.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one entry per record; each entry is the list of that
        record's fields as strings. Fields are returned exactly as the csv
        module parses them (no whitespace stripping, no numeric conversion).
    """
    # newline='' leaves newline handling to the csv module, which is what
    # csv.reader expects; without it, newlines inside quoted fields are
    # rewritten by the text layer before the parser sees them.
    with open(path, 'r', newline='') as csvfile:
        return list(csv.reader(csvfile, delimiter=','))

def extractContinuousData(data):
    """Keep only the continuous-feature columns of every row.

    Args:
        data: Sequence of rows, each a sequence of fields.

    Returns:
        A list of lists holding, for each row, the fields found at column
        positions 0, 2, 4, 10, 11 and 12, in their original order. Positions
        past the end of a short row are skipped.
    """
    wanted_columns = [0, 2, 4, 10, 11, 12]
    selected_rows = []
    for row in data:
        kept = [
            field
            for position, field in enumerate(row)
            if position in wanted_columns
        ]
        selected_rows.append(kept)
    return selected_rows

def extractLabels(data):
    """Pull the label column out of every row.

    Args:
        data: Sequence of rows, each a sequence of fields.

    Returns:
        A list with one list per row: it holds the field at column
        position 14, or is empty when the row has no such column.
    """
    label_position = 14
    # A one-element slice yields either [label] or [] for short rows.
    return [list(row[label_position:label_position + 1]) for row in data]

def scaleData(data):
    """Standardize a data set with ``sklearn.preprocessing.scale``.

    Args:
        data: Rows of feature values; converted with ``np.array`` first.

    Returns:
        The array returned by ``preprocessing.scale`` for that input.
    """
    return preprocessing.scale(np.array(data))

def exportDataToCSV(filename, data):
    """Write an array to a comma-separated text file.

    Args:
        filename: Destination handed to ``np.savetxt``.
        data: Array exposing ``astype``; it is cast to float before writing.

    Every value is written using the ``%f`` format.
    """
    as_floats = data.astype(float)
    np.savetxt(filename, as_floats, delimiter=",", fmt='%f')

def interpretLabels(data):
    """Translate raw label rows into a numeric label array.

    Args:
        data: Iterable of label rows; each one is passed to ``interpreter``.

    Returns:
        A numpy array holding ``interpreter``'s result for every row, in
        input order.
    """
    numeric_labels = list(map(interpreter, data))
    return np.array(numeric_labels)

def interpreter(value):
    """Map a raw income label to a signed class value.

    Args:
        value: Sequence whose first element is the label text, such as the
            single-item rows produced by ``extractLabels``.

    Returns:
        -1 for "<=50K", 1 for ">50K", and 0 (the error value) for anything
        else, including an empty ``value``.
    """
    # A row without a label column arrives as an empty list; report it as
    # the error case instead of raising IndexError on value[0].
    if len(value) == 0:
        return 0

    raw_label = value[0]
    # Compare on stripped text so the outcome does not depend on the
    # whitespace that follows the comma separator in the input file.
    label = raw_label.strip() if isinstance(raw_label, str) else raw_label

    if label == "<=50K":
        return -1
    if label == ">50K":
        return 1
    # Error case: unrecognised label.
    return 0

def load_data():
    """Load, standardize and label the train and test data sets.

    Reads "data/train.txt" and "data/test.txt", keeps the continuous
    columns of each, standardizes both sets, and appends the numeric
    training label as the last column of the training matrix. The
    per-feature mean and variance of the standardized training data are
    printed as a sanity check.

    Returns:
        A ``(train, test)`` tuple of float arrays. ``train`` holds the
        standardized training features followed by one label column (the
        values produced by ``interpreter``); ``test`` holds the standardized
        test features only. Pass either array to ``exportDataToCSV`` to
        persist it.
    """
    orig_train_data = get_data("data/train.txt")
    orig_test_data = get_data("data/test.txt")

    scaled_train_data = scaleData(extractContinuousData(orig_train_data))
    # NOTE(review): the test set is standardized with its own statistics,
    # not with those of the training set -- confirm this is intended.
    scaled_test_data = scaleData(extractContinuousData(orig_test_data))

    print("Mean by Feature: ", scaled_train_data.mean(axis=0))
    # The label says variance, so report var() rather than std().
    print("Variance by Feature: ", scaled_train_data.var(axis=0))

    labels = interpretLabels(extractLabels(orig_train_data))
    # Turn the 1-D label vector into a single column so it can be appended
    # to the right of the feature matrix.
    label_column = labels.reshape(-1, 1)
    s_train_and_label_data = np.append(scaled_train_data, label_column, axis=1)
    return s_train_and_label_data.astype(float), scaled_test_data.astype(float)

def printFirst10(data):
    """Print the first ten rows of ``data``, one per line.

    Args:
        data: Sliceable sequence of rows (for example a list or numpy array).

    Fewer than ten rows are printed when ``data`` is shorter than that.
    """
    # Slicing clamps to the available length, so a short input no longer
    # raises IndexError.
    for row in data[:10]:
        print(row)


def split_data(data):
    """Shuffle ``data`` in place and cut off the first tenth for evaluation.

    Args:
        data: Sequence of rows; it is reordered in place by
            ``np.random.shuffle``.

    Returns:
        ``(eval_data, train_data)``: the first ``len(data) // 10`` rows of
        the shuffled data and the remaining rows, respectively.
    """
    np.random.shuffle(data)
    holdout_size = len(data) // 10
    return data[:holdout_size], data[holdout_size:]

        
# Load both data sets once. load_data returns the standardized training rows
# (label in the last column) and the standardized test rows.
scaled_train_data,scaled_final_test_data=load_data()
# split_data shuffles scaled_train_data in place and returns the tuple
# (eval_data, train_data), where eval_data is the first tenth of the rows.
splitted_train_data = split_data(scaled_train_data)

Mean by Feature:  [-6.26879347e-17  8.82720377e-17 -4.20945330e-16  2.80602385e-15
 -4.10941765e-15 -2.32263597e-16]
Variance by Feature:  [1. 1. 1. 1. 1. 1.]
