In [10]:
import csv
import numpy as np

def gamma(a, b, features):
    """Evaluate the linear decision function ``a^T x + b``.

    a        -- weight vector; it is transposed before the dot product.
    b        -- bias term added to the dot product.
    features -- feature vector the weights are applied to.

    Returns the result of ``np.transpose(a).dot(features) + b``; whether that
    is a scalar or an array depends on the shapes of the operands.
    """
    weighted_sum = np.dot(np.transpose(a), features)
    return weighted_sum + b

def hinge_loss(a, b, example):
    """Hinge loss ``max(0, 1 - y * (a^T x + b))`` for one labelled example.

    a, b    -- weight vector and bias of the linear classifier.
    example -- sequence holding the features in positions 0..5 and the label
               in position 6.

    Returns 0 when the example is classified with a margin of at least 1,
    otherwise the (positive) margin violation.
    """
    LABEL_IDX = 6
    # Only the feature columns go through the decision function; the label
    # column must not be part of the dot product with the weights.
    features = np.asarray(example[:LABEL_IDX])
    label = example[LABEL_IDX]
    return max(0, 1 - label * gamma(a, b, features))

def penalty_term(a):
    """Return the regularisation penalty ``0.5 * a^T a`` as a Python float.

    a -- weight vector (column vector or 1-D array).
    """
    # np.asscalar no longer exists in current NumPy releases; .item() is the
    # equivalent way to pull the single value out of a size-1 result.
    squared_norm = np.asarray(np.transpose(a).dot(a)).item()
    return 0.5 * squared_norm

#   Calculate Gradient using formulas in CS 498 AML Textbook page 38
def gradient_r(u, lam, example):
    """Gradient of the regularised hinge loss for a single example.

    u       -- parameter array (must support ``.tolist()``); the last entry is
               the bias b, everything before it is the weight vector a.
    lam     -- regularisation weight multiplying a.
    example -- sequence with the features in positions 0..5 and the label in
               position 6.

    Returns a (7, 1) array: the weight gradients followed by the bias gradient.
    """
    LABEL_IDX = 6
    x = np.array(example[0:LABEL_IDX])
    y = example[LABEL_IDX]

    params = u.tolist()
    a = np.array(params[:-1])
    b = np.array(params[-1])

    margin = y * gamma(a, b, x)
    if margin >= 1:
        # Margin satisfied: only the regulariser contributes.
        grad_a = lam * a
        grad_b = 0
    else:
        # Margin violated: the hinge term adds -y*x for a and -y for b.
        grad_a = lam * a - (y * x).reshape(6, 1)
        grad_b = -y

    return np.append(grad_a, np.array([grad_b])).reshape(7, 1)

def cost_function(u, lam, training_data):
    """Regularised average hinge loss of parameters ``u`` over a data set.

    u             -- parameter array; last entry is the bias, the rest the
                     weights.
    lam           -- regularisation weight applied to ``penalty_term(a)``.
    training_data -- sized iterable of examples, each with features in
                     positions 0..5 and the label in position 6.

    Returns ``mean(max(0, 1 - y * gamma(a, b, x))) + lam * penalty_term(a)``.
    Raises ZeroDivisionError when ``training_data`` is empty.
    """
    LABEL_IDX = 6
    a = u[:-1]
    b = u[-1]
    total_hinge = 0
    for e in training_data:
        # Column vector of features so the product with a is a single value.
        features = np.transpose(np.array([e[0:LABEL_IDX]]))
        label = e[LABEL_IDX]
        margin_error = 1 - label * gamma(a, b, features)
        # .item() replaces np.asscalar, which current NumPy no longer provides.
        total_hinge += max(0, np.asarray(margin_error).item())

    return total_hinge / len(training_data) + lam * penalty_term(a)

def classify(u, features):
    """Predict the sign of the decision function for one feature vector.

    u        -- parameter array; last entry is the bias, the rest the weights.
    features -- feature vector passed to ``gamma``.

    Returns element 0 of ``np.sign(gamma(a, b, features))``.
    """
    weights, bias = u[:-1], u[-1]
    return np.sign(gamma(weights, bias, features))[0]

#   Learning Rate (Step Length Taken From CS 498 AML Textbook Page 39)
def l_rate(epoch):
    """Step length for the given epoch: ``1 / (0.01 * epoch + 50)``."""
    decay, offset = 0.01, 50
    denominator = decay * epoch + offset
    return 1 / denominator

#   Trains SVM Classifier
#   returns : (optimized free parameters u, costs)
def train_model(u, lam, l_rate, train_data, epochs=50, steps=300, ex_per_epoch=50, log=False, log_filename="", eval_data=[]):
    """Train the SVM parameters with batch gradient steps.

    u            -- initial parameter array (weights followed by the bias).
    lam          -- regularisation weight.
    l_rate       -- callable mapping the epoch index to a step length.
    train_data   -- training examples; ``choose_random_data`` shuffles it in
                    place every epoch.
    epochs       -- number of epochs to run.
    steps        -- gradient steps per epoch.
    ex_per_epoch -- number of examples drawn for each epoch's batch.
    log          -- when true, call ``logger`` every 30 steps.
    log_filename -- file passed to ``logger``.
    eval_data    -- evaluation set passed to ``logger`` (never mutated here).

    Returns ``(u, costs)`` where ``costs`` holds the cost on the epoch's batch
    after each epoch.
    """
    costs = []
    step_count = 0
    for epoch in range(epochs):
        #   Draw a fresh random batch for this epoch
        epoch_examples = choose_random_data(train_data, examples=ex_per_epoch)
        # Use the real batch length: the data set may hold fewer rows than
        # ex_per_epoch, and the average must match the number of terms summed.
        batch_size = len(epoch_examples)
        # The step length depends on the epoch, so it is fixed for all steps
        # of this epoch (the epoch index must not be clobbered by inner loops).
        step_length = l_rate(epoch)

        for _ in range(steps):
            #   Average the per-example gradients, counting each example once
            gradient = sum(gradient_r(u, lam, ex) for ex in epoch_examples) / batch_size

            #   Update model
            u = np.subtract(u, step_length * gradient)

            #   Log every 30 steps
            step_count += 1
            if log and step_count % 30 == 0:
                logger(step_count, log_filename, u, lam, eval_data)

        #   Find current cost
        costs.append(cost_function(u, lam, epoch_examples))
    return (u, costs)

#   Evaluate Model using separate data set
#   return : the models accuracy over the eval data set
def evaluate_model(u, lam, eval_data):
    """Fraction of ``eval_data`` examples whose label ``classify`` reproduces.

    u         -- parameter array handed to ``classify``.
    lam       -- accepted for signature compatibility; not used.
    eval_data -- sized iterable of examples; the last entry of each example is
                 its label, everything before it the features.

    Raises ZeroDivisionError when ``eval_data`` is empty.
    """
    hits = sum(
        1
        for example in eval_data
        if classify(u, example[0:-1]) == example[-1]
    )
    return hits / len(eval_data)

#   return : lambda accuracies
def probe_lambdas(lambdas, l_rate, train_data, splits=10):
    """Estimate held-out accuracy for each candidate regularisation weight.

    lambdas    -- sequence of regularisation weights to try.
    l_rate     -- step-length callable forwarded to ``train_model``.
    train_data -- full labelled data set; ``split_data`` shuffles it in place.
    splits     -- number of random 10%/90% splits to average over.

    Returns a list with the mean accuracy per lambda, in the order of
    ``lambdas``.
    """
    u = np.random.rand(7,1)
    accuracies = [0 for _ in lambdas]
    for i in range(splits):
        print("Probing Lambdas - On split", i)
        # Re-split the FULL data set each time and keep the held-out 10%
        # separate from the 90% used for training; the parameter itself is
        # left untouched so later splits do not shrink.
        fold_eval, fold_train = split_data(train_data)
        for j, lam in enumerate(lambdas):
            # Every lambda starts from the same initial parameters.
            optimized_u = train_model(np.copy(u), lam, l_rate, fold_train)[0]
            accuracies[j] += evaluate_model(optimized_u, lam, fold_eval)

    return [total / splits for total in accuracies]

def log_lambdas(lambdas, l_rate, data):
    """Train once per lambda while logging progress to a per-lambda CSV file.

    lambdas -- regularisation weights to train with.
    l_rate  -- step-length callable forwarded to ``train_model``.
    data    -- labelled data set; ``split_data`` shuffles it in place.

    Each run logs to ``./results/<lam>_logfile.csv`` via ``train_model``'s
    logging option. Returns nothing.
    """
    u = np.random.rand(7,1)
    # Index 0 is the held-out 10% chunk, index 1 the 90% training chunk; the
    # logged accuracy must come from the held-out chunk.
    eval_data, train_data = split_data(data)
    for lam in lambdas:
        print("Logging", lam, "right now...")
        filename = "./results/" + str(lam) + "_logfile.csv"
        # Every lambda starts from the same initial parameters.
        copy_u = np.copy(u)
        train_model(copy_u, lam, l_rate, train_data, epochs=50, steps=300, ex_per_epoch=50, log=True, log_filename=filename, eval_data=eval_data)

def final_eval(lam, train_data, test_data):
    """Train on ``train_data`` and append predictions for ``test_data`` to the submission file.

    lam        -- regularisation weight used for training.
    train_data -- labelled training examples passed to ``train_model``.
    test_data  -- unlabelled examples, each reshaped to a (6, 1) column.

    Writes one line per test example to ``./results/submission.csv``:
    ``>50K`` when the prediction is positive, ``<=50K`` otherwise.
    """
    u = np.random.rand(7,1)
    optimized_u = np.array(train_model(u, lam, l_rate, train_data)[0]).reshape(7,1)
    # NOTE(review): "a+" appends, so re-running adds another batch of rows to
    # an existing file -- confirm this is intended.
    # The context manager guarantees the file is closed even if a prediction
    # raises part-way through.
    with open("./results/submission.csv", "a+") as submission:
        for row in test_data:
            features = np.array(row).reshape(6,1)
            prediction = classify(optimized_u, features)
            submission.write(">50K\n" if prediction > 0 else "<=50K\n")

def main():
    """Load the processed CSVs, split the training data, and write the final predictions.

    The split is only consumed by the (currently disabled) lambda-probing and
    logging steps, but ``split_data`` still shuffles the training array in
    place before the final model is trained on all of it.
    """
    #   Retrieve data
    train_csv = "data/processed/scaled_train_and_label_data.csv"
    test_csv = "data/processed/scaled_test_data.csv"
    scaled_train_data = np.array(get_data(train_csv)).astype(float)
    scaled_final_test_data = np.array(get_data(test_csv)).astype(float)

    #   Split data: first chunk is the held-out part, second the training part
    master_eval_data, master_train_data = split_data(scaled_train_data)

    #   Probes lambda values and writes the respective accuracies to a file
    #       File format: all lambdas are listed, then all accuracies are listed
    # lambdas_to_probe = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
    # accuracies = np.array(probe_lambdas(lambdas_to_probe, l_rate, master_train_data))
    # print(accuracies)
    # accuracies_file = np.append(np.array(lambdas_to_probe), accuracies)
    # exportDataToCSV("./results/lambdas_accuracies.csv", accuracies_file)

    #   Log accuracy and norm of parameters for each lambda value
    #log_lambdas(lambdas_to_probe, l_rate, master_train_data)

    #   Create Kaggle submission: train using all data and the best lambda value
    best_lam = 1e-5
    final_eval(best_lam, scaled_train_data, scaled_final_test_data)



#   Create a Log File
def logger(step, filename, u, lam, eval_data):
    """Append one ``step,accuracy,norm`` line to a log file.

    step      -- step number written in the first column.
    filename  -- path of the log file; opened in append mode.
    u         -- current parameter array; its norm is the third column.
    lam       -- forwarded to ``evaluate_model``.
    eval_data -- examples used to compute the accuracy in the second column.
    """
    # Compute everything before touching the file so a failing evaluation
    # neither leaks an open handle nor creates an empty log.
    accuracy = evaluate_model(u, lam, eval_data)
    magnitude = np.linalg.norm(u)
    message = str(step) + "," + str(accuracy) + "," + str(magnitude) +"\n"
    with open(filename, "a+") as log:
        log.write(message)

def get_data(path):
    """Read a comma-separated file into a list of rows of floats.

    path -- location of the CSV file.

    Returns one list per CSV row, each element converted with ``float``.
    Raises ValueError if any field cannot be converted.
    """
    with open(path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = ',')
        return [[float(field) for field in record] for record in reader]

#   Splits Data into 10% / 90% chunks
#   returns : a tuple with the first element being the 10% chunk, second element
#       being the 90% chunk
def split_data(data):
    """Shuffle ``data`` in place and cut it into a 10% and a 90% chunk.

    data -- mutable sequence/array; ``np.random.shuffle`` reorders it in
            place, so the caller's object is modified.

    Returns ``(first len(data)//10 items, remaining items)`` as slices of the
    shuffled ``data``.
    """
    np.random.shuffle(data)
    total = len(data)
    cut = total // 10
    return (data[:cut], data[cut:total])

def choose_random_data(data, examples):
    """Shuffle ``data`` in place and return its first ``examples`` items.

    data     -- mutable sequence/array; the caller's object is reordered.
    examples -- number of leading items to return after the shuffle.
    """
    np.random.shuffle(data)
    return data[:examples]

def printFirst10(data):
    """Print up to the first 10 items of ``data``, one per line.

    data -- any sliceable sequence; shorter inputs are printed in full instead
            of raising IndexError.
    """
    for item in data[:10]:
        print(item)

def exportDataToCSV(filename, data):
    """Write ``data`` to ``filename`` as comma-separated text via ``np.savetxt``."""
    np.savetxt(fname=filename, X=data, delimiter=",")
# Run the whole pipeline (load data, split, train, write predictions) when this cell executes.
main()

In [11]:
pwd

'/Users/sushanta/Documents/Kaggle/Illinois/CS 598 - Applied Machine Learning/HW2'

In [13]:
# Load the processed training and test CSVs into module-level float arrays for
# interactive inspection (main() reads the same two files into its own locals).
scaled_train_data = np.array(get_data("data/processed/scaled_train_and_label_data.csv")).astype(float)
scaled_final_test_data = np.array(get_data("data/processed/scaled_test_data.csv")).astype(float)



In [16]:
# Show the dimensions of the loaded training array.
scaled_train_data.shape

(43957, 7)