In [1]:
# import pandas as pd
import numpy as np
import tensorflow as tf
import csv

from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import random

train_data_file = './data/train.csv'
validation_data_file = './data/test.csv'

### OUTPUT FIle Name
OUTPUT_FILE_NAME = "tensor_flow_output_pred.csv"

### Test Split Settings
VALIDATION_SPLIT = 0.3

### Neural Network Settings
N_NEURONS_LAYER_2 = 300
N_NEURONS_LAYER_3 = 500
DROP_OUT_RATE = 0.6

### Trainning Settings
LEARNING_RATE = 0.0001
N_EPOCHS = 100000
TRAINNING_BATCH_SIZE = 32

### Verbose Options
VERBOSE = True
VERBOSE_EACH_N_EPOCHS = 1000


In [2]:
def randomize_data_obs(dataset, labels, n_obs):
    permutation = np.random.permutation(labels.shape[0])
    permutation = permutation[0 : n_obs]
    shuffled_dataset = dataset[permutation, :]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels

In [3]:
def neural_network_model(X, keep_rate, weights, biases):
    hiden_layer_1 = tf.matmul(X, weights['hiden_layer_1'])
    hiden_layer_1 = tf.add(hiden_layer_1, biases['hiden_layer_1'])
    hiden_layer_1 = tf.nn.relu(hiden_layer_1)
    
    hiden_layer_2 = tf.matmul(hiden_layer_1, weights['hiden_layer_2'])
    hiden_layer_2 = tf.add(hiden_layer_2, biases['hiden_layer_2'])
    hiden_layer_2 = tf.nn.relu(hiden_layer_2)
    
    hiden_layer_3 = tf.matmul(hiden_layer_2, weights['hiden_layer_3'])
    hiden_layer_3 = tf.add(hiden_layer_3, biases['hiden_layer_3'])
    hiden_layer_3 = tf.nn.relu(hiden_layer_3)
    
    output_layer = tf.matmul(hiden_layer_3, weights['output_layer'])
    output_layer = tf.add(output_layer, biases['output_layer'])
    
    return output_layer


In [4]:
def train_neural_network(train_dataset, train_labels, 
                         predictioon_data_set = None,
                         validation_split = VALIDATION_SPLIT,
                         n_neuron_layer_2 = N_NEURONS_LAYER_2,
                         n_neuron_layer_3 = N_NEURONS_LAYER_3,
                         keep_prob = DROP_OUT_RATE,
                         n_epochs = N_EPOCHS, batch_size = TRAINNING_BATCH_SIZE,
                         verbose = VERBOSE, n_verbose_epoch = VERBOSE_EACH_N_EPOCHS):
    n_neuron_layer_1 = train_dataset.shape[0]
    train_dataset, test_dataset, train_labels, test_labels = train_test_split( train_dataset,
                                                                               train_labels,
                                                                               test_size = VALIDATION_SPLIT)
                
    N_PREDICTORS = train_dataset.shape[1]
    
    X = tf.placeholder("float", [None, N_PREDICTORS])
    Y = tf.placeholder("float", [None])
    keep_rate = tf.placeholder(tf.float32)
    
    weights = {'hiden_layer_1':tf.Variable(tf.random_normal([N_PREDICTORS, n_neuron_layer_1], 0, 0.1)),
               'hiden_layer_2':tf.Variable(tf.random_normal([n_neuron_layer_1, n_neuron_layer_2], 0, 0.1)),
               'hiden_layer_3':tf.Variable(tf.random_normal([n_neuron_layer_2, n_neuron_layer_3], 0, 0.1)),
               'output_layer':tf.Variable(tf.random_normal([n_neuron_layer_3, 1], 0, 0.1))}

    biases = { 'hiden_layer_1':tf.Variable(tf.random_normal([n_neuron_layer_1], 0, 0.1)),
               'hiden_layer_2':tf.Variable(tf.random_normal([n_neuron_layer_2], 0, 0.1)),
               'hiden_layer_3':tf.Variable(tf.random_normal([n_neuron_layer_3], 0, 0.1)),
               'output_layer':tf.Variable(tf.random_normal([1], 0, 0.1))}
    
    prediction = neural_network_model(X, keep_rate, weights, biases)
    cost = tf.reduce_mean( tf.sqrt(tf.square(tf.transpose(prediction) - Y)) )
    optimizer = tf.train.AdamOptimizer(learning_rate = LEARNING_RATE).minimize(cost)
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(n_epochs+1):
            
            epoch_x, epoch_y = randomize_data_obs(train_dataset, train_labels, batch_size)
            _, c, p = sess.run([optimizer, cost, prediction], feed_dict = {X: train_dataset,
                                                                           Y: train_labels,
                                                                           keep_rate: keep_prob} )
                               
            if int(epoch % n_verbose_epoch)  == 0 and verbose == True:
                training_cost = sess.run([cost], feed_dict = {X: train_dataset,
                                                              Y: train_labels,
                                                              keep_rate: 1})
                test_cost = sess.run([cost], feed_dict = {X: test_dataset,
                                                          Y: test_labels,
                                                          keep_rate: 1})       
                print("EPOCH", str(epoch), ":",
                      "Training Cost=", str(training_cost),
                      "Test Cost", str(test_cost)
                     )
         
        if type(predictioon_data_set) != type(None):
            predictioon_labels = sess.run(prediction, feed_dict = {X: predictioon_data_set, keep_rate: 1})
            return predictioon_labels

In [5]:
train_data = np.genfromtxt(train_data_file, delimiter=',', skip_header = 1)
validation_data = np.genfromtxt(validation_data_file, delimiter=',', skip_header = 1)

imp = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 1)
imp.fit(train_data)

train_data = imp.transform(train_data)
validation_data = imp.transform(validation_data)

validation_data_ID = validation_data[:, 0]
validation_data = validation_data[:, 1:]

train_labels = train_data[:, -1] ### The Target Column
train_data = train_data[:, 1:-1] ### All but Id and Target Columns


In [None]:
validation_pred_labels = train_neural_network(train_dataset = train_data,
                                              train_labels = train_labels, 
                                              predictioon_data_set = validation_data)

EPOCH 0 : Training Cost= [11645292.0] Test Cost [23454788.0]
c = 3.39039e+07
EPOCH 1000 : Training Cost= [1772484.5] Test Cost [3418767.2]
c = 1.1165e+06
EPOCH 2000 : Training Cost= [259201.28] Test Cost [359450.25]
c = 1.21221e+06
EPOCH 3000 : Training Cost= [884508.44] Test Cost [1667009.0]
c = 1.16306e+06
EPOCH 4000 : Training Cost= [678111.19] Test Cost [1225236.0]
c = 531184.0
EPOCH 5000 : Training Cost= [608271.25] Test Cost [1056551.5]
c = 343416.0
EPOCH 6000 : Training Cost= [340289.44] Test Cost [557356.69]
c = 143521.0
EPOCH 7000 : Training Cost= [301044.62] Test Cost [741412.31]
c = 173582.0
EPOCH 8000 : Training Cost= [280520.09] Test Cost [431163.22]
c = 119424.0
EPOCH 9000 : Training Cost= [178124.14] Test Cost [215952.84]
c = 584557.0
EPOCH 10000 : Training Cost= [351565.09] Test Cost [844495.81]
c = 156801.0
EPOCH 11000 : Training Cost= [333422.22] Test Cost [788121.06]
c = 135355.0
EPOCH 12000 : Training Cost= [174298.61] Test Cost [230147.56]
c = 92786.7
EPOCH 13000 :

In [None]:
submissions = pd.DataFrame({'Id': validation_data_ID.astype(int),
                            'Target': validation_pred_labels.reshape([1, -1])[0]})
submissions.to_csv( OUTPUT_FILE_NAME, 
                    index = False,
                    header = True,
                    quoting = csv.QUOTE_NONNUMERIC,
                    quotechar = '\"')