**Kaggle Competition - Santander**

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict # for the data, we don't want to have to create a new array

Reading in the data:

In [15]:
# Make stdout flush after every write so printed progress shows up immediately.
import sys
# Keep a reference to the stream that is current at this point.
oldsysstdout = sys.stdout
class flushfile():
    '''
    Wrapper around a file-like object that flushes after every write.

    Params: f - the underlying stream (needs write() and flush())
    Any attribute not defined on the wrapper is looked up on the wrapped stream.
    '''
    def __init__(self, f):
        self.f = f
    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If 'f' itself is
        # missing (e.g. __init__ never ran), fail cleanly instead of recursing.
        if name == 'f':
            raise AttributeError(name)
        # getattr() honours the wrapped object's own __getattr__, so delegation
        # keeps working when the wrapped stream is itself a wrapper/proxy.
        return getattr(self.f, name)
    def write(self, x):
        self.f.write(x)
        self.f.flush()
    def flush(self):
        self.f.flush()
# Route everything written to stdout through the flushing wrapper.
# NOTE(review): each execution wraps whatever sys.stdout currently is, so
# re-running this statement adds another wrapper layer.
sys.stdout = flushfile(sys.stdout)

In [16]:
# Start from empty containers: one list and one defaultdict(list) per data split.
raw_data = list()
train_data, test_data = (defaultdict(list) for _ in range(2))

In [17]:
def read_file(name):
    '''
    Params: name of file (comma-separated text, first line is the header)
    returns: list [keys from the first line of csv file, rest of the lines as arr]
             - keys have surrounding whitespace / the line terminator removed
             - every other line is returned unsplit, without its line terminator;
               blank lines are skipped
             - an empty file gives [[], []]
    '''
    with open(name, 'r') as f:
        lines = [line.rstrip("\r\n") for line in f]
    if not lines:
        return [[], []]
    # Strip each key: without this the last header field keeps the newline
    # and would never match the intended column name.
    data_keys = [k.strip() for k in lines[0].split(",")]
    raw_data = [line for line in lines[1:] if line.strip()]
    return [data_keys, raw_data]

In [18]:
def ingest_data(data_keys, data_arr, result, flags, flag_replace):
    '''
    Params: keys from first line,
            data from the rest of lines (one comma-separated string per row),
            result in the form of a dict(defaultdict allowed),
            flags - collection of raw field strings to treat as sentinels,
            flag_replace - value stored in place of any flagged field
    returns: 2D ndarray (float64), one row per non-blank input line;
             result is also filled in place with one list of values per key
    raises: ValueError if a non-flagged field cannot be parsed as a float
    '''
    # Taking in the data using the keys
    arr_out = []
    for row in data_arr:
        # Skip blank lines (e.g. a trailing empty line at the end of the file).
        if not row.strip():
            continue
        arr_in = []
        for key, dat in zip(data_keys, row.split(",")):
            # Strip first: the last field of a raw line may still carry its
            # newline, which would stop it from matching any flag.
            dat = dat.strip()
            # Check flags before converting so non-numeric sentinels are allowed.
            if dat in flags:
                value = flag_replace
            else:
                value = float(dat)
            # setdefault works for plain dicts as well as defaultdicts.
            result.setdefault(key, []).append(value)
            arr_in.append(value)
        arr_out.append(arr_in)
    return np.array(arr_out, dtype=np.float64)

In [19]:
# Load the training file first, then the test file; each call yields the
# header keys together with the remaining lines.
train_key, raw_train_data = read_file(name='train.csv')
test_key, raw_test_data = read_file(name='test.csv')

In [20]:
# Parse the training rows into a float64 array; the two raw strings listed
# below are treated as sentinels and stored as -1.
# A fresh defaultdict is passed as the per-key accumulator instead of
# `train_data`: this statement rebinds `train_data` to the returned ndarray, so
# passing `train_data` itself would fail on result[key] when the cell is re-run.
train_data = ingest_data(train_key, raw_train_data, defaultdict(list), ["9999999999", "-999999"], -1)
# test_data = ingest_data(test_key, raw_test_data, test_data) Remove this line temporarily for testing
# NOTE: the commented-out call above omits the flags / flag_replace arguments
# that ingest_data requires; add them before re-enabling it.

In [21]:
# Sanity check: length of each dataset, followed by the training array's shape.
for size in (len(train_data), len(test_data), train_data.shape):
    print(size)

In [22]:
# Flag the columns whose every entry equals zero, then keep only the others.
columns_removed = (train_data == 0).all(axis=0)
train_data = train_data[:, np.logical_not(columns_removed)]
print(train_data.shape)

In [23]:
# X is every column except the last one; y is the last column.
train_X, train_y = train_data[:, :-1], train_data[:, -1]
print(train_X.shape)
print(train_y[0:50]) # Okay we have at least 1 here, sanity check

Data preprocessing

In [24]:
# Normalize: standardize every column to zero mean and unit standard deviation.
# A new array is built instead of using in-place -= and /=: if `train_X` is a
# view produced by slicing, in-place updates would also overwrite the array it
# was sliced from.
feature_mean = np.mean(train_X, axis=0)
feature_std = np.std(train_X, axis=0)
# A constant column has std == 0; dividing by it gives NaN (0/0), which would
# propagate into the covariance and SVD below. Use 1 so such columns become 0.
feature_std[feature_std == 0] = 1.0
train_X = (train_X - feature_mean) / feature_std

# PCA through an SVD of the covariance matrix: np.linalg.svd returns singular
# values in descending order, so the first 100 columns of U are the leading
# directions. The data is projected onto them.
cov = np.dot(train_X.T, train_X)/train_X.shape[0]
U,S,V = np.linalg.svd(cov)
train_X_reduced = np.dot(train_X, U[:,:100])
print(train_X_reduced.shape)

In [25]:
# Create CV test set
# sklearn.cross_validation was removed in scikit-learn 0.20. The legacy import
# is tried first so behaviour is unchanged where it still exists; otherwise the
# sklearn.model_selection replacement is used so the cell still runs.
try:
    from sklearn.cross_validation import KFold
    kf = KFold(train_X_reduced.shape[0], n_folds=4)
except ImportError:
    from sklearn.model_selection import KFold
    # The newer KFold is not bound to a sample count and is not iterable:
    # obtain the (train, test) index pairs with kf.split(train_X_reduced).
    kf = KFold(n_splits=4)
print(kf)

In [None]:
# 11 line NN
# Two-layer sigmoid network: X -> l1 (100 hidden units) -> l2 (1 output),
# updated from the full batch on every pass of the loop.
# NOTE(review): no random seed is set in this cell, so the initial weights
# differ between runs unless one is set elsewhere.
X = train_X_reduced
# The targets must be a column vector. l2 has shape (n_samples, 1); subtracting
# it from a flat (n_samples,) array would broadcast to (n_samples, n_samples)
# and break the matrix products below.
y = np.reshape(train_y, (-1, 1))
syn0 = 2*np.random.random((X.shape[1], 100))-1  # input -> hidden weights in [-1, 1)
syn1 = 2*np.random.random((100,1))-1            # hidden -> output weights in [-1, 1)
for j in range(1):
    # Forward pass through both sigmoid layers.
    l1 = 1/(1+np.exp(-(np.dot(X,syn0))))
    l2 = 1/(1+np.exp(-(np.dot(l1, syn1))))
    # Output error scaled by the sigmoid derivative, then propagated back to l1.
    l2_delta = (y-l2)*(l2*(1-l2))
    l1_delta = l2_delta.dot(syn1.T)*(l1*(1-l1))
    # Weight updates (no learning-rate factor is applied).
    syn1 += l1.T.dot(l2_delta)
    syn0 += X.T.dot(l1_delta)
    print("Epoch count : " + str(j))