**Kaggle Competition - Santander**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict # for the data, we don't want to have to create a new array

Reading in the data:

In [2]:
# Make printed output appear immediately: stdout is wrapped (below) so that every write is followed by a flush.
import sys
oldsysstdout = sys.stdout  # reference to the stream as it was before wrapping; not used again in the visible cells
class flushfile():
    '''
    Thin proxy around a file-like object that flushes after every write.

    Params: f - the underlying stream; must provide write() and flush()
    Any attribute not defined on the proxy itself is looked up on the wrapped stream.
    '''
    def __init__(self, f):
        self.f = f
    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup failed. If 'f' itself is missing
        # (instance created without __init__, e.g. by copy/pickle), fail cleanly
        # instead of recursing forever through self.f.
        if name == 'f':
            raise AttributeError(name)
        # getattr (rather than object.__getattribute__) honours the wrapped object's own
        # __getattr__, so delegation keeps working when a flushfile wraps another flushfile,
        # which is what happens when the wrapping cell is executed more than once.
        return getattr(self.f, name)
    def write(self, x):
        self.f.write(x)
        self.f.flush()
    def flush(self):
        self.f.flush()
sys.stdout = flushfile(sys.stdout)  # NOTE: wraps whatever sys.stdout currently is, so re-running this cell nests another wrapper

In [3]:
# Module-level containers: a plain list plus two independent column-name -> list-of-values maps.
raw_data = list()
train_data, test_data = (defaultdict(list) for _ in range(2))

In [4]:
def read_file(name):
    '''
    Read a comma-separated text file and separate the header from the data lines.

    Params: name of file
    returns: list [keys from the first line of csv file, rest of the lines as arr]
        - keys: header fields split on ",", with the line terminator removed so the
          last key does not carry a trailing newline
        - rest: remaining lines as raw, unparsed strings (line terminators kept);
          whitespace-only lines are dropped
        An empty file yields [[], []].
    '''
    with open(name, 'r') as f:
        lines = f.readlines()
    if not lines:
        return [[], []]
    # Strip only the line ending so the final column name is clean.
    data_keys = lines[0].rstrip("\r\n").split(",")
    # A trailing blank line would otherwise reach the parser as a one-field row.
    raw_data = [line for line in lines[1:] if line.strip()]
    return [data_keys, raw_data]

In [5]:
def ingest_data(data_keys, data_arr, result, flags=(), flag_replace=float("nan")):
    '''
    Parse raw comma-separated rows into numbers.

    Params:
        data_keys    - keys from first line, one per column
        data_arr     - data from the rest of lines, each a comma-separated string
        result       - dict (defaultdict allowed) mapping key -> list; it is mutated in
                       place: every parsed value is appended to result[key]
        flags        - collection of field strings that mark a sentinel/missing value
                       (default: none)
        flag_replace - value stored in place of any field found in flags (default: NaN)
    returns: 2D ndarray of float64, one row per element of data_arr
    Raises: ValueError if a field is neither a flag nor parseable as a float

    Fields are paired with keys via zip, so surplus fields or keys are ignored.
    '''
    # Taking in the data using the keys
    arr_out = []
    for row in data_arr:
        arr_in = []
        for key, dat in zip(data_keys, row.split(",")):
            # Drop surrounding whitespace / the line terminator so a flag in the last
            # column of a row is still recognised.
            dat = dat.strip()
            # Check the flag first: only non-flag fields have to be valid numbers.
            if dat in flags:
                value = flag_replace
            else:
                value = float(dat)
            result[key].append(value)
            arr_in.append(value)
        arr_out.append(arr_in)
    return np.array(arr_out, dtype=np.float64)

In [6]:
# Load the header keys and the raw (still unparsed) data lines of both CSV files.
# The paths are relative, so the files are looked up in the current working directory.
train_key, raw_train_data = read_file('train.csv')
test_key, raw_test_data = read_file('test.csv')

In [7]:
# Parse the training rows into a float matrix; the two sentinel strings are replaced by -1.
# NOTE: this rebinds train_data from the defaultdict created earlier to the returned ndarray,
# so the per-column lists filled by ingest_data are no longer reachable under this name.
train_data = ingest_data(train_key, raw_train_data, train_data, ["9999999999", "-999999"], -1)
# Test-set ingestion is disabled for now. When re-enabling, note that the call below
# omits the flags / flag_replace arguments used for the training data.
# test_data = ingest_data(test_key, raw_test_data, test_data) Remove this line temporarily for testing

In [8]:
# Sanity-check the sizes after ingestion
print(len(train_data))  # number of rows in the training matrix
print(len(test_data))   # test_data is still the empty defaultdict (its ingestion is commented out)
print(train_data.shape)

76020
0
(76020, 371)


In [9]:
# Drop every column whose entries are all exactly zero.
# columns_removed is a boolean mask over the columns (True = column is discarded).
columns_removed = (train_data == 0).all(axis=0)
train_data = train_data[:, np.logical_not(columns_removed)]
print(train_data.shape)

(76020, 337)


In [10]:
# Split the matrix into X (every column but the last) and y (the last column).
# Basic slices are views into train_data; copy them so that later in-place arithmetic
# on train_X cannot silently modify train_data as well.
train_X = train_data[:, :-1].copy()
train_y = train_data[:, -1].copy()
print(train_X.shape)
print(train_y[:50]) # sanity check: the first 50 labels should contain at least one 1

(76020, 336)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


Data preprocessing

In [11]:
# Normalize: zero mean and unit variance per column.
# The statistics are kept in named variables so the same transform can be applied to other data.
train_mean = np.mean(train_X, axis=0)
train_std = np.std(train_X, axis=0)
# A constant column has std == 0; dividing by it would produce NaN/inf and break the SVD below.
# Such a column is already all zeros after centering, so dividing by 1 leaves it at zero.
train_std[train_std == 0] = 1.0
# Rebind instead of using -= and /= so that an array train_X might be a view of is left untouched.
train_X = (train_X - train_mean) / train_std
# PCA via SVD of the covariance matrix: project onto the first 100 columns of U.
cov = np.dot(train_X.T, train_X)/train_X.shape[0]
U,S,V = np.linalg.svd(cov)
train_X_reduced = np.dot(train_X, U[:,:100])
print(train_X_reduced.shape)

(76020, 100)


In [12]:
# Create CV test set: 4 folds over the row indices of the reduced training matrix.
# NOTE(review): sklearn.cross_validation is the legacy home of KFold. Newer scikit-learn
# releases expose it from sklearn.model_selection with a different signature
# (n_splits=..., iterate with kf.split(X)) - confirm against the installed version.
from sklearn.cross_validation import KFold
kf = KFold(train_X_reduced.shape[0], n_folds=4)
len(kf)  # bare expression in the middle of a cell: its value is not displayed
print(kf)

sklearn.cross_validation.KFold(n=76020, n_folds=4, shuffle=False, random_state=None)


In [13]:
# Minimal two-layer sigmoid network trained with full-batch gradient descent

def sigmoid(x, syn):
    '''
    Logistic activation of a linear layer.

    Params: x - input matrix, syn - weight matrix (must be compatible with np.dot(x, syn))
    returns: 1 / (1 + exp(-z)) element-wise, where z = np.dot(x, syn) / 1000
    '''
    # The pre-activation is divided by a fixed factor of 1000 before the logistic is applied.
    scaled = np.dot(x, syn) / 1000
    return 1 / (1 + np.exp(-scaled))

def forward(syn0, syn1, X):
    '''
    Forward pass through both layers.

    Params: syn0 - first-layer weights, syn1 - second-layer weights, X - input matrix
    returns: list [first-layer activations, second-layer activations]
    '''
    hidden = sigmoid(X, syn0)
    return [hidden, sigmoid(hidden, syn1)]

def train_iter(X, y, verbose = True, hidden_size = 200, n_epochs = 100, learning_rate = 0.01, seed = None):
    '''
    Train the two-layer network on the whole of X for a fixed number of epochs.

    Params:
        X             - 2D input matrix, one sample per row
        y             - targets; reshaped internally to a column vector
        verbose       - when True, print the epoch index and the cost every epoch
        hidden_size   - number of hidden units (default 200)
        n_epochs      - number of full-batch updates (default 100)
        learning_rate - factor applied to both weight updates (default 0.01)
        seed          - optional seed for the weight initialisation; None draws from
                        NumPy's global random state
    returns: list [syn0, syn1] with the trained weight matrices

    Weights start uniform in [-1, 1). The updates use the error term (y - l2) * l2 * (1 - l2),
    while the value printed as "Cost" is the summed cross-entropy of the pre-update
    predictions; it is only a progress indicator.
    '''
    rng = np.random if seed is None else np.random.RandomState(seed)
    syn0 = 2*rng.random_sample((X.shape[1], hidden_size))-1
    syn1 = 2*rng.random_sample((hidden_size,1))-1
    target = y.reshape(-1,1)  # loop-invariant, so reshape once
    for j in range(n_epochs):
        l1, l2 = forward(syn0, syn1, X)
        l2_delta = (target-l2)*(l2*(1-l2))
        l1_delta = l2_delta.dot(syn1.T)*(l1*(1-l1))
        syn1 += l1.T.dot(l2_delta)*learning_rate
        syn0 += X.T.dot(l1_delta)*learning_rate
        if verbose:
            # The cost is only ever printed, so skip computing it on silent runs.
            cost = -np.sum(np.log(l2)*target + np.log(1-l2)*(1-target))
            print("Epoch count : " + str(j))
            print("Cost : " + str(cost))
    return [syn0, syn1]

In [18]:
def predict(syn0, syn1, X, y, threshold=0.25):
    '''
    Classify X with the trained weights and score the result against y.

    Params:
        syn0, syn1 - weight matrices of the two layers
        X          - input matrix, one sample per row
        y          - true labels; reshaped internally to a column vector
        threshold  - an output strictly above this value is predicted as 1 (default 0.25)
    returns: list [boolean column vector of predictions, fraction of predictions equal to y]
    '''
    # Only the final-layer output is needed for prediction.
    _, ans = forward(syn0, syn1, X)
    # turn ans into binary vector
    ans = ans > threshold
    accuracy = np.mean(ans == y.reshape(-1, 1))
    return [ans, accuracy]
    

In [19]:
# 4-fold cross-validation on a subset of the reduced training data.
X = train_X_reduced[:50000,:] # first 50000 rows
y = train_y[:50000].reshape(-1, 1)
kf = KFold(X.shape[0], n_folds=4)  # rebinds kf; uses the legacy (n, n_folds) KFold signature

for train_index, test_index in kf:
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Each fold trains a fresh network (new random weights) with per-epoch logging switched off.
    syn0, syn1 = train_iter(X_train, y_train, False)
    ans, acc = predict(syn0, syn1, X_test, y_test)
    # NOTE(review): in the recorded run only a handful of rows per fold are predicted as 1 while
    # accuracy stays around 0.96, so accuracy here appears to reflect mostly the share of 0 labels.
    print("The # of 1's : " + str(np.sum(ans)) + " and acc : " + str(acc))
    

TRAIN: [12500 12501 12502 ..., 49997 49998 49999] TEST: [    0     1     2 ..., 12497 12498 12499]
The # of 1's : 6 and acc : 0.9608
TRAIN: [    0     1     2 ..., 49997 49998 49999] TEST: [12500 12501 12502 ..., 24997 24998 24999]
The # of 1's : 14 and acc : 0.9564
TRAIN: [    0     1     2 ..., 49997 49998 49999] TEST: [25000 25001 25002 ..., 37497 37498 37499]
The # of 1's : 21 and acc : 0.964
TRAIN: [    0     1     2 ..., 37497 37498 37499] TEST: [37500 37501 37502 ..., 49997 49998 49999]
The # of 1's : 16 and acc : 0.9616
