In [29]:
import matplotlib.pyplot as plt
import numpy as np
import gzip as gz
from math import log
import Queue

In [24]:
def pca(data, k_features):
    '''name:         pca
       description:  reduces a data set to its first k principal components:
                     the data is centered about the origin (column means removed);
                     the covariance of the features is calculated;
                     the eigenvectors of the covariance are found with an SVD, which
                     returns them in descending order of their eigenvalues;
                     the centered data is then projected onto the first k eigenvectors
       dependencies: none
       inputs:       data - is an N x K matrix with the rows representing observations and columns representing features
                     k_features - is an integer representing the number of principal components or features to keep
       outputs:      reduced_data - an N x k_features matrix, or None (after printing an
                     error message) when k_features is not in the range 0 < k_features <= K
    '''
    # reject an invalid number of components up front
    if not (0 < k_features <= data.shape[1]):
        # format before printing so the call behaves the same on Python 2 and 3
        print('ERROR: 0 < k_features <= %i' % data.shape[1])
        return None

    # center the data; the projection must use the same centered data
    # that the covariance matrix describes
    centered = data - np.mean(data, axis=0)

    # covariance of the features (sigma); atleast_2d covers the single-feature
    # case, where np.cov returns a 0-d array that svd cannot handle
    sigma = np.atleast_2d(np.cov(centered.T))

    # sigma is symmetric positive semi-definite, so the left singular vectors
    # are its eigenvectors, sorted by descending eigenvalue
    eigen_vecs, _, _ = np.linalg.svd(sigma)

    # project every observation onto the leading eigenvectors in one product
    return np.dot(centered, eigen_vecs[:, :k_features])

In [25]:
def get_cases(data, cases):
    '''name:         get_cases
       description:  takes an N x K matrix and returns an N' x K matrix that holds
                     only the rows whose label matches one of the specified cases
       dependencies: None
       inputs:       data - N x K matrix of data with labels in col 0
                     cases - iterable (e.g. tuple) of labels to be kept
       outputs:      N' x K matrix with the matching rows in their original order;
                     a 0 x K matrix when no row matches or cases is empty
    '''
    labels = data[:, 0]

    # start from an all-False row mask so that an empty `cases` still yields a
    # mask of shape (N,) and therefore a 0 x K result
    keep = np.zeros(labels.shape[0], dtype=bool)

    # switch on every row whose label equals one of the requested cases
    for case in cases:
        keep |= (labels == case)

    # return only the selected rows
    return data[keep]

In [26]:
def clean_zip_code_data(filename, cases, num_features):
    '''name:         clean_zip_code_data
       description:  loads a gzipped text matrix, keeps only the rows whose label is
                     one of the given cases, relabels them as 0/1 and reduces the
                     feature columns with PCA
       dependencies: get_cases, pca
       inputs:       filename - name of the gzipped file
                     cases - a tuple of two digits; rows labeled cases[0] become 0,
                             every other kept row becomes 1
                     num_features - number of principal components to keep
       outputs:      returns a matrix with the binary labels in column zero followed
                     by the num_features PCA columns
    '''
    # load the whole gzipped file as a numeric matrix
    with gz.open(filename) as handle:
        raw = np.loadtxt(handle)

    # keep only the rows that belong to the requested cases
    selected = get_cases(raw, cases)

    # labels stay a column vector so they can be stacked next to the features;
    # the first case maps to 0, anything else to 1
    binary_labels = np.where(selected[:, :1] == cases[0], 0, 1)

    # everything after column zero is a feature; compress it with pca
    reduced_features = pca(selected[:, 1:], num_features)

    # labels first, reduced features after
    return np.hstack((binary_labels, reduced_features))

In [27]:
class node(object):
    '''name:         node
       description:  one node of a tree; holds a random unit vector, one gaussian
                     density per class, an entropy value and links to two children
    '''

    def __init__(self):
        # unit vector created by set_randomVec
        self.randomVector = None
        # callables x -> density value, created by set_gaussian
        self.gaussian_positive = None
        self.gaussian_negative = None
        # entropy in bits, set by set_entropy
        self.entropy = None
        # child nodes (None while the node is a leaf)
        self.left = None
        self.right = None

    def set_randomVec(self, size):
        '''name:         set_randomVec
           description:  Creates a random unit vector
           dependencies: None
           inputs:       size - is the number of features in the data
           outputs:      None (sets self.randomVector)
        '''
        # np.random.rand draws from [0, 1), so every component is non-negative
        x = np.random.rand(size)

        # normalize x to unit length and set data member
        self.randomVector = x / np.linalg.norm(x)

    def set_gaussian(self, data, positive=True):
        '''name:         set_gaussian
           description:  Fits a multivariate gaussian to the data and stores its
                         density function
           dependencies: None
           inputs:       data - is an N x M matrix for data pertaining to one class
                         positive - boolean value set true if positive labeled data, else negative labeled data
           outputs:      None (sets self.gaussian_positive or self.gaussian_negative
                         to a function mapping a length-M vector to its density)
           raises:       numpy.linalg.LinAlgError if the covariance matrix is singular
        '''
        data = np.asarray(data, dtype=float)

        # the dimension of the gaussian is the number of features (columns)
        num_dims = data.shape[1]

        # calculate the covariance and mean of the data; atleast_2d covers the
        # single-feature case, where np.cov returns a 0-d array
        sigma = np.atleast_2d(np.cov(data.T))
        mu = np.mean(data, axis=0)

        # invert once here instead of on every evaluation of the density
        sigma_inv = np.linalg.inv(sigma)

        # constant part of the gaussian formula: 1 / sqrt((2*pi)^M * |sigma|)
        const = 1.0 / np.sqrt(((2 * np.pi) ** num_dims) * np.linalg.det(sigma))

        def density(x):
            # quadratic form (x-mu)^T sigma^-1 (x-mu) needs matrix products,
            # not elementwise multiplication
            diff = np.asarray(x, dtype=float) - mu
            return const * np.exp(-0.5 * np.dot(diff, np.dot(sigma_inv, diff)))

        # store the density under the member selected by the flag
        if positive:
            self.gaussian_positive = density
        else:
            self.gaussian_negative = density

    def set_entropy(self, num_pos, num_neg):
        '''name:         set_entropy
           description:  Calculates the entropy (in bits) of a two class split
           dependencies: None
           inputs:       num_pos - number of positive labeled data observations
                         num_neg - number of negative labeled data observations
           outputs:      None (sets self.entropy; 0.0 when there are no
                         observations or only one class is present)
        '''
        # get total number of observations
        total = num_pos + num_neg

        entropy = 0.0
        if total > 0:
            for count in (num_pos, num_neg):
                # an empty class contributes nothing (p*log(p) -> 0 as p -> 0)
                # and skipping it avoids log(0)
                if count > 0:
                    # float() keeps the division exact on Python 2 as well
                    prob = float(count) / total
                    entropy -= prob * log(prob, 2)

        # set entropy of data
        self.entropy = entropy

In [28]:
class random_forest(object):
    '''name:         random_forest
       description:  holds a list of trees built from node objects; every tree node
                     stores one gaussian per class and splits its data by which
                     gaussian gives an observation the higher density
    '''

    def __init__(self, num_trees=1, stoping_condition=5, pruning_condition=0.1):
        # number of trees built by train
        self.num_trees = num_trees
        # roots of the trained trees
        self.trees = []
        # a child is only created for more than this many observations
        self.stoping_condition = stoping_condition
        # stored for later use; nothing in this class reads it yet
        self.pruning_condition = pruning_condition

    def __create_tree(self, data):
        '''name:         __create_tree
           description:  builds one tree breadth first; each node fits a gaussian to
                         each class, relabels its observations with the more likely
                         class and hands the two resulting groups to its children
           dependencies: node
           inputs:       data - N x (1 + M) matrix; assumes the labels (0 or 1) are in
                                column 0 and the features in the remaining columns,
                                the layout clean_zip_code_data returns - TODO confirm
           outputs:      root node of the tree
        '''
        # a local import of deque gives a FIFO queue on both Python 2 and 3
        # (the Queue module only exists under that name on Python 2)
        from collections import deque

        # column 0 holds the labels, so it does not count as a feature
        num_features = data.shape[1] - 1

        # create root node and add it to the queue together with its data
        root = node()
        pending = deque([(root, data)])

        # while queue is not empty
        while pending:

            # get current node and the observations that reached it
            current_node, current_data = pending.popleft()
            num_observation = current_data.shape[0]
            labels = current_data[:, 0]
            features = current_data[:, 1:]

            # create/set nodes random vector
            current_node.set_randomVec(num_features)

            # each gaussian must be fitted on one class only; label 0 is the
            # "positive" class because 0 is predicted when the positive gaussian wins
            positive_rows = features[labels == 0]
            negative_rows = features[labels == 1]

            # a covariance estimate needs at least two observations per class,
            # otherwise the node stays a leaf
            if positive_rows.shape[0] < 2 or negative_rows.shape[0] < 2:
                continue

            try:
                # create/set nodes positive and negative gaussians
                current_node.set_gaussian(positive_rows)
                current_node.set_gaussian(negative_rows, positive=False)

                # classify every observation with the more likely class
                predicted_labels = np.empty(num_observation)
                for observation_idx in range(num_observation):
                    observation = features[observation_idx, :]
                    pos_prob = current_node.gaussian_positive(observation)
                    neg_prob = current_node.gaussian_negative(observation)
                    predicted_labels[observation_idx] = 0 if pos_prob > neg_prob else 1
            except np.linalg.LinAlgError:
                # singular covariance: no usable gaussians, keep the node as a leaf
                current_node.gaussian_positive = None
                current_node.gaussian_negative = None
                continue

            # calculate/set entropy of the predicted split
            predicted_positive = (predicted_labels == 0)
            num_pred_pos = int(np.sum(predicted_positive))
            num_pred_neg = num_observation - num_pred_pos
            current_node.set_entropy(num_pred_pos, num_pred_neg)

            # a split that sends every observation to one side would hand the
            # child exactly the data of its parent and never terminate
            if num_pred_pos == 0 or num_pred_neg == 0:
                continue

            # if enough observations were predicted positive
            if num_pred_pos > self.stoping_condition:

                # create left node and queue it with the data predicted positive
                current_node.left = node()
                pending.append((current_node.left, current_data[predicted_positive]))

            # if enough observations were predicted negative
            if num_pred_neg > self.stoping_condition:

                # create right node and queue it with the data predicted negative
                current_node.right = node()
                pending.append((current_node.right, current_data[~predicted_positive]))

        # return root of tree
        return root

    def train(self, data):
        '''name:         train
           description:  builds num_trees trees from the data and appends their
                         roots to self.trees
           dependencies: __create_tree
           inputs:       data - labeled data matrix, passed unchanged to __create_tree
           outputs:      None
        '''
        # NOTE(review): the random vector of a node is not used when splitting,
        # so every tree built from the same data has the same structure
        for _ in range(self.num_trees):

            # add each tree to list of trees
            self.trees.append(self.__create_tree(data))

    def predict(self,data):
        '''name:         predict
           description:  placeholder, not implemented yet
           dependencies: None
           inputs:       data - unused
           outputs:      None
        '''
        pass

    def score(self,y,y_hat):
        '''name:         score
           description:  placeholder, not implemented yet
           dependencies: None
           inputs:       y - unused
                         y_hat - unused
           outputs:      None
        '''
        pass

In [23]:
# TODO
# get rid of entropy and use a score instead
# add a label data member to node and set the label while building the tree
# create a prune member function that sets a node's children to None if the score is good


#rf = random_forest()
#rf.train(None)

AttributeError: 'NoneType' object has no attribute 'shape'