In [2]:
import matplotlib.pyplot as plt
import numpy as np
import gzip as gz
from math import log

In [158]:
def pca(data, k_features):
    '''name:         pca
       description:  reduces the dimensionality of a data set with principal component analysis:
                     the covariance of the features is calculated (np.cov centers the data internally);
                     the eigenvectors of the covariance are found in descending order of their
                     eigenvalues; the original (uncentered) data is then projected onto the first
                     k_features eigenvectors, creating a new N x k_features matrix
       dependencies: none
       inputs:       data - is an N x K matrix with the rows representing observations and columns representing features
                     k_features - is an integer representing the number of principal components or features to keep
       outputs:      reduced_data - an N x k_features matrix, or None (after printing an error
                     message) when k_features is outside 0 < k_features <= K
    '''
    data = np.asarray(data, dtype=float)

    # check 0 < k_features <= number of features
    if not 0 < k_features <= data.shape[1]:
        # the formatting happens inside print() so the message is built the same way
        # under Python 2 and Python 3 (print(...) % x fails on Python 3)
        print('ERROR: 0 < k_features <= %i' % data.shape[1])
        return None

    # covariance matrix (sigma) of the features; atleast_2d keeps the single-feature
    # case usable, where np.cov returns a 0-d array that svd cannot handle
    sigma = np.atleast_2d(np.cov(data.T))

    # sigma is symmetric positive semi-definite, so its left singular vectors are its
    # eigenvectors, already sorted by descending eigenvalue
    eigen_vecs, _, _ = np.linalg.svd(sigma)

    # project every observation onto the leading eigenvectors with one matrix product
    reduced_data = np.dot(data, eigen_vecs[:, :k_features])

    # return dimensionally reduced data
    return reduced_data

In [159]:
def get_cases(data, cases):
    '''name:         get_cases
       description:  takes an N x K matrix and returns an N' x K matrix holding only the
                     rows whose label (column 0) matches one of the specified cases
       dependencies: None
       inputs:       data - N x K matrix of data with labels in col 0
                     cases - tuple of labels to be kept
       outputs:      N' x K matrix of the matching rows, in their original order
                     (0 x K when no row matches or when cases is empty)
    '''
    data = np.asarray(data)

    # one membership test against all cases; unlike OR-reducing a list of comparisons,
    # this still yields a length-N boolean mask when cases is empty
    keep_rows = np.isin(data[:, 0], list(cases))
    return data[keep_rows]

In [173]:
def clean_zip_code_data(filename, cases, num_features):
    '''name:         clean_zip_code_data
       description:  loads a gzip-compressed text file of labeled observations, keeps only
                     the rows whose label is in cases, and reduces the feature columns
                     with PCA
       dependencies: get_cases, pca
       inputs:       filename - name of the gzipped file (labels in column zero)
                     cases - a tuple of the labels to keep
                     num_features - number of principal components handed to pca
       outputs:      returns a matrix with the labels in column zero followed by the
                     PCA-reduced features
    '''
    # load the whole file into one array
    with gz.open(filename) as handle:
        raw = np.loadtxt(handle)

    # keep only the rows for the requested labels
    selected = get_cases(raw, cases)

    # column zero holds the labels (kept as an N x 1 column); the rest are features
    labels = selected[:, :1]
    features = pca(selected[:, 1:], num_features)

    # glue the labels back in front of the reduced features
    return np.hstack((labels, features))

In [176]:
class node(object):
    '''A tree node holding a random projection vector, one Gaussian density per class,
       the entropy of the data at the node, and pointers to its two children.
       Every data member is None until the matching set_* method is called.
    '''

    def __init__(self):
        self.randomVector = None
        self.gaussian_positive = None
        self.gaussian_negative = None
        self.entropy = None
        self.left = None
        self.right = None

    def set_randomVec(self, size):
        '''name:         set_randomVec
           description:  Creates a random unit vector; components are drawn from [0, 1)
                         so every component of the vector is non-negative
           dependencies: None
           inputs:       size - is the number of features in the data
           outputs:      None
        '''
        # create a random vector
        x = np.random.rand(size)

        # normalize x and set data member
        self.randomVector = x / np.linalg.norm(x)

    def set_gaussian(self, data, positive=True):
        '''name:         set_gaussian
           description:  Fits a multivariate gaussian to the data and stores its density
                         function (a callable taking one observation x of M features and
                         returning the density at x)
           dependencies: None
           inputs:       data - is an N x M matrix for data pertaining to one class
                                (a flat vector of N values is treated as N x 1)
                         positive - boolean value set true if positive labeled data, else negative labeled data
           outputs:      None; np.linalg.LinAlgError is raised if the covariance is singular
        '''
        data = np.asarray(data, dtype=float)
        if data.ndim == 1:
            data = data[:, np.newaxis]
        num_features = data.shape[1]

        # calculate the covariance and mean of the data; atleast_2d covers the
        # single-feature case where np.cov returns a 0-d array
        sigma = np.atleast_2d(np.cov(data.T))
        mu = np.mean(data, axis=0)

        # invert once here instead of on every evaluation of the density
        sigma_inv = np.linalg.inv(sigma)

        # normalising constant 1 / sqrt((2*pi)^M * det(sigma)), M = number of features
        const = 1.0 / np.sqrt(((2 * np.pi) ** num_features) * np.linalg.det(sigma))

        def gaussian(x):
            # flatten so row vectors, column vectors and scalars are all accepted
            diff = np.ravel(x) - mu
            # quadratic form (x-mu)^T sigma^-1 (x-mu) via matrix products
            return const * np.exp(-0.5 * np.dot(diff, np.dot(sigma_inv, diff)))

        # if positive flag is true set positive gaussian data member
        if positive:
            self.gaussian_positive = gaussian

        # else set negative gaussian data member
        else:
            self.gaussian_negative = gaussian

    def set_entropy(self, num_pos, num_neg):
        '''name:         set_entropy
           description:  Calculates the entropy (in bits) of the data:
                         -(p_pos * log2(p_pos) + p_neg * log2(p_neg)), where a class
                         with no observations contributes zero
           dependencies: None
           inputs:       num_pos - number of positive labeled data observations
                         num_neg - number of negative labeled data observations
           outputs:      None; ValueError is raised if a count is negative or both are zero
        '''
        # get total number of observations
        N = num_pos + num_neg
        if num_pos < 0 or num_neg < 0 or N <= 0:
            raise ValueError('num_pos and num_neg must be non-negative and not both zero')

        # accumulate -p * log2(p) per class; empty classes are skipped because
        # log(0) is undefined and the limit of p * log(p) at zero is zero
        entropy = 0.0
        for count in (num_pos, num_neg):
            if count > 0:
                # float() keeps true division under Python 2 as well
                prob = float(count) / N
                entropy -= prob * log(prob, 2)

        # set entropy of data
        self.entropy = entropy

    def set_leafPointer(self, node, left=True):
        '''name:         set_leafPointer
           description:  Creates pointers to this node's children
           dependencies: None
           inputs:       node - is the child
                         left - boolean value set to true if left child, else right child
           outputs:      None
        '''
        # if flag is true, set new node to left data member
        if left:
            self.left = node

        # otherwise set node to right data member
        else:
            self.right = node

In [None]:
class random_forest(object):
    '''Skeleton of a random forest classifier. Only the constructor does anything;
       every other method is an unimplemented stub that returns None, with the
       planned steps kept as comments.
    '''

    def __init__(self):
        self.root = None

    def train(self, data, num_tree):
        '''Stub: planned to build num_tree trees from data. Currently returns None.'''
        # for i in range num_tree
            # create tree

        pass

    def create_tree(self):
        '''Stub: planned to grow one tree. Currently returns None.'''
        # while number of observations is greater than five

            # separate data by class

            # create a new node

            # create/set node's random vector

            # create/set node's positive and negative gaussians

            # for each observation

                # calculate negative and positive probability

                # classify observation

            # calculate/set entropy of node

        pass

    def predict(self, data):
        '''Stub: planned to predict labels for data. Currently returns None.'''
        pass

    def score(self, y, y_hat):
        '''Stub: planned to score predictions y_hat against labels y. Currently returns None.'''
        pass

    def __question(self, data):
        '''Stub: planned to split data at one node. Currently returns None.'''
        # separate into positive and negative class

        # get the mean for both the positive and negative class

        # compute the covariance matrix for both positive and negative classes

        # create Gaussian function for both positive and negative classes...

        # create one random vector that has size of number of features

        # for each observation,
            # project it onto the random vector

            # get probabilities from gaussian functions

            # compare the output, the probabilities and label accordingly

        # return node, left and right data

        pass