## CW Intro to ML


In [13]:
# Import the necessary libraries

import numpy as np
from numpy.random import default_rng

In [6]:
# Load the clean dataset from disk and report its (rows, columns) shape.

dataset = np.loadtxt(fname='wifi_db/clean_dataset.txt')

print(dataset.shape)

(2000, 8)


In [16]:
# Shape of the dataset once the last column is dropped.
np.shape(dataset[:, :-1])


(2000, 7)

In [18]:
def read_dataset(filepath):
    """ Read in the dataset from the specified filepath

    The file is parsed with ``np.loadtxt``. Every column except the last is
    treated as a feature; the last column is treated as the class label.

    Args:
        filepath (str): The filepath to the dataset file

    Returns:
        tuple: returns a tuple of (x, y, classes), each being a numpy array.
               - x is a numpy array with shape (N, K),
                   where N is the number of instances
                   K is the number of features/attributes
               - y is a numpy array with shape (N, 1) holding the raw label
                   values read from the last column (they are NOT remapped
                   to 0..C-1 and keep the float dtype produced by loadtxt)
               - classes : a numpy array with shape (C, ), which contains the
                   sorted unique label values found in y
    """

    # ndmin=2 keeps the array two-dimensional even when the file holds a
    # single row; without it loadtxt returns a 1-D array and the column
    # slicing below raises IndexError.
    dataset = np.loadtxt(filepath, ndmin=2)

    x = dataset[:, :-1]
    # Slice with -1: (rather than -1) so the labels stay a column vector.
    y = dataset[:, -1:]

    classes = np.unique(y)

    return (x, y, classes)


def split_dataset(x, y, test_proportion, random_generator=None):
    """ Split dataset into training and test sets, according to the given 
        test set proportion.

    The instances are shuffled with a random permutation of their indices;
    the first N_train shuffled instances form the training set and the
    remaining N_test form the test set, where
    N_test = round(N * test_proportion).
    
    Args:
        x (np.ndarray): Instances, numpy array with shape (N,K)
        y (np.ndarray): Class labels, indexed along its first axis with the
                        same indices as x (e.g. shape (N,) or (N, 1))
        test_proportion (float): the desired proportion of test examples 
                                 (0.0-1.0)
        random_generator (np.random.Generator): A random generator. When
                                 None (the default) a fresh, unseeded
                                 generator is created for this call.

    Returns:
        tuple: returns a tuple of (x_train, x_test, y_train, y_test) 
               - x_train (np.ndarray): Training instances shape (N_train, K)
               - x_test (np.ndarray): Test instances shape (N_test, K)
               - y_train (np.ndarray): Training labels, first axis N_train
               - y_test (np.ndarray): Test labels, first axis N_test
    """

    # A `default_rng()` default argument would be created once, when the
    # function is defined, and then shared (and advanced) by every call.
    if random_generator is None:
        random_generator = default_rng()

    n_total = len(x)
    shuffled_indices = random_generator.permutation(n_total)
    n_test = round(n_total * test_proportion)
    n_train = n_total - n_test

    train_indices = shuffled_indices[:n_train]
    test_indices = shuffled_indices[n_train:]

    x_train = x[train_indices]
    y_train = y[train_indices]
    x_test = x[test_indices]
    y_test = y[test_indices]
    return (x_train, x_test, y_train, y_test)



# Load the dataset, then hold out 20% of it as a test set. The generator is
# seeded so that the same split is produced on every run.
x, y, classes = read_dataset('wifi_db/clean_dataset.txt')

seed = 60012
rg = default_rng(seed)
x_train, x_test, y_train, y_test = split_dataset(
    x, y, test_proportion=0.2, random_generator=rg
)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(1600, 7) (1600, 1)
(400, 7) (400, 1)


In [25]:
# Number of training labels equal to 4.
int(np.sum(y_train == 4))

392

In [31]:
# Helper functions (entropy, remainder, information gain) needed to build the decision tree



def H(y, labels):
    """ Compute the entropy (in bits) of the labels in y.

    For each label in ``labels`` the proportion of entries of ``y`` equal to
    that label is computed, and ``-p * log2(p)`` is summed over the labels
    with non-zero proportion.

    Args:
        y (np.ndarray): Labels of the instances being measured. Any shape is
                        accepted; every element counts as one instance.
        labels (iterable): The label values to account for. Values present
                        in y but absent from labels contribute nothing.

    Returns:
        float: The entropy of y. 0 is returned when y is empty or when none
               of the given labels occurs in y.
    """
    y = np.asarray(y)

    # Normalise by the number of labels actually passed in, so that the
    # proportions describe this subset rather than some other array.
    n_samples = y.size
    if n_samples == 0:
        # An empty set carries no uncertainty; also avoids dividing by zero.
        return 0

    entropy = 0
    for label in labels:
        prob = np.count_nonzero(y == label) / n_samples
        if prob == 0:
            # log2(0) is undefined; an absent label contributes nothing.
            continue
        entropy -= prob * np.log2(prob)

    return entropy

# Quick check of H on the training labels; the value is not displayed
# because this is not the last expression in the cell.
H(y=y_train, labels=classes)

def remainder(y_left, y_right, labels):
    """ Size-weighted sum of the entropies of the two sides of a split.

    Args:
        y_left (np.ndarray): Labels on the left side of the split
        y_right (np.ndarray): Labels on the right side of the split
        labels (iterable): The label values passed through to H

    Returns:
        float: (n_left / n) * H(y_left) + (n_right / n) * H(y_right), where
               n_left and n_right are the first-axis lengths of the two
               label arrays and n is their sum
    """
    n_left = y_left.shape[0]
    n_right = y_right.shape[0]
    total = n_left + n_right

    weighted_left = (n_left / total) * H(y_left, labels)
    weighted_right = (n_right / total) * H(y_right, labels)

    return weighted_left + weighted_right

def gain(y_all, x_left, y_left, x_right, y_right, labels):
    """ Information gain obtained by splitting y_all into y_left and y_right.

    Computed as the entropy of the labels before the split minus the
    size-weighted entropy (remainder) of the two sides after the split.

    Args:
        y_all (np.ndarray): Labels of all instances before the split
        x_left (np.ndarray): Instances on the left side. Unused; kept so the
                             positional signature does not change.
        y_left (np.ndarray): Labels on the left side of the split
        x_right (np.ndarray): Instances on the right side. Unused; kept so
                             the positional signature does not change.
        y_right (np.ndarray): Labels on the right side of the split
        labels (iterable): The label values passed through to H

    Returns:
        float: H(y_all, labels) - remainder(y_left, y_right, labels)
    """
    # H and remainder operate on labels only, so the feature arrays are not
    # forwarded to them.
    return H(y_all, labels) - remainder(y_left, y_right, labels)
            
    
    
    

In [32]:
# gain is defined with six parameters
# (y_all, x_left, y_left, x_right, y_right, labels); it has no leading
# feature-matrix parameter, so only the labels of the full dataset are passed.
gain(y, x_train, y_train, x_test, y_test, classes)

0.0017143071306899849