## CW Intro to ML


In [3]:
#import necessary librairies

import numpy as np
from numpy.random import default_rng

In [29]:
# dataset read 

dataset = np.loadtxt('wifi_db/clean_dataset.txt')

print(dataset.shape)

(2000, 8)


In [30]:
dataset[:,:-1].shape


(2000, 7)

In [6]:
def read_dataset(filepath):
    """ Read in the dataset from the specified filepath

    Args:
        filepath (str): The filepath to the dataset file

    Returns:
        tuple: returns a tuple of (x, y, classes), each being a numpy array. 
               - x is a numpy array with shape (N, K), 
                   where N is the number of instances
                   K is the number of features/attributes
               - y is a numpy array with shape (N, ), and should be integers from 0 to C-1
                   where C is the number of classes 
               - classes : a numpy array with shape (C, ), which contains the 
                   unique class labels corresponding to the integers in y
    """

    x = []
    y_labels = []
    dataset = np.loadtxt(filepath)
    
    x = dataset[:,:-1]
    y = dataset[:,-1:]
    
    classes = np.unique(y)


    return (x, y, classes)


def split_dataset(x, y, test_proportion, random_generator=default_rng()):
    """ Split dataset into training and test sets, according to the given 
        test set proportion.
    
    Args:
        x (np.ndarray): Instances, numpy array with shape (N,K)
        y (np.ndarray): Class labels, numpy array with shape (N,)
        test_proprotion (float): the desired proportion of test examples 
                                 (0.0-1.0)
        random_generator (np.random.Generator): A random generator

    Returns:
        tuple: returns a tuple of (x_train, x_test, y_train, y_test) 
               - x_train (np.ndarray): Training instances shape (N_train, K)
               - x_test (np.ndarray): Test instances shape (N_test, K)
               - y_train (np.ndarray): Training labels, shape (N_train, )
               - y_test (np.ndarray): Test labels, shape (N_train, )
    """

    shuffled_indices = random_generator.permutation(len(x))
    n_test = round(len(x) * test_proportion)
    n_train = len(x) - n_test
    x_train = x[shuffled_indices[:n_train]]
    y_train = y[shuffled_indices[:n_train]]
    x_test = x[shuffled_indices[n_train:]]
    y_test = y[shuffled_indices[n_train:]]
    return (x_train, x_test, y_train, y_test)



(x, y, classes) = read_dataset('wifi_db/clean_dataset.txt')

seed = 60012
rg = default_rng(seed)
x_train, x_test, y_train, y_test = split_dataset(x, y, 
                                                 test_proportion=0.2, 
                                                 random_generator=rg)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(1600, 7) (1600, 1)
(400, 7) (400, 1)


In [7]:
np.count_nonzero(y_train == 4)

392

In [41]:
# necessary functions to compute data in order to make the decision tree



def H(y, labels): 
    
    entropy = 0
    
    
    N = x.shape[0]
    
    for label in labels:
        prob = np.count_nonzero(y == label)/N
        if prob == 0:
            continue
        entropy -= prob*np.log2(prob)
    
    return entropy

H(y_train, classes)

def remainder(y_left, y_right, labels):
    N = y_left.shape[0] + y_right.shape[0]
    
    rem = (y_left.shape[0]/N)*H(y_left, labels) + (y_right.shape[0]/N)*H(y_right, labels)
    
    return rem

def gain(y_all, y_left, y_right, labels):
    return H(y_all, labels) - remainder(y_left, y_right, labels)
            
    
    
    

In [42]:
gain(y, y_train, y_test, classes)

0.3416331631753122

In [89]:
def find_split(data, labels):
    
    n_features = len(data[0]) - 1
    
    best_gain = gain(data[:-1], data[data[:,0]>data[0][0]][:-1], data[data[:,0]<=data[0][0]][:-1], classes)
    best_thresh = data[0][0]
    best_feat = 0
    
    for i in range(n_features):
        
        best_feat_gain = gain(data[:-1], data[data[:,i]>data[i][0]][:-1], data[data[:,i]<=data[i][0]][:-1], classes)
        best_feat_thresh = data[i][0]
        
        for xi in data[:,i]:

            right = data[data[:,i]>xi]
            left = data[data[:,i]<=xi]
            
            feat_gain = gain(data[:-1], left[:-1], right[:-1], labels)
            
            if feat_gain > best_feat_gain:
                best_feat_gain = feat_gain
                best_feat_thresh = xi
                
        print(best_feat_gain, best_feat_thresh)      
        if best_feat_gain > best_gain:
            best_feat = i
            best_gain = best_feat_gain
            best_thresh = best_feat_thresh
            
            
    right = data[data[:,best_feat] > best_thresh]
    left = data[data[:,best_feat] <= best_thresh]
    
    return best_feat, best_thresh


            
            
   
            
        
        
        
        
        
        

In [87]:
best_feat, best_thresh, right, left = find_split(dataset, classes)

0.9707244042265755 -55.0
0.5094277825857527 -56.0
0.6518768781318678 -56.0
0.9123691914431387 -57.0
0.7469761444598508 -63.0
0.6955053240632996 -81.0
0.6908951549707205 -81.0


In [77]:
print(len(right), len(left))

1202 798


In [90]:
find_split(dataset, classes)

0.9707244042265755 -55.0
0.5094277825857527 -56.0
0.6518768781318678 -56.0
0.9123691914431387 -57.0
0.7469761444598508 -63.0
0.6955053240632996 -81.0
0.6908951549707205 -81.0


(0,
 -55.0,
 array([[-42., -53., -62., ..., -65., -69.,   2.],
        [-44., -55., -61., ..., -72., -68.,   2.],
        [-41., -58., -56., ..., -69., -73.,   2.],
        ...,
        [-52., -61., -48., ..., -80., -80.,   4.],
        [-54., -57., -50., ..., -83., -82.,   4.],
        [-54., -46., -48., ..., -84., -85.,   4.]]),
 array([[-64., -56., -61., ..., -82., -81.,   1.],
        [-68., -57., -61., ..., -85., -85.,   1.],
        [-63., -60., -60., ..., -85., -84.,   1.],
        ...,
        [-62., -59., -46., ..., -87., -88.,   4.],
        [-62., -58., -52., ..., -90., -85.,   4.],
        [-59., -50., -45., ..., -88., -87.,   4.]]))

In [74]:
len(data[:,0])

2000

In [91]:
len(data)

2000

In [108]:
def make_node(split_feat, split_value):
    node = {(split_feat,split_value): ({},{})}
    return node

def make_leaf(data, classes):
    N = len(data)
    result =  np.argmax([len(data[data[:,-1] == label]) for label in classes])/N
    return result



In [109]:
make_leaf(data, classes)

0.0

In [104]:
def decision_tree_learning(dataset, depth):
    feat, thresh = find_split(dataset)
    node = node(feat, thresh)
    
    node[(feat_thresh)] = 

500

500