Name: Shiska Raut <br>
ID:   1001526329

In [118]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
%matplotlib inline

## Read training/evaluation data

**Argument(s):** 
1) filename: name of a .txt file with each line containing training/evaluation features(x) and label(y) in the following format:
((x1, x2, .....xn), y) <br>
2) dtype_x : datatype of features <br>
3) dtype_y: datatype of label <br>

**Return(s):** 'X, Y' where X is a numpy array of feature vectors and Y is the target label vector.
Note: <br>
Each column in the array(s) epresents a single datapoint. <br>

In [119]:
def get_X_Y_arrays(filename, dtype_x, dtype_y):
    try:
        f = open(filename, 'r')
    except OSError:
        print(f'{filename} could not be opened.\n')
        sys.exit()

    # initialize list to store feature and labels for training data
    features = []
    labels = []

    with f:
        line = f.readline()
        while line != '':
            # strip newline and outer parenthesis
            line = line.strip('\n')
            line = line.strip('( )')

            # extrace label and append to labels list
            single_label = line.split('), ')[-1]
            labels.append(single_label)

            # extrace features and append to features list
            feat = line.split('), ')[0].split(', ')
            features.append(feat)

            # read next line
            line = f.readline()

        # create dataframe of features and append labels
        X = np.array(features, dtype=dtype_x, ndmin=2)

        # convert labels list to array
        Y = np.array(labels, dtype=dtype_y)

        return X, Y

### Helper Functions

In [120]:
# splits data into training and test set
# first 6 datapoints become the test datapoints
def split_data(X, Y):

    X_test = X[0:6, :]
    Y_test = Y[0:6]

    X_train = np.delete(X, np.s_[0:6], axis = 0)
    Y_train = np.delete(Y, np.s_[0:6])

    return X_train, Y_train, X_test, Y_test

# given an array of categorical values, returns encoded index
def get_encoded_Y(Y):
    
    # get number of samples
    n_samp = Y.shape[0]
    
    # get all unique labels
    uniq_labels = set(Y.tolist())
    
    # create dictionary to store encodings
    encoding_dict = {}
    for i, label in enumerate(uniq_labels):
        encoding_dict[label] = i
    
    # get number of labels
    n_labels = len(uniq_labels)
    
    # create encoded arr
    encoded_arr = np.zeros(n_samp, dtype = np.int16)
    for i in range(n_samp):
        encoded_arr[i] = encoding_dict[Y[i]]
        
    # reverse dictionary for decoding
    inv_encoding_dict = {v: k for k, v in encoding_dict.items()}
        
    return encoded_arr, inv_encoding_dict

### Provide Filename

In [121]:
fname = '2_data.txt'

# read data 
X, Y = get_X_Y_arrays(fname, float, str)

# get encoded Y
encoded_Y, decoding_dict = get_encoded_Y(Y)

# split data into training and test
X_train, Y_train, X_test, Y_test = split_data(X, encoded_Y)

# create dataframe
df_train = pd.DataFrame(np.concatenate((X_train, Y_train.reshape(X_train.shape[0], 1)), axis = 1),
                  columns = ['height', 'diameter', 'weight', 'hue', 'labels'], dtype = np.float32)

df_test = pd.DataFrame(np.concatenate((X_test, Y_test.reshape(X_test.shape[0], 1)), axis = 1),
                  columns = ['height', 'diameter', 'weight', 'hue', 'labels'], dtype = np.float32)

In [122]:
# view training data
df_train.head(10)

Unnamed: 0,height,diameter,weight,hue,labels
0,0.05,0.067191,0.108086,3.612651,1.0
1,0.097218,0.134276,0.530302,3.581389,1.0
2,0.082333,0.052483,0.207009,4.268849,1.0
3,0.053522,0.03,0.1,3.402305,1.0
4,0.092504,0.068184,0.422383,2.651294,2.0
5,0.142357,0.139163,0.471437,4.120488,0.0
6,0.076536,0.085016,0.263367,3.441823,1.0
7,0.168621,0.15,0.597853,4.768459,0.0
8,0.162522,0.15,0.582545,3.557087,0.0
9,0.078976,0.049397,0.151682,3.824761,1.0


In [123]:
df_test

Unnamed: 0,height,diameter,weight,hue,labels
0,0.103335,0.079225,0.187839,2.786577,0.0
1,0.185991,0.142774,0.608519,5.647819,0.0
2,0.066903,0.04501,0.118447,2.780018,1.0
3,0.12899,0.15,0.433086,3.13923,0.0
4,0.05,0.076688,0.178602,3.865847,1.0
5,0.134085,0.084665,0.271117,3.604623,0.0


In [124]:
df_train[df_train['height'] < 0.15]['height'].count()

88

In [125]:
df_train['labels'].mode()[0]

0.0

In [126]:
list(df_train.columns[:-1])

['height', 'diameter', 'weight', 'hue']

In [127]:
fname_small = '2_a_train.txt'

# read data 
X, Y = get_X_Y_arrays(fname_small, float, str)

# create dataframe
df_small = pd.DataFrame(np.concatenate((X, Y.reshape(X.shape[0], 1)), axis = 1),
                  columns = ['height', 'diameter', 'weight', 'hue', 'labels'])


In [129]:
df_small.sort_values('height', ascending = True, ignore_index = True)[df_small['height'] <= 0.1]

TypeError: '<=' not supported between instances of 'str' and 'float'

In [83]:
# calculates thresholds 
height_ser = df_small.sort_values('height', ascending = True,
                                  ignore_index = True)['height'].astype('float')
print('All thresholds for "height" are:\n')
for i in range(5):
    print(f'T{i+1}: {(height_ser[i] + height_ser[i+1])/2}')

All thresholds for "height" are:

 T1: 0.07330346559893901
 T2: 0.086461750222544
 T3: 0.09858549301838049
 T4: 0.11294496166415
 T5: 0.13982581241078


In [56]:
df_small.sort_values('diameter')

Unnamed: 0,height,diameter,weight,hue,labels
2,0.085536050586897,0.03,0.11915260588651,2.2104287108141,Metal
3,0.061070880610981,0.056668572017189,0.24657746013871,4.1755360255283,Ceramic
0,0.087387449858191,0.060081931648431,0.31979451078728,2.8496262373309,Ceramic
1,0.10978353617857,0.091370057029853,0.47387481406305,3.6590249754078,Metal
5,0.11610638714973,0.097522332486657,0.24036446449462,3.4677618371783,Plastic
4,0.16354523767183,0.12624593368025,0.44889932007996,3.4711554454503,Plastic


In [78]:
# calculates thresholds 
diameter_ser = df_small.sort_values('diameter', ascending = True,
                                  ignore_index = True)['diameter'].astype('float')
print('All thresholds for "diameter" are:\n')
for i in range(5):
    print(f'T{i+1}: {(diameter_ser[i] + diameter_ser[i+1])/2}')

All thresholds for "diameter" are:

0.0433342860085945
0.058375251832810005
0.075725994339142
0.094446194758255
0.11188413308345349


In [57]:
df_small.sort_values('weight')

Unnamed: 0,height,diameter,weight,hue,labels
2,0.085536050586897,0.03,0.11915260588651,2.2104287108141,Metal
5,0.11610638714973,0.097522332486657,0.24036446449462,3.4677618371783,Plastic
3,0.061070880610981,0.056668572017189,0.24657746013871,4.1755360255283,Ceramic
0,0.087387449858191,0.060081931648431,0.31979451078728,2.8496262373309,Ceramic
4,0.16354523767183,0.12624593368025,0.44889932007996,3.4711554454503,Plastic
1,0.10978353617857,0.091370057029853,0.47387481406305,3.6590249754078,Metal


In [79]:
# calculates thresholds 
weight_ser = df_small.sort_values('weight', ascending = True,
                                  ignore_index = True)['weight'].astype('float')
print('All thresholds for weight" are:\n')
for i in range(5):
    print(f'T{i+1}: {(weight_ser[i] + weight_ser[i+1])/2}')

All thresholds for weight" are:

0.179758535190565
0.24347096231666499
0.283185985462995
0.38434691543362
0.461387067071505


In [80]:
df_small.sort_values('hue')

Unnamed: 0,height,diameter,weight,hue,labels
2,0.085536050586897,0.03,0.11915260588651,2.2104287108141,Metal
0,0.087387449858191,0.060081931648431,0.31979451078728,2.8496262373309,Ceramic
5,0.11610638714973,0.097522332486657,0.24036446449462,3.4677618371783,Plastic
4,0.16354523767183,0.12624593368025,0.44889932007996,3.4711554454503,Plastic
1,0.10978353617857,0.091370057029853,0.47387481406305,3.6590249754078,Metal
3,0.061070880610981,0.056668572017189,0.24657746013871,4.1755360255283,Ceramic


In [81]:
# calculates thresholds 
hue_ser = df_small.sort_values('hue', ascending = True,
                                  ignore_index = True)['hue'].astype('float')
print('All thresholds for "hue" are:\n')
for i in range(5):
    print(f'T{i+1}: {(hue_ser[i] + hue_ser[i+1])/2}')

All thresholds for "hue" are:

2.5300274740725
3.1586940372546
3.4694586413143
3.56509021042905
3.91728050046805


In [84]:
arr = np.array([1,2,2])

In [86]:
-(1/5)*math.log(1/5, 2)-(2/5)*math.log(2/5, 2)-(2/5)*math.log(2/5, 2)

1.5219280948873621

In [87]:
math.log(1/3, 2)

-1.5849625007211563

In [88]:
df_node1 = df_small.sort_values('height', ascending = True).head(4)
df_node1

Unnamed: 0,height,diameter,weight,hue,labels
3,0.061070880610981,0.056668572017189,0.24657746013871,4.1755360255283,Ceramic
2,0.085536050586897,0.03,0.11915260588651,2.2104287108141,Metal
0,0.087387449858191,0.060081931648431,0.31979451078728,2.8496262373309,Ceramic
1,0.10978353617857,0.091370057029853,0.47387481406305,3.6590249754078,Metal


In [90]:
# calculates thresholds 
height_ser = df_node1.sort_values('height', ascending = True,
                                  ignore_index = True)['height'].astype('float')
print('All thresholds for "height" are:\n')
for i in range(3):
    print(f'T{i+1}: {(height_ser[i] + height_ser[i+1])/2}')

All thresholds for "height" are:

T1: 0.07330346559893901
T2: 0.086461750222544
T3: 0.09858549301838049


In [91]:
df_node1.sort_values('diameter', ascending = True)

Unnamed: 0,height,diameter,weight,hue,labels
2,0.085536050586897,0.03,0.11915260588651,2.2104287108141,Metal
3,0.061070880610981,0.056668572017189,0.24657746013871,4.1755360255283,Ceramic
0,0.087387449858191,0.060081931648431,0.31979451078728,2.8496262373309,Ceramic
1,0.10978353617857,0.091370057029853,0.47387481406305,3.6590249754078,Metal


In [93]:
# calculates thresholds 
diameter_ser = df_node1.sort_values('diameter', ascending = True,
                                  ignore_index = True)['diameter'].astype('float')
print('All thresholds for "diameter" are:\n')
for i in range(3):
    print(f'T{i+1}: {(diameter_ser[i] + diameter_ser[i+1])/2}')

All thresholds for "diameter" are:

T1: 0.0433342860085945
T2: 0.058375251832810005
T3: 0.075725994339142


In [94]:
df_node1.sort_values('weight', ascending = True)

Unnamed: 0,height,diameter,weight,hue,labels
2,0.085536050586897,0.03,0.11915260588651,2.2104287108141,Metal
3,0.061070880610981,0.056668572017189,0.24657746013871,4.1755360255283,Ceramic
0,0.087387449858191,0.060081931648431,0.31979451078728,2.8496262373309,Ceramic
1,0.10978353617857,0.091370057029853,0.47387481406305,3.6590249754078,Metal


In [97]:
# calculates thresholds 
weight_ser = df_node1.sort_values('weight', ascending = True,
                                  ignore_index = True)['weight'].astype('float')
print('All thresholds for "weight" are:\n')
for i in range(3):
    print(f'T{i+1}: {(weight_ser[i] + weight_ser[i+1])/2}')

All thresholds for "weight" are:

T1: 0.18286503301260998
T2: 0.283185985462995
T3: 0.39683466242516496
