Name: Shiska Raut <br>
ID:   1001526329

In [1]:
import numpy as np
import sys
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

## Read training/evaluation data

**Argument(s):** 
1) filename: name of a .txt file with each line containing training/evaluation features(x) and label(y) in the following format:
((x1, x2, .....xn), y) <br>
2) dtype_x : datatype of features <br>
3) dtype_y: datatype of label <br>

**Return(s):** 'X, Y' where X is a numpy array of feature vectors and Y is the target label vector.
Note: <br>
Each column in the array(s) epresents a single datapoint. <br>

In [2]:
def get_X_Y_arrays(filename, dtype_x, dtype_y):
    try:
        f = open(filename, 'r')
    except OSError:
        print(f'{filename} could not be opened.\n')
        sys.exit()

    # initialize list to store feature and labels for training data
    features = []
    labels = []

    with f:
        line = f.readline()
        while line != '':
            # strip newline and outer parenthesis
            line = line.strip('\n')
            line = line.strip('( )')

            # extrace label and append to labels list
            single_label = line.split('), ')[-1]
            labels.append(single_label)

            # extrace features and append to features list
            feat = line.split('), ')[0].split(', ')
            features.append(feat)

            # read next line
            line = f.readline()

        # create dataframe of features and append labels
        X = np.array(features, dtype=dtype_x, ndmin=2)

        # convert labels list to array
        Y = np.array(labels, dtype=dtype_y)

        return X, Y

### Helper Functions

In [3]:
# splits data into training and test set
# first 6 datapoints become the test datapoints
def split_data(X, Y):

    X_test = X[0:6, :]
    Y_test = Y[0:6]

    X_train = np.delete(X, np.s_[0:6], axis = 0)
    Y_train = np.delete(Y, np.s_[0:6])

    return X_train, Y_train, X_test, Y_test

### Provide Filename

In [5]:
fname = '2_data.txt'

# read data 
X, Y = get_X_Y_arrays(fname, float, str)

# split data into training and test
X_train, Y_train, X_test, Y_test = split_data(X, Y)

# create dataframe
df_train = pd.DataFrame(np.concatenate((X_train, Y_train.reshape(X_train.shape[0], 1)), axis = 1),
                  columns = ['height', 'diameter', 'weight', 'hue', 'labels'])

df_test = pd.DataFrame(np.concatenate((X_test, Y_test.reshape(X_test.shape[0], 1)), axis = 1),
                  columns = ['height', 'diameter', 'weight', 'hue', 'labels'])

In [6]:
# view training data
df_train.head(10)

Unnamed: 0,height,diameter,weight,hue,labels
0,0.05,0.0671906303248,0.10808586147371,3.6126513788224,Metal
1,0.097217788706047,0.13427563002805,0.53030238548777,3.5813892162698,Metal
2,0.082333041884006,0.05248308057853,0.20700932847764,4.268849537387,Metal
3,0.053522380049439,0.03,0.1,3.4023052441992,Metal
4,0.092504061632996,0.068184173935539,0.42238263570059,2.6512940785013,Ceramic
5,0.14235682064034,0.13916322473525,0.47143671558967,4.1204883858317,Plastic
6,0.076535519759415,0.08501625336107,0.26336689388836,3.4418232708229,Metal
7,0.16862061485994,0.15,0.59785276497457,4.7684587201969,Plastic
8,0.16252194081581,0.15,0.58254500449345,3.5570866732765,Plastic
9,0.078975908462549,0.049397314581861,0.15168194290288,3.8247614902385,Metal


In [7]:
df_test

Unnamed: 0,height,diameter,weight,hue,labels
0,0.10333502161284,0.079225478849717,0.18783851976607,2.7865764181292,Plastic
1,0.18599119700366,0.14277372592514,0.60851885865374,5.6478192233182,Plastic
2,0.066902806537039,0.045010278911305,0.11844695802032,2.7800184198705,Metal
3,0.12899038690993,0.15,0.43308569967936,3.1392299124701,Plastic
4,0.05,0.076688434366098,0.17860228933314,3.8658468676698,Metal
5,0.13408468373063,0.084665014393071,0.27111709333315,3.6046228201569,Plastic
