In [40]:
import pandas as pd
import numpy as np

def load_data(csv_filename):
    """Load a semicolon-separated wine CSV into a labelled feature matrix.

    The 'quality' column (if present) is dropped and a label column is
    prepended as column 0: 1.0 when the file's base name is "redwine.csv",
    0.0 for anything else (treated as white wine).

    Parameters
    ----------
    csv_filename : str or os.PathLike
        Location of the CSV file. Only the final path component decides the
        label, so "redwine.csv" and "data/redwine.csv" are labelled alike.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per CSV row: the label in column 0 followed
        by the remaining CSV columns in their original order.
    """
    import os  # local import: only needed to look at the file's base name

    df = pd.read_csv(csv_filename, sep=";")
    features = df.drop(columns="quality", errors="ignore").to_numpy()

    # Compare the base name rather than the whole argument; otherwise a red
    # wine file reached through a directory would silently be labelled white.
    if isinstance(csv_filename, (str, os.PathLike)):
        base_name = os.path.basename(os.fspath(csv_filename))
    else:
        base_name = ""  # e.g. an open file object: no name to inspect
    label = 1.0 if base_name == "redwine.csv" else 0.0

    labels = np.full((len(features), 1), label)
    return np.hstack((labels, features))

def split_data(dataset, ratio):
    """Cut `dataset` into a leading part and a trailing part.

    The first int(ratio * len(dataset)) items form the first part and the
    remaining items the second. Order is preserved; nothing is shuffled.

    Returns a (first_part, second_part) tuple of slices of `dataset`.
    """
    cut = int(ratio * len(dataset))
    head = dataset[:cut]
    tail = dataset[cut:]
    return head, tail

def make_centroid(labeled_examples):
    """Return the centroid (column-wise mean) of the feature columns.

    Column 0 is skipped because load_data stores the red/white label there;
    only columns 1.. take part in the average. Based on the centroid helper
    from the ML lecture notes.

    Parameters
    ----------
    labeled_examples : array-like, 2-D
        One row per example, label in column 0.

    Returns
    -------
    numpy.ndarray
        1-D array holding the mean of each feature column.

    Raises
    ------
    ValueError
        If there are no rows to average.
    """
    examples = np.asarray(labeled_examples)
    if examples.shape[0] == 0:
        raise ValueError("cannot compute a centroid from zero examples")
    # Vectorized mean over rows, instead of adding the rows one at a time
    # with the builtin sum().
    return examples[:, 1:].mean(axis=0)

def euclidean_distance(a, b):
    """Distance between `a` and `b`, taken as the norm of their difference.

    Adapted from the lecture notes. For 1-D vectors np.linalg.norm's default
    is the Euclidean (L2) length.
    """
    difference = a - b
    return np.linalg.norm(difference)

def _count_red_predictions(samples, rcentroid, wcentroid):
    """Count the samples lying strictly closer to the red centroid than to the white one."""
    red_votes = 0
    for row in samples:
        features = row[1:]  # column 0 is the label, not a feature
        rdist = euclidean_distance(rcentroid, features)
        wdist = euclidean_distance(wcentroid, features)
        if rdist < wdist:  # ties are predicted as white
            red_votes += 1
    return red_votes


def _print_accuracy(total, correct, colour):
    """Print the prediction count, the correct count and the accuracy ratio for one wine colour."""
    # An empty test set has no meaningful accuracy; report nan rather than
    # failing with ZeroDivisionError.
    accuracy = correct / total if total else float("nan")
    print(str(total) + " number of predictions for " + colour + " wine")
    print("total correct prediction: " + str(correct))
    print("percent accuracy:" + str(accuracy))


def experiment(trainingw, trainingr, testw, testr):
    """Run a nearest-centroid red/white classification and print the results.

    One centroid is built from each training set. Every test row is then
    predicted as red when it is strictly closer to the red centroid and as
    white otherwise, and the prediction is scored against the set the row
    came from.

    Parameters
    ----------
    trainingw, trainingr : 2-D arrays
        White and red training rows, label in column 0.
    testw, testr : 2-D arrays
        White and red test rows, label in column 0.

    Returns
    -------
    None
        Results are printed: number of predictions, number correct, and the
        accuracy as a fraction between 0 and 1 (nan for an empty test set).
    """
    wcentroid, rcentroid = make_centroid(trainingw), make_centroid(trainingr)

    # A white row is correct when it is NOT predicted red.
    correctw = len(testw) - _count_red_predictions(testw, rcentroid, wcentroid)
    correctr = _count_red_predictions(testr, rcentroid, wcentroid)

    _print_accuracy(len(testw), correctw, "white")
    _print_accuracy(len(testr), correctr, "red")

# Red wine: load the file, then keep the first 90% of rows (rounded down)
# for training and the remainder for testing.
redwine = load_data("redwine.csv")
trainingr, testr = split_data(redwine, .9)

# White wine: same procedure.
whitewine = load_data("whitewine.csv")
trainingw, testw = split_data(whitewine, .9)

In [41]:
# Classify the held-out rows by nearest centroid and print the accuracy per
# wine colour. Keyword arguments guard against mixing up white and red.
experiment(trainingw=trainingw, trainingr=trainingr, testw=testw, testr=testr)

160 number of predictions for white wine
total correct prediction: 142
percent accuracy:0.8875
160 number of predictions for red wine
total correct prediction: 149
percent accuracy:0.93125
