In [15]:
import pandas as pd
import numpy as np

def load_data(csv_filename):
  data = pd.read_csv(csv_filename, sep=';')
  data = data.drop(columns=['quality'])
  return data.to_numpy()

def split_data(dataset, ratio):
  amount = int(ratio * len(dataset))
  return dataset[:amount], dataset[amount:]

# Load data
white_wine_data = load_data('whitewine.csv')
red_wine_data = load_data('redwine.csv')

# Split data
train_ratio = 0.9
ww_training, ww_test = split_data(white_wine_data, train_ratio)
rw_training, rw_test = split_data(red_wine_data, train_ratio)

# Print shapes
print("White Wine Training Set Shape:", ww_training.shape)
print("White Wine Test Set Shape:", ww_test.shape)
print("Red Wine Training Set Shape:", rw_training.shape)
print("Red Wine Test Set Shape:", rw_test.shape)

White Wine Training Set Shape: (1439, 11)
White Wine Test Set Shape: (160, 11)
Red Wine Training Set Shape: (1439, 11)
Red Wine Test Set Shape: (160, 11)


In [21]:
def make_centroid(labeled_examples):
  return sum(labeled_examples) / labeled_examples.shape[0]

def euclidean_distance(a, b):
  return np.linalg.norm(a - b)

def experiment(trainingw, trainingr, testw, testr):
  white_centroid, red_centroid = make_centroid(trainingw), make_centroid(trainingr)

  correct_predictions = 0
  total_predictions = 0

  for item in ww_test:
      distance_to_white = euclidean_distance(item, white_centroid)
      distance_to_red = euclidean_distance(item, red_centroid)
      if distance_to_white < distance_to_red:
          correct_predictions += 1
      total_predictions += 1


  for item in rw_test:
      distance_to_white = euclidean_distance(item, white_centroid)
      distance_to_red = euclidean_distance(item, red_centroid)
      if distance_to_red < distance_to_white:
          correct_predictions += 1
      total_predictions += 1

  accuracy = correct_predictions / total_predictions
  print(f"Total Predictions: {total_predictions}")
  print(f"Correct Predictions: {correct_predictions}")
  print(f"Accuracy: {accuracy:.4f}")
  return accuracy
accuracy = experiment(ww_training, rw_training, ww_test, rw_test)
print("Final Accuracy:", accuracy)

Total Predictions: 320
Correct Predictions: 291
Accuracy: 0.9094
Final Accuracy: 0.909375


In [22]:
def cross_validation(ww_data, rw_data, k):
    ww_partition_size = len(ww_data) // k
    rw_partition_size = len(rw_data) // k

    accuracies = []

    for fold in range(k):
      ww_test_start = fold * ww_partition_size
      ww_test_end = (fold + 1) * ww_partition_size
      rw_test_start = fold * rw_partition_size
      rw_test_end = (fold + 1) * rw_partition_size

      ww_test = ww_data[ww_test_start:ww_test_end]
      rw_test = rw_data[rw_test_start:rw_test_end]

      ww_train = np.vstack((ww_data[:ww_test_start], ww_data[ww_test_end:]))
      rw_train = np.vstack((rw_data[:rw_test_start], rw_data[rw_test_end:]))

      accuracy = experiment(ww_train, rw_train, ww_test, rw_test)
      accuracies.append(accuracy)

    average_accuracy = np.mean(accuracies)
    return average_accuracy


k = 5
average_accuracy = cross_validation(white_wine_data, red_wine_data, k)
print(f"Average accuracy over {k}-fold cross-validation: {average_accuracy:.2f}")

Total Predictions: 320
Correct Predictions: 295
Accuracy: 0.9219
Total Predictions: 320
Correct Predictions: 291
Accuracy: 0.9094
Total Predictions: 320
Correct Predictions: 291
Accuracy: 0.9094
Total Predictions: 320
Correct Predictions: 291
Accuracy: 0.9094
Total Predictions: 320
Correct Predictions: 291
Accuracy: 0.9094
Average accuracy over 5-fold cross-validation: 0.91
