Helpers

In [0]:
def prepeare_data(wine_data, features, good_threshold, bad_threshold):
  """Keep only the clearly good / clearly bad wines and the requested columns.

  Rows are kept when `quality >= good_threshold` or `quality <= bad_threshold`.
  The result holds the columns `features + ['quality', 'good']`, is a copy
  (the input frame is not modified) and is re-indexed from 0.
  """
  quality = wine_data.quality
  is_extreme = (quality >= good_threshold) | (quality <= bad_threshold)
  wanted_columns = features + ['quality', 'good']
  selected = wine_data.loc[is_extreme][wanted_columns]
  return selected.copy().reset_index(drop=True)

def get_Xy(data):
  """Split a prepared frame into (feature frame, label series).

  The features are every column except 'quality' and 'good'; the labels are
  the 'good' column. Both are copies of the input.
  """
  feature_frame = data.drop(columns=['quality', 'good']).copy()
  labels = data.good.copy()
  return feature_frame, labels

# V.1: Exploring the green reds


a) Plot a scatterplot matrix

In [0]:
import pandas as pd

# Semicolon-separated red-wine quality table, fetched over HTTP on every run.
data_url = 'https://raw.githubusercontent.com/SashaKryzh/ft_sommelier_resources/master/winequality-red.csv'
red_df = pd.read_csv(data_url, delimiter=';')
# Last expression of the cell, so the first rows are rendered as the cell output.
red_df.head()

In [0]:
from google.colab import files
import matplotlib.pyplot as plt

def plot_scatter_matrix(data, good_threshold, bad_threshold, save_plot=False):
  """Draw an N x N grid of pairwise scatter plots, one row/column per column of `data`.

  Only wines with `quality >= good_threshold` (green) or
  `quality <= bad_threshold` (red) are drawn. Diagonal cells carry the column
  name instead of a plot. With `save_plot=True` the figure is written to
  scatter.png and passed to the Colab `files.download` helper.
  """
  column_names = list(data.columns)
  n_cols = data.shape[1]

  fig, axes = plt.subplots(nrows=n_cols, ncols=n_cols, figsize=(50,50))
  fig.subplots_adjust(hspace=0.03, wspace=0.03)

  # Tick marks would dominate such small panels, so both axes are hidden.
  for panel in axes.flat:
    panel.xaxis.set_visible(False)
    panel.yaxis.set_visible(False)

  keep = (data.quality >= good_threshold) | (data.quality <= bad_threshold)
  shown = data.loc[keep]
  point_colors = ['green' if q >= good_threshold else 'red' for q in shown['quality']]

  for row in range(n_cols):
    for col in range(n_cols):
      if row == col:
        # Diagonal: label the variable shared by this row and column.
        axes[row, col].annotate(column_names[row], (0.5, 0.5), xycoords='axes fraction', ha='center', va='center')
      else:
        # x comes from the column's variable, y from the row's variable.
        axes[row, col].scatter(shown.iloc[:, col], shown.iloc[:, row], c=point_colors, alpha=0.3)

  if save_plot:
    plt.savefig("scatter.png", dpi=200)
    files.download("scatter.png")

  plt.show()

In [0]:
# Wines scoring >= 6 are drawn green and wines scoring <= 5 red.
plot_scatter_matrix(red_df, 6, 5, save_plot=False)

b) Analysis
* Wines with less volatile acidity tend to be better.
* Wines with low citric acid are usually bad.
* Wines with low pH are good
* Wines with low sulphates are bad





# V.2: Learning to perceptron

a) Implement a perceptron & b) Train function

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Same red-wine CSV as above, loaded again into the frame used for training.
data_url = 'https://raw.githubusercontent.com/SashaKryzh/ft_sommelier_resources/master/winequality-red.csv'
wine_data = pd.read_csv(data_url, delimiter=';')

# Binary target: 1 when quality >= good_wine, otherwise 0.
good_wine = 6
wine_data['good'] = wine_data['quality'].apply(lambda x: 1 if x >= good_wine else 0)

In [0]:
import random

class Perceptron:
  """Perceptron with a step activation, trained sample by sample.

  Weights live in one list laid out as [bias, w_1, ..., w_n].
  """

  def __init__(self):
    # Re-seeds the module-level RNG, so a freshly built instance always
    # starts `train` from the same initial weights.
    random.seed(1)

  def __predict(self, row, w):
    """Return 1 when bias + sum(row[i] * w[i + 1]) is non-negative, else 0."""
    total = w[0]
    for value, weight in zip(row, w[1:]):
      total += value * weight
    if total >= 0:
      return 1
    return 0

  def train(self, X, y, learningRate=0.01, epoch=0):
    """Fit the weights and return the per-epoch history.

    X must expose `.shape` and iterate row by row; y is indexed by row
    position. Training stops after `epoch` epochs or as soon as an epoch
    produces no errors; with `epoch=0` only the second condition can fire.

    Returns a list of (epoch number, squared-error sum, feature weights,
    bias) tuples, where the weights are the ones held at the START of that
    epoch.
    """
    n_features = X.shape[1]
    weights = [random.gauss(0, 0.1) for _ in range(n_features + 1)]

    history = []
    epoch_no = 0
    finished = False

    while not finished:
      epoch_no += 1

      snapshot = weights[:]
      squared_errors = 0
      for position, sample in enumerate(X):
        error = y[position] - self.__predict(sample, weights)
        squared_errors += error**2

        # error is -1, 0 or 1, so weights only move on a misclassification.
        step = learningRate * error
        weights[0] += step
        for k, value in enumerate(sample, start=1):
          weights[k] += step * value

      history.append((epoch_no, squared_errors, snapshot[1:], snapshot[0]))
      finished = epoch_no == epoch or squared_errors == 0

    return history

In [0]:
# Keep only the extremes (quality >= 8 or <= 3), described by alcohol and pH.
data = prepeare_data(wine_data, ['alcohol', 'pH'], 8, 3)
X, y = get_Xy(data)

# epoch=0 never equals the 1-based epoch counter, so training stops only once
# an epoch finishes with zero errors.
p = Perceptron()
performance = p.train(X.values, y.values, learningRate=0.01, epoch=0)

# Last record: (epoch number, squared-error sum, feature weights, bias).
print(performance[-1])

c) Plot performance

In [0]:
from google.colab import files
import matplotlib.pyplot as plt

def plot_performance(performance, wine_data, good_thresh, bad_thresh, epoch=-1, save_plot=False, x_f='alcohol', y_f='pH'):
  """Plot the training curve and the 2-D decision boundary of a trained model.

  Args:
    performance: list of (epoch number, error, weights, bias) tuples as
      produced by the train functions in this notebook. weights[0] is paired
      with `x_f` and weights[1] with `y_f`.
    wine_data: frame holding `x_f`, `y_f`, 'quality' and 'good'.
    good_thresh: wines with quality >= this value are drawn green.
    bad_thresh: wines with quality <= this value are drawn red.
    epoch: 1-based epoch whose boundary is drawn; any value below 1 selects
      the last recorded epoch.
    save_plot: when true, write performance.png and pass it to the Colab
      `files.download` helper.
    x_f, y_f: column names used for the horizontal and vertical axis.
  """
  # Imported here because pandas no longer re-exports numpy as `pd.np`.
  import numpy as np

  # Selecting data and creating plots
  data = prepeare_data(wine_data, [x_f, y_f], good_thresh, bad_thresh)
  fig, axes = plt.subplots(ncols=2, figsize=(25, 10))

  #
  # Errors as a function of epoch
  #

  axes[0].set_title('Errors as a function of epoch')
  axes[0].set_xlabel('epoch')
  axes[0].set_ylabel('Classification errors')
  # Use the recorded epoch numbers (1..n) so the x axis is not shifted by one.
  epochs = [entry[0] for entry in performance]
  errors = [entry[1] for entry in performance]
  axes[0].plot(epochs, errors)

  #
  # Decision boundary
  #

  shown_epoch = performance[-1][0] if epoch < 1 else epoch
  axes[1].set_title('Decision boundary on epoch: %i' % shown_epoch)
  axes[1].set_xlabel(x_f)
  axes[1].set_ylabel(y_f)

  # Splitting bad and good wines
  bad_wines = data.loc[(data.quality <= bad_thresh)]
  good_wines = data.loc[(data.quality >= good_thresh)]

  # Displaying wines
  axes[1].scatter(bad_wines[x_f], bad_wines[y_f], c='red', label="bad wines (<%i score)" % (bad_thresh + 1))
  axes[1].scatter(good_wines[x_f], good_wines[y_f], c='green', label="good wines (>%i score)" % (good_thresh - 1))

  # Boundary of the chosen epoch: w_x * x + w_y * y + bias = 0, solved for y.
  p = performance[-1] if epoch < 1 else performance[epoch - 1]
  w_x, w_y, bias = p[2][0], p[2][1], p[3]
  x_low, x_high = min(data[x_f]) - 0.1, max(data[x_f]) + 0.1
  y_low, y_high = min(data[y_f]) - 0.1, max(data[y_f]) + 0.1
  plot_x = np.array([x_low, x_high])
  if w_y == 0:
    # The boundary is vertical and cannot be written as y(x); keep the
    # original fallback of a flat line at y = 0.
    plot_y = [0] * len(plot_x)
  else:
    plot_y = (-1/w_y) * (w_x * plot_x + bias)
  axes[1].plot(plot_x, plot_y, 'b--', label = "Decision Boundary")

  # Setting (x, y) plot limits
  axes[1].set_xlim(x_low, x_high)
  axes[1].set_ylim(y_low, y_high)

  # Shading: the model answers "good" where w_x*x + w_y*y + bias >= 0, which
  # is the side ABOVE the line when w_y > 0 and the side below it otherwise.
  if w_y > 0:
    below_color, above_color = 'red', 'green'
  else:
    below_color, above_color = 'green', 'red'
  axes[1].fill_between(plot_x, plot_y, y_low, color=below_color, alpha=0.2)
  axes[1].fill_between(plot_x, plot_y, y_high, color=above_color, alpha=0.2)

  # Legend
  axes[1].legend(loc='upper left', bbox_to_anchor=(1, 1))
  axes[1].margins(0)

  if save_plot:
    plt.savefig("performance.png", dpi=200)
    files.download("performance.png")

  plt.show()

In [0]:
# epoch=-1 (any value below 1) draws the boundary of the last recorded epoch.
plot_performance(performance, wine_data, 8, 3, epoch=-1)

d) Feature Scaling

In [0]:
import pandas as pd
import statistics as st

pd.options.mode.chained_assignment = None

def std(df, features):
  """Return a copy of `df` with the given columns standardised to z-scores.

  Each listed column becomes (value - mean) / sample standard deviation
  (n - 1 in the denominator, as `statistics.stdev` computes it). Columns
  not listed are left untouched and `df` itself is never modified.

  Whole columns are assigned at once instead of writing `ret[f][i]` cell by
  cell: that chained assignment depends on a 0..n-1 index, cannot store
  fractional results in an integer column, and is ignored when pandas
  copy-on-write is active.

  Note: a column with fewer than two rows, or with zero spread, yields
  NaN/inf values rather than raising.
  """
  ret = df.copy()
  for f in features:
    column = ret[f].astype(float)
    ret[f] = (column - column.mean()) / column.std()
  return ret

In [0]:
# Standardise the two features, then repeat the perceptron experiment on them.
scaled = std(wine_data, ['alcohol', 'pH'])
# NOTE(review): this prints the whole frame; scaled.head() would be shorter.
print(scaled)
data = prepeare_data(scaled, ['alcohol', 'pH'], 8, 3)
X, y = get_Xy(data)

# epoch is left at its default of 0: train until an epoch has zero errors.
p = Perceptron()
performance = p.train(X.values, y.values, learningRate=0.01)

print(performance[-1])
plot_performance(performance, scaled, 8, 3)

# V.3: My fair ADALINE

a) Why?

Our perceptron cannot converge when the data is not linearly separable.

In [0]:
# Wider thresholds (good >= 7, bad <= 4) keep more wines than the 8 / 3 split.
scaled = std(wine_data, ['alcohol', 'pH'])
data = prepeare_data(scaled, ['alcohol', 'pH'], 7, 4)
X, y = get_Xy(data)

# Reuses the Perceptron instance `p` created in an earlier cell. epoch=1000
# caps the run, since a zero-error epoch may never occur on this subset.
perf = p.train(X.values, y.values, epoch=1000)

print(perf[-1])
plot_performance(perf, scaled, 7, 4)

b) & c) ADALINE

In [0]:
import random
import math as m

class Adaline:
  """Single-neuron binary classifier trained by gradient steps on squared error.

  The neuron output is the logistic sigmoid of the weighted input sum.
  Weight vectors are plain lists laid out as [bias, w_1, ..., w_n]; `train`
  and `test` prepend a constant 'Bias' column of ones to X to match.
  """

  def __init__(self):
    # Seeds the module-level RNG once per instance. `train` draws its initial
    # weights from that stream, so a second `train` call on the same instance
    # starts from different weights than the first.
    random.seed(5)

  def _net_input(self, row, w):
    """Return sigmoid(sum(row[i] * w[i])) for a single sample."""
    linear = 0
    for feature, wi in zip(row, w):
      linear += feature * wi
    # Both branches are the same sigmoid; each one only ever hands a
    # non-positive number to exp(), which cannot overflow.
    if linear < 0:
      return 1 - 1 / (1 + m.exp(linear))
    return 1 / (1 + m.exp(-linear))

  def _update_w(self, X, w, errors, lr):
    """Apply one batch step in place: w[k] += lr * sum_i(X[i, k] * errors[i])."""
    for w_index in range(len(w)):
      j = 0
      for f_index, feature in enumerate(X.iloc[:, w_index]):
        j += feature * errors[f_index]
      w[w_index] += lr * j
    return w

  def _calculate_cost(self, errors):
    """Return half the sum of squared errors."""
    return sum([e**2 for e in errors]) / 2

  def _online_epoch(self, X, y, w, lr):
    """Run one pass that updates the weights after every sample.

    Returns (w, cost); the cost is half the sum of the squared errors seen
    during the pass, each measured before that sample's update.
    """
    cost = []
    for i in range(X.shape[0]):
      output = self._net_input(X.iloc[i], w)
      error = y.iat[i] - output
      cost.append(error**2)
      for index in range(len(w)):
        w[index] += lr * X.iat[i, index] * error
    return w, sum(cost) / 2

  def _batch_epoch(self, X, y , w):
    """Return the list of errors (target - output) for every row, without updating w."""
    output = []
    for _, row in X.iterrows():
      output.append(self._net_input(row, w))

    errors = []
    for index, out in enumerate(output):
      errors.append(y.iat[index] - out)

    return errors

  def train(self, X, y, lr=0.01, epoch=0, online=False):
    """Fit weights on DataFrame X and Series y; return the per-epoch history.

    Args:
      X: feature frame (not modified; a copy receives the 'Bias' column).
      y: target series, read by position.
      lr: learning rate.
      epoch: number of epochs to run. With 0, training instead stops once
        the cost changes by less than 0.0001 between consecutive epochs.
      online: True updates after each sample, False once per epoch.

    Returns:
      List of (epoch number, cost, feature weights, bias). In batch mode
      the weights are the ones that produced the cost (before the step); in
      online mode they are the weights at the end of the epoch.
    """
    X = X.copy()
    X.insert(0, 'Bias', 1)

    w = [random.gauss(0, 0.001) for i in range(X.shape[1])]

    n_epoch = 0
    perf = []

    while True:
      n_epoch += 1

      if online:
        w, cost = self._online_epoch(X, y, w, lr)
        perf.append((n_epoch, cost, w[1:], w[0]))
      else:
        errors = self._batch_epoch(X, y, w)
        cost = self._calculate_cost(errors)
        perf.append((n_epoch, cost, w[1:], w[0]))
        w = self._update_w(X, w, errors, lr)

      if n_epoch == epoch or (epoch == 0 and n_epoch != 1 and abs(cost - perf[-2][1]) < 0.0001):
        break

    return perf

  def test(self, X, y, w, bias):
    """Return the percentage (0-100) of rows in X classified as y says."""
    X = X.copy()
    w = list(w)

    X.insert(0, 'Bias', 1)
    w.insert(0, bias)

    output = []
    for _, row in X.iterrows():
      a = self._net_input(row, w)
      output.append(self.predict(a))

    correct = []
    for index, out in enumerate(output):
      correct.append(1 if y.iat[index] == out else 0)

    return sum(correct) / len(correct) * 100.0

  def activate(self, row, w):
    """Return the sigmoid output for one sample (row and w must line up)."""
    # Was a bare `_net_input(...)`, which is not a global name and raised
    # NameError on every call.
    return self._net_input(row, w)

  def predict(self, activated):
    """Turn a sigmoid output into a class label: 1 when >= 0.5, else 0."""
    return 1 if activated >= 0.5 else 0

In [0]:
# Same >= 7 / <= 4 subset, now fitted with Adaline in online mode (weights
# change after every sample). epoch is left at 0, so training stops when the
# cost moves by less than 0.0001 between consecutive epochs.
scaled = std(wine_data, ['alcohol', 'pH'])
data = prepeare_data(scaled, ['alcohol', 'pH'], 7, 4)
X, y = get_Xy(data)

a = Adaline()
perf = a.train(X, y, lr=0.01, online=True)

print(perf[-1])
plot_performance(perf, scaled, 7, 4)

d) Best learning rate

In [0]:
# Batch mode, exactly 100 epochs, lr=0.1. The instance `a` is reused, so the
# initial weights continue the RNG stream instead of repeating the earlier draw.
perf = a.train(X, y, lr=0.1, epoch=100, online=False)
print(perf[-1])
plot_performance(perf, scaled, 7, 4)

In [0]:
# Same batch run of 100 epochs with a smaller step, lr=0.001.
perf = a.train(X, y, lr=0.001, epoch=100, online=False)
print(perf[-1])
plot_performance(perf, scaled, 7, 4)

In [0]:
# Same batch run of 100 epochs with the smallest step tried, lr=0.0001.
perf = a.train(X, y, lr=0.0001, epoch=100, online=False)
print(perf[-1])
plot_performance(perf, scaled, 7, 4)

Result: best learning rate is 0.01

# V.4: Advanced wine sampling and resampling

In [0]:
# Standardised alcohol/pH for wines rated >= 7 or <= 4; input to the splitters below.
scaled = std(wine_data, ['alcohol', 'pH'])
data = prepeare_data(scaled, ['alcohol', 'pH'], 7, 4)

a) Holdout

In [0]:
def holdout(data, train_size=0.7, random_state=None):
  """Shuffle `data` and split it into a (train, test) pair of frames.

  Args:
    data: frame to split; it is not modified.
    train_size: fraction of rows that goes to the training part. The row
      count is truncated with int(), so the test part gets the remainder.
    random_state: optional seed forwarded to `DataFrame.sample`; pass a
      value to get the same split on every run. None keeps the previous
      unseeded behaviour.

  Returns:
    (train, test): consecutive slices of the shuffled, re-indexed frame.
  """
  shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
  train_len = int(shuffled.shape[0] * train_size)
  return shuffled.iloc[:train_len], shuffled.iloc[train_len:]

In [0]:
# 80 % of the shuffled rows go to the training part, the rest to the test part.
train, test = holdout(data, train_size=0.8)
print(train)
print(test)

b) k-fold

In [0]:
def kfold(data, k=5, shuffle=True, random_state=None):
  """Split `data` into k (train, test) pairs for cross-validation.

  Every row lands in exactly one test part. When the row count is not a
  multiple of k, the first `n % k` folds get one extra row.

  Args:
    data: frame to split; it is not modified.
    k: number of folds. Values below 1 return an empty list.
    shuffle: shuffle (and re-index) the rows first; otherwise a copy is used
      in its current order.
    random_state: optional seed forwarded to `DataFrame.sample` when
      shuffling, for repeatable folds.

  Returns:
    List of k (train, test) frame tuples.
  """
  if k < 1:
    return []

  d = data.sample(frac=1, random_state=random_state).reset_index(drop=True) if shuffle else data.copy()

  base_size, remainder = divmod(d.shape[0], k)

  folds = []
  start = 0
  for i in range(k):
    # Track a running start offset. Computing it as i * fold_size with a
    # per-fold size made folds overlap and left trailing rows untested
    # whenever the sizes differed.
    stop = start + base_size + (1 if i < remainder else 0)

    test = d.iloc[start:stop]
    # DataFrame.append is gone from current pandas; concat does the same job.
    train = pd.concat([d.iloc[:start], d.iloc[stop:]])

    folds.append((train, test))
    start = stop

  return folds

In [0]:
# Three shuffled folds; printing shows every (train, test) pair in full.
folds = kfold(data, 3)
print(folds)

c) ADALINE with cross-validation

In [0]:
import statistics as st

def cross_eval(folds, lr=0.01, epoch=1000, online=False):
  """Train an Adaline on every fold and print the mean train/test accuracy.

  `folds` is a sequence of (train frame, test frame) pairs. A single Adaline
  instance is reused for all folds; each `train` call draws fresh initial
  weights. The weights and bias of the last recorded epoch are the ones
  scored. Nothing is returned; the two means are printed.
  """
  model = Adaline()

  train_scores = []
  test_scores = []
  for fold in folds:
    train_X, train_y = get_Xy(fold[0])
    history = model.train(train_X, train_y, lr=lr, epoch=epoch, online=online)
    final_weights = history[-1][2]
    final_bias = history[-1][3]
    train_scores.append(model.test(train_X, train_y, final_weights, final_bias))

    test_X, test_y = get_Xy(fold[1])
    test_scores.append(model.test(test_X, test_y, final_weights, final_bias))

  mean_train = st.mean(train_scores)
  mean_test = st.mean(test_scores)
  print('Train: {}, Test: {}'.format(mean_train, mean_test))

In [0]:
# One set of five folds is shared by every run below, so the runs differ only
# in learning rate and epoch count (plus the random initial weights).
folds = kfold(data)

print('lr=0.01')
cross_eval(folds, lr=0.01, epoch=50, online=False)
cross_eval(folds, lr=0.01, epoch=100, online=False)

print('\nlr=0.07')
cross_eval(folds, lr=0.07, epoch=50, online=False)
cross_eval(folds, lr=0.07, epoch=100, online=False)

print('\nlr=0.5')
cross_eval(folds, lr=0.5, epoch=50, online=False)
cross_eval(folds, lr=0.5, epoch=100, online=False)

print('\nlr=1')
cross_eval(folds, lr=1, epoch=50, online=False)
cross_eval(folds, lr=1, epoch=100, online=False)

# V.5: Adventures in the Nth dimension

a) Adaline on different wine factors

In [0]:
# Every column name except the last two.
# NOTE(review): presumably the dropped pair is 'quality' and 'good' — confirm
# that 'quality' is the last column of the CSV.
features = list(wine_data)[:-2]
scaled = std(wine_data, features)

In [0]:
# Author's verdict: good. Features: alcohol and pH.
data = prepeare_data(scaled, ['alcohol', 'pH'], 7, 4)

# epoch=0: each fold trains until the cost changes by less than 0.0001.
folds = kfold(data, k=5)
cross_eval(folds, epoch=0)

In [0]:
# Author's verdict: best. Features: alcohol, volatile acidity and sulphates.
data = prepeare_data(scaled, ['alcohol', 'volatile acidity', 'sulphates'], 7, 4)

folds = kfold(data, k=5)
cross_eval(folds, epoch=0)

In [0]:
# Author's verdict: bad. Features: chlorides and fixed acidity.
data = prepeare_data(scaled, ['chlorides', 'fixed acidity'], 7, 4)

folds = kfold(data, k=5)
cross_eval(folds, epoch=0)

In [0]:
# Author's verdict: good. Uses every feature column collected in `features`.
data = prepeare_data(scaled, features, 7, 4)

# Unlike the runs above, this one trains for a fixed 500 epochs per fold.
folds = kfold(data, k=5)
cross_eval(folds, epoch=500)

b) Decision boundary for 3 and even more features

* 2 features - line
* 3 features - surface
* 4 or more features - a hyperplane, which cannot be drawn directly

# V.6: Marvin's rebuttal

a) Pan-Galactic Gargle Blaster

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Second dataset, read with the same semicolon delimiter as the wine table.
data_url = 'https://raw.githubusercontent.com/SashaKryzh/ft_sommelier/master/Pan%20Galactic%20Gargle%20Blaster.csv'
g_data = pd.read_csv(data_url, delimiter=';')

# Binary target: 1 when quality >= good_galactic, otherwise 0.
good_galactic = 6
g_data['good'] = g_data['quality'].apply(lambda x: 1 if x >= good_galactic else 0)

# Standardise the two ingredient columns used in the cells below.
g_data_scaled = std(g_data, ['wonderflonium', 'fallian marsh gas'])

In [0]:
# 'quality' is included because plot_scatter_matrix colours the points by it.
to_plot = g_data_scaled[['wonderflonium', 'fallian marsh gas', 'quality']];
plot_scatter_matrix(to_plot, 5, 4, save_plot=False)

In [0]:
import math

def cart2pol(x, y):
  """Convert paired Cartesian coordinates to polar form.

  `x` and `y` are pandas Series of equal length. Returns (r, t) where r is
  the Series sqrt(x**2 + y**2) and t is a plain list holding the angle
  atan2(y, x), in radians, of every pair.
  """
  squared_radius = x**2 + y**2
  r = squared_radius.pow(1 / 2)
  t = [math.atan2(yi, xi) for xi, yi in zip(x, y)]
  return r, t

# 'fallian marsh gas' is passed as x and wonderflonium as y; the resulting
# radius and angle are stored as new columns 'r' and 't'.
g_data_scaled['r'], g_data_scaled['t'] = cart2pol(g_data_scaled['fallian marsh gas'], g_data_scaled.wonderflonium)

In [0]:
# Same scatter matrix with the polar columns added; save_plot=True also writes
# scatter.png and triggers the Colab download.
to_plot = g_data_scaled[['wonderflonium', 'fallian marsh gas', 'r', 't', 'quality']];
plot_scatter_matrix(to_plot, 5, 4, save_plot=True)

In [0]:
# Train on one Cartesian column plus the radius; all rows are used here (no
# quality-threshold filtering before training).
X = g_data_scaled[['fallian marsh gas', 'r']].copy()
y = g_data_scaled.good.copy()

# Batch mode with epoch=0: stop once the cost changes by less than 0.0001.
a = Adaline()
perf = a.train(X, y, lr=0.01, epoch=0, online=False)

print(perf[-1])
plot_performance(perf, g_data_scaled, 5, 4, x_f='fallian marsh gas', y_f='r')

# VI.1 Gotta go fast!

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import array

# Fresh copy of the wine table, labelled and scaled for the Cython cells below.
data_url = 'https://raw.githubusercontent.com/SashaKryzh/ft_sommelier_resources/master/winequality-red.csv'
wine_data = pd.read_csv(data_url, delimiter=';')

good_wine = 6
wine_data['good'] = wine_data['quality'].apply(lambda x: 1 if x >= good_wine else 0)

# X and y stay pandas objects: the Cython train functions call X.iterrows().
scaled = std(wine_data, ['alcohol', 'pH'])
data = prepeare_data(scaled, ['alcohol', 'pH'], 8, 3)
X, y = get_Xy(data)

In [0]:
%load_ext Cython

Perceptron

In [0]:
%%cython

from libc.stdlib cimport malloc, free, rand, srand, RAND_MAX
from cpython cimport array
import array

# C-level perceptron state: a malloc'ed array of n_weights feature weights
# plus a separate bias term.
cdef struct s_perceptron:
  float *weights
  float bias
  int n_weights
ctypedef s_perceptron t_perceptron

# Step activation: 1 for a non-negative sum, 0 otherwise.
cdef int heaviside(float activity_sum):
  return 1 if activity_sum >= 0 else 0

# Classify one sample: bias + sum(row[i] * weights[i]) passed through the step.
# `row` is an untyped Python object indexed by position 0..n_weights-1.
cdef int predict(t_perceptron p, row):
  cdef float a = p.bias
  cdef int i
  for i in range(p.n_weights):
    a += row[i] * p.weights[i]
  return heaviside(a)

# Train the struct-based perceptron on X (rows read through X.iterrows()) and
# labels y. lr is the learning rate; epoch caps the number of passes, and with
# epoch=0 training runs until a pass ends with zero errors.
# Returns a list with one (epoch number, squared-error sum, weights, bias)
# tuple per pass.
cpdef train(X, y, float lr=0.01, int epoch=0):
  cdef t_perceptron p
  cdef int i

  p.n_weights = X.shape[1]
  # Manually allocated; released by the free() call just before returning.
  # That free() is skipped if anything inside the loop raises.
  p.weights = <float*>malloc(p.n_weights * sizeof(float))

  # Fixed seed for the C random generator used to draw the starting weights.
  # NOTE(review): rand() and RAND_MAX are C ints — if this cell is compiled
  # with integer-division semantics the quotients truncate to 0; confirm the
  # language level in use.
  srand(1)
  for i in range(p.n_weights):
    p.weights[i] = rand() / RAND_MAX / 1000
  p.bias = rand() / RAND_MAX / 1000

  cdef int iter = 0
  perf = []

  cdef int sumError
  cdef int pred
  cdef int error

  while True:
    iter += 1
    sumError = 0

    for index, row in X.iterrows():
      pred = predict(p, row)
      # y is looked up with the row label that iterrows() yields.
      error = y[index] - pred
      sumError += error**2
  
      p.bias += lr * error
      for i in range(p.n_weights):
        p.weights[i] += lr * error * row[i]

    # Snapshot taken after this pass's updates.
    perf.append((iter, sumError, [w for w in p.weights[:p.n_weights]], p.bias))

    if (epoch == 0 and sumError == 0) or iter == epoch:
      break

  free(p.weights)

  return perf

In [0]:
# Runs the compiled perceptron `train` until a pass finishes without errors.
perf = train(X, y, lr=0.01, epoch=0)
print(perf[-1])

In [0]:
# The Cython history has the same tuple layout, so the Python plotter is reused.
plot_performance(perf, scaled, 8, 3)

ADALINE

In [0]:
%%cython

from libc.stdlib cimport malloc, free, rand, srand, RAND_MAX
from cpython cimport array
import array

import math as m

# C-level Adaline state: a malloc'ed array of n_weights feature weights plus
# a separate bias term.
cdef struct s_adaline:
  float *weights
  float bias
  int n_weights
ctypedef s_adaline t_adaline

# Init: seed the C random generator, then set the bias and allocate and fill
# one weight per column of X. The caller must release the array with
# deinit_adaline().
# NOTE(review): rand() / RAND_MAX divides two C ints — confirm the cell is
# compiled with true-division semantics, otherwise the values truncate to 0.
cdef init_adaline(t_adaline *a, X, int seed=1):
  srand(seed)
  a[0].bias = rand() / RAND_MAX / 10
  a[0].n_weights = X.shape[1]
  a[0].weights = <float*>malloc(X.shape[1] * sizeof(float))
  for i in range(X.shape[1]):
    a[0].weights[i] = rand() / RAND_MAX / 10

# Deinit: release the weight array allocated by init_adaline().
cdef deinit_adaline(t_adaline *a):
  free(a.weights)

# Sigmoid: logistic function of `linear`. The two branches are algebraically
# the same; each one only passes a non-positive value to exp().
cdef float sigmoid(linear):
  if (linear < 0):
    return 1 - 1 / (1 + m.exp(linear))
  else:
    return 1 / (1 + m.exp(-linear))

# Activate: sigmoid of bias + sum(row[i] * weights[i]) for one sample.
# `row` is an untyped Python object indexed by position 0..n_weights-1.
cdef float activate(t_adaline *a, row):
  cdef int i
  linear = a.bias
  for i in range(a.n_weights):
    linear += row[i] * a.weights[i]
  return sigmoid(linear)

# One online pass: after every row, move the bias and weights by lr * error
# (times the feature value for weights). Returns the accumulated cost, i.e.
# the sum of error**2 / 2 over the rows, each error measured before its update.
cdef online_epoch(t_adaline *a, X, y, float lr):
  cost = 0
  cdef int i
  cdef float output
  cdef float error

  for index, row in X.iterrows():
    output = activate(a, row)
    # y is looked up with the row label that iterrows() yields.
    error = y[index] - output
    cost += error**2 / 2
    a.bias += lr * error
    for i in range(a.n_weights):
      a.weights[i] += lr * row[i] * error
  return cost


# Train: fit the struct-based Adaline on X / y and return one
# (epoch number, cost, weights, bias) tuple per pass.
# online == 0 selects batch mode (one update per pass, history entry recorded
# before the update); any other value selects online mode (history entry
# recorded after the pass).
# Training stops when the pass counter equals `epoch`, or when the cost changes
# by less than 0.0001 between two passes — the second check applies for every
# value of `epoch`.
# NOTE(review): this cell defines `train` again; presumably it replaces the
# perceptron `train` from the earlier Cython cell in the notebook namespace.
cpdef train(X, y, float lr=0.01, int epoch=0, int online=0):
  cdef t_adaline a
  init_adaline(&a, X)

  cdef int iter = 0
  perf = []

  cdef int i
  cdef int ii
  cdef int z

  # One error slot per row of X; released by free(errors) before returning.
  cdef float *errors = <float*>malloc(X.shape[0] * sizeof(float))

  while True:
    iter += 1

    if online == 0: # Batch
      # errors[index] = y[index] - output. The row label is used as the array
      # position, which assumes X carries a 0..n-1 index.
      for index, row in X.iterrows():
        errors[index] = activate(&a, row) * -1
        errors[index] += y[index]
      
      # Cost: half the sum of squared errors.
      cost = 0
      for i in range(X.shape[0]):
        cost += pow(errors[i], 2)
      cost /= 2

      perf.append((iter, cost, [w for w in a.weights[:a.n_weights]], a.bias))

      # Bias step uses the plain error sum; each weight step uses the errors
      # weighted by that feature's column.
      sumError = 0
      for z in range(X.shape[0]):
        sumError += errors[z]
      a.bias += lr * sumError
      for i in range(a.n_weights):
        j = 0
        for ii in range(X.shape[0]):
          j += X.iat[ii, i] * errors[ii]
        a.weights[i] += lr * j

    else: # Online
      cost = online_epoch(&a, X, y, lr)
      perf.append((iter, cost, [w for w in a.weights[:a.n_weights]], a.bias))
    
    if epoch == iter or (iter != 1 and abs(cost - perf[-2][1]) < 0.0001):
      break

  deinit_adaline(&a)
  free(errors)

  return perf

In [0]:
# online=1 selects the per-sample update path of the compiled Adaline `train`.
perf_adaline = train(X, y, online=1)
print(perf_adaline[-1])

In [0]:
# Same plotter as before, fed with the Cython Adaline history.
plot_performance(perf_adaline, scaled, 8, 3)

# VI.2 Do perceptrons dream of electric sheep?

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload, label and scale the wine table so this section runs on its own.
data_url = 'https://raw.githubusercontent.com/SashaKryzh/ft_sommelier_resources/master/winequality-red.csv'
wine_data = pd.read_csv(data_url, delimiter=';')

good_wine = 6
wine_data['good'] = wine_data['quality'].apply(lambda x: 1 if x >= good_wine else 0)

scaled = std(wine_data, ['alcohol', 'pH'])
data = prepeare_data(scaled, ['alcohol', 'pH'], 7, 4)
X, y = get_Xy(data)

# Online Adaline with lr=0.001; epoch stays 0, so training ends when the cost
# changes by less than 0.0001 between epochs. `perf` feeds the animation below.
a = Adaline()
perf = a.train(X, y, lr=0.001, online=True)

print(perf[-1])
plot_performance(perf, scaled, 7, 4)

In [0]:
from google.colab import files
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from IPython.display import HTML

def plot_performance_animated(performance, wine_data, good_thresh, bad_thresh, save_plot=False, x_f='alcohol', y_f='pH'):
  """Build an animation of the training curve and decision boundary, one frame per epoch.

  Args:
    performance: list of (epoch number, error, weights, bias) tuples as
      produced by the train functions in this notebook. weights[0] is paired
      with `x_f` and weights[1] with `y_f`.
    wine_data: frame holding `x_f`, `y_f`, 'quality' and 'good'.
    good_thresh: wines with quality >= this value are drawn green.
    bad_thresh: wines with quality <= this value are drawn red.
    save_plot: when true, encode performance.mp4 with ffmpeg and pass it to
      the Colab `files.download` helper.
    x_f, y_f: column names used for the horizontal and vertical axis.

  Returns:
    The `FuncAnimation` object; as the last expression of a cell it renders
    as an interactive JS/HTML player.
  """
  # Imported here because pandas no longer re-exports numpy as `pd.np`.
  import numpy as np

  # Selecting data
  data = prepeare_data(wine_data, [x_f, y_f], good_thresh, bad_thresh)
  bad_wines = data.loc[(data.quality <= bad_thresh)]
  good_wines = data.loc[(data.quality >= good_thresh)]

  # Creating plot; closed right away, presumably so the notebook shows only
  # the animation and not an extra static figure.
  fig, axes = plt.subplots(ncols=2, figsize=(25, 10))
  plt.close()

  #
  # Errors as a function of epoch
  #

  axes[0].set_title('Errors as a function of epoch')
  axes[0].set_xlabel('epoch')
  axes[0].set_ylabel('Classification errors')

  epochs = [entry[0] for entry in performance]
  errors = [entry[1] for entry in performance]

  #
  # Decision boundary
  #

  axes[1].set_xlabel(x_f)
  axes[1].set_ylabel(y_f)

  # Setting (x, y) plot limits
  x_low, x_high = min(data[x_f]) - 0.1, max(data[x_f]) + 0.1
  y_low, y_high = min(data[y_f]) - 0.1, max(data[y_f]) + 0.1
  axes[1].set_xlim(x_low, x_high)
  axes[1].set_ylim(y_low, y_high)

  axes[0].set_xlim(0, performance[-1][0] + 5)
  axes[0].set_ylim(0, max(errors) + 10)

  # Artists updated on every frame
  fe ,= axes[0].plot([], [], lw=3)
  db ,= axes[1].plot([], [], 'b--', label = "Decision Boundary", lw=2)
  axes[1].scatter(bad_wines[x_f], bad_wines[y_f], c='red', label="bad wines (<%i score)" % (bad_thresh + 1))
  axes[1].scatter(good_wines[x_f], good_wines[y_f], c='green', label="good wines (>%i score)" % (good_thresh - 1))

  # Legend
  axes[1].legend(loc='upper left', bbox_to_anchor=(1, 1))
  axes[1].margins(0)

  # Animation
  def init():
    fe.set_data([], [])
    db.set_data([], [])
    return db, fe

  plot_x = np.array([x_low, x_high])
  def animate(i):
    # Recorded epoch numbers (1..i+1) keep the curve aligned with the axis.
    fe.set_data(epochs[:i + 1], errors[:i + 1])

    p = performance[i]
    w_y = p[2][1]
    if w_y == 0:
      # Vertical boundary cannot be written as y(x); flat line at y = 0.
      plot_y = [0] * len(plot_x)
    else:
      plot_y = (-1/w_y) * (p[2][0] * plot_x + p[3])
    db.set_data(plot_x, plot_y)

    # Drop the previous frame's scatter and shading. Each artist is removed
    # individually because `Axes.collections` can no longer be cleared in
    # place on current Matplotlib.
    for artist in list(axes[1].collections):
      artist.remove()
    axes[1].set_title('Decision boundary on epoch: %i' % (i + 1))
    axes[1].scatter(bad_wines[x_f], bad_wines[y_f], c='red', label="bad wines (<%i score)" % (bad_thresh + 1))
    axes[1].scatter(good_wines[x_f], good_wines[y_f], c='green', label="good wines (>%i score)" % (good_thresh - 1))

    # The model answers "good" where w_x*x + w_y*y + bias >= 0: above the
    # line when w_y > 0, below it otherwise.
    if w_y > 0:
      below_color, above_color = 'red', 'green'
    else:
      below_color, above_color = 'green', 'red'
    axes[1].fill_between(plot_x, plot_y, y_low, color=below_color, alpha=0.2)
    axes[1].fill_between(plot_x, plot_y, y_high, color=above_color, alpha=0.2)

    return db, fe

  # One frame per history entry, since animate() indexes `performance` by frame.
  anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len(performance), blit=True)

  if save_plot:
    anim.save("performance.mp4", "ffmpeg")
    files.download("performance.mp4")

  # Note: below is the part which makes it work on Colab
  rc('animation', html='jshtml')
  return anim

In [0]:
# save_plot=True also encodes performance.mp4 and triggers the Colab download;
# the returned animation is the cell output.
plot_performance_animated(perf, scaled, 7, 4, save_plot=True)

# VI.3 Dimensional traveler

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload, label and scale the wine table so this section runs on its own.
data_url = 'https://raw.githubusercontent.com/SashaKryzh/ft_sommelier_resources/master/winequality-red.csv'
wine_data = pd.read_csv(data_url, delimiter=';')

good_wine = 6
wine_data['good'] = wine_data['quality'].apply(lambda x: 1 if x >= good_wine else 0)

# Three standardised features this time, so the decision boundary is a plane.
scaled = std(wine_data, ['alcohol', 'residual sugar', 'volatile acidity'])
data = prepeare_data(scaled, ['alcohol', 'residual sugar', 'volatile acidity'], 7, 4)
X, y = get_Xy(data)

a = Adaline()
perf = a.train(X, y, lr=0.01, online=True)

print(perf[-1])
# NOTE(review): plot_performance defaults to x_f='alcohol', y_f='pH' and reads
# only the first two weights, but `perf` holds weights for alcohol, residual
# sugar and volatile acidity (and pH is not standardised here) — the 2-D
# boundary drawn by this call does not match the trained model; confirm intent.
plot_performance(perf, scaled, 7, 4)

In [0]:
from google.colab import files
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def plot_boundary_3d(performance, ddata, good_thresh, bad_thresh, epoch=-1, save_plot=False, features=('alcohol', 'residual sugar', 'volatile acidity')):
  """Plot good/bad wines in 3-D together with the model's decision plane.

  Args:
    performance: list of (epoch number, error, weights, bias) tuples from a
      model trained on exactly the three `features`, in the same order.
    ddata: frame holding the three feature columns, 'quality' and 'good'.
    good_thresh: wines with quality >= this value are drawn green.
    bad_thresh: wines with quality <= this value are drawn red.
    epoch: 1-based epoch whose plane is drawn; values below 1 select the
      last recorded epoch.
    save_plot: when true, write boundary3D.png and pass it to the Colab
      `files.download` helper.
    features: names of the x, y and z columns.
  """
  # Imported here because pandas no longer re-exports numpy as `pd.np`.
  import numpy as np

  data = prepeare_data(ddata, list(features), good_thresh, bad_thresh)
  bad_wines = data.loc[(data.quality <= bad_thresh)]
  good_wines = data.loc[(data.quality >= good_thresh)]

  fig = plt.figure(figsize=(10, 10))
  # add_subplot attaches the 3-D axes to the figure; a bare Axes3D(fig) is no
  # longer added automatically on current Matplotlib and leaves it blank.
  ax = fig.add_subplot(projection='3d')

  ax.set_xlabel(features[0])
  ax.set_ylabel(features[1])
  ax.set_zlabel(features[2])
  epoch = epoch if epoch > 0 else performance[-1][0]
  ax.set_title('Decision boundary on epoch: {}'.format(epoch))

  # The first three columns of `data` are the features, in the order given.
  ax.scatter(bad_wines.iloc[:, 0], bad_wines.iloc[:, 1], bad_wines.iloc[:, 2], c='red')
  ax.scatter(good_wines.iloc[:, 0], good_wines.iloc[:, 1], good_wines.iloc[:, 2], c='green')

  ax.set_xlim(min(data[features[0]]), max(data[features[0]]))
  ax.set_ylim(min(data[features[1]]), max(data[features[1]]))
  ax.set_zlim(min(data[features[2]]), max(data[features[2]]))

  # Plane w1*x + w2*y + w3*z + bias = 0, solved for z on an (x, y) grid.
  w1, w2, w3 = performance[epoch - 1][2]
  bias = performance[epoch - 1][3]
  xs = np.linspace(data[features[0]].min() - 0.1, data[features[0]].max() + 0.1)
  ys = np.linspace(data[features[1]].min() - 0.1, data[features[1]].max() + 0.1)
  xx, yy = np.meshgrid(xs, ys)
  zs = (-w1 * xx - w2 * yy - bias) / w3

  # Pass the 2-D grids that zs was computed on. The 1-D xs/ys would be
  # broadcast row-wise, pairing every z value with the wrong y coordinate.
  ax.plot_surface(xx, yy, zs)

  if save_plot:
    plt.savefig("boundary3D.png", dpi=200)
    files.download("boundary3D.png")

  plt.show()

In [0]:
# Uses the default feature triple, matching the columns `perf` was trained on.
plot_boundary_3d(perf, scaled, 7, 4, save_plot=False)