## Wine Quality estimation

In [1]:
import torch
import os
import numpy as np
import pandas as pd

In [2]:
# Load the white-wine quality table. The file is semicolon-separated, so pass a
# single separator: `delimiter` is only an alias of `sep`, and recent pandas
# raises ValueError when both are supplied.
file = "../data/p1ch4/tabular-wine/winequality-white.csv"
wineq_df = pd.read_csv(file, sep=';')
# Keep the DataFrame and the array under separate names so re-running this
# cell never leaves `wineq_numpy` pointing at a DataFrame.
wineq_numpy = wineq_df.to_numpy()
wineq_numpy.shape, type(wineq_numpy)

((4898, 12), numpy.ndarray)

In [3]:
# Wrap the NumPy array in a tensor; the element dtype carries over unchanged.
wineq = torch.from_numpy(wineq_numpy)
(wineq.size(), wineq.dtype)

(torch.Size([4898, 12]), torch.float64)

In [4]:
# Separate the 'Y' variable (last column) from the input features.
data = wineq[:, :-1]  # every column except the last
target = wineq[:, -1].to(dtype=torch.int64)  # labels as int64, same as .long()
(data.shape, data.dtype, target.shape, target.dtype)

(torch.Size([4898, 11]), torch.float64, torch.Size([4898]), torch.int64)

In [5]:
# Ensure the labels are int64 (equivalent to .long()); the cast leaves an
# already-int64 tensor unchanged.
target = target.to(dtype=torch.int64)
target.dtype

torch.int64

We can achieve one-hot encoding using the **scatter_** method, which fills the tensor in place with values from a source tensor at the indices provided as arguments.
The arguments for **scatter_** are as follows:
* The dimension along which the following two arguments are specified
* A column tensor indicating the indices of the elements to scatter
* A tensor containing the elements to scatter, or a single scalar to scatter (1, in this case)

In [6]:
# One-hot encode the labels: start from an all-zero (rows, 10) tensor and, for
# each row, write 1.0 into the column whose index equals that row's label.
# scatter_ works in place and returns the same tensor, so the call can be chained.
target_onehot = torch.zeros(target.size(0), 10).scatter_(1, target[:, None], 1.0)
target_onehot.shape

torch.Size([4898, 10])

In [7]:
# Spot-check one row: the label next to its one-hot encoding.
sample_idx = 5
print(target[sample_idx], target_onehot[sample_idx])

tensor(6) tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])


In [8]:
# Per-column mean and standard deviation (reduced over the row dimension),
# used to normalize the features.
data_mean = data.mean(dim=0)
data_std = data.std(dim=0)
(data_mean.shape, data_std.shape, data.shape)

(torch.Size([11]), torch.Size([11]), torch.Size([4898, 11]))

In [9]:
# Standardize each column: subtract its mean, then divide by its standard
# deviation (both broadcast across the rows).
data_normalized = data.sub(data_mean).div(data_std)
data_normalized.shape

torch.Size([4898, 11])

In [13]:
# Boolean mask selecting the rows whose label is 3 or lower; summing the mask
# counts how many rows match.
bad_indexes = torch.le(target, 3)
(bad_indexes, bad_indexes.dtype, bad_indexes.sum())

(tensor([False, False, False,  ..., False, False, False]),
 torch.bool,
 tensor(20))