# Import data

In [1]:
import pandas as pd

# Read the red-wine quality table from the CSV file and display it.
DATA_FILE = 'winequality-red.csv'
df = pd.read_csv(DATA_FILE)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [None]:
# Column names, dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
import seaborn as sns

# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Number of samples per quality score.
sns.countplot(x='quality', data=df)

In [None]:
import matplotlib.pyplot as plt

# One histogram (with a KDE overlay) per column, skipping the last column.
feature_columns = df.columns[:-1]
for feature_name in feature_columns:
    plt.figure(figsize=(10, 4))
    sns.histplot(df[feature_name], kde=True)

In [None]:
# Add other steps

## Data Manipulation (prep for model)

In [None]:
# Split the data: every column except the last is a feature,
# the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X

In [None]:
# Binarise the target: 1 when the quality score is above 5, otherwise 0.
# The labels are derived from df (not from y itself) so the cell is idempotent:
# thresholding the already-binary y a second time would turn every label into 0.
y = (df.iloc[:, -1] > 5).astype(int)
y

In [None]:
from sklearn.model_selection import train_test_split
# First split (fixed seed so the partition is repeatable):
#   - 70% of the rows become the training set
#   - 30% go to a temporary hold-out set that is split again in the next cell
train_x, ee_x, train_y, ee_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
train_x.shape

In [None]:
# Second split: halve the temporary hold-out set, so that
# 15% of all rows go to validation and 15% go to testing.
valid_x, test_x, valid_y, test_y = train_test_split(
    ee_x,
    ee_y,
    test_size=0.5,
    random_state=42,
)
valid_x.shape

In [None]:
# Shape of the held-out test features.
test_x.shape

In [None]:
# Display the training features before they are converted to a tensor below.
train_x

## Move data to PyTorch ecosystem

In [None]:
import torch

# Convert the training features to a tensor via their NumPy representation.
train_x_array = train_x.to_numpy()
train_x = torch.tensor(train_x_array)
train_x

In [None]:
# Cast the features to 32-bit floats, the default dtype of torch.nn.Linear weights.
train_x = train_x.to(torch.float32)
train_x

In [None]:
# Same conversion for the test and validation features: NumPy -> tensor -> float32.
test_x = torch.tensor(test_x.to_numpy()).float()
valid_x = torch.tensor(valid_x.to_numpy()).float()

In [None]:
# Shape of the validation features after the tensor conversion.
valid_x.shape

In [None]:
# CrossEntropyLoss expects class-index targets of dtype long,
# so all three label sets are converted the same way.
train_y, valid_y, test_y = (
    torch.tensor(labels.to_numpy(), dtype=torch.long)
    for labels in (train_y, valid_y, test_y)
)
train_y

In [None]:
from torch.utils.data import Dataset

class BasicDataset(Dataset):
    """Minimal map-style dataset pairing feature rows with their labels.

    Args:
        x: Indexable, sized container of features (e.g. a 2-D tensor).
        y: Indexable, sized container of labels, one per row of ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` do not contain the same number of items.
    """

    def __init__(self, x, y):
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError part-way through iterating a DataLoader.
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )
        self.x = x
        self.y = y

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

In [None]:
# Wrap each (features, labels) pair in a BasicDataset.
train_data, valid_data, test_data = (
    BasicDataset(features, labels)
    for features, labels in (
        (train_x, train_y),
        (valid_x, valid_y),
        (test_x, test_y),
    )
)
# Separate dataset object over the training split; this is the one fed to the DataLoader.
wine_basic_train = BasicDataset(train_x, train_y)

In [None]:
from torch.utils.data import DataLoader

# Mini-batches for SGD. shuffle=True re-orders the samples every epoch so the
# batches are not always composed of the same rows in the same order; the seeded
# generator keeps that shuffling repeatable from run to run.
train_loader = DataLoader(
    dataset=wine_basic_train,
    batch_size=11,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

# Creating a basic MLP

In [None]:
# Define the MLP model
class BasicMLP(torch.nn.Module):
    """Multi-layer perceptron with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        n_inputs: Number of input features.
        hidden_size: Number of units in the hidden layer.
        n_outputs: Number of output values (one raw score per class).
    """

    def __init__(self, n_inputs, hidden_size, n_outputs):
        super().__init__()

        # Keep the layer sizes around for inspection.
        self.inputs, self.hidden, self.outputs = n_inputs, hidden_size, n_outputs

        self.fc1 = torch.nn.Linear(n_inputs, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, n_outputs)
        self.relu = torch.nn.ReLU()

    def forward(self, X):
        """Return the raw (un-normalised) output scores for the batch ``X``."""
        hidden_activation = self.relu(self.fc1(X))
        return self.fc2(hidden_activation)

In [None]:
# Number of feature columns; used as the input width of the network below.
train_x.shape[1]

In [None]:
# Seed PyTorch's RNG so the random weight initialisation (and therefore the
# training run) is repeatable, matching the fixed random_state used for the splits.
torch.manual_seed(42)
# Two outputs: one raw score per class of the binarised quality target.
model = BasicMLP(n_inputs=train_x.shape[1], hidden_size=5, n_outputs=2)

In [None]:
# Show the model's layer summary.
model

In [None]:
# Loss: cross entropy over the two raw class scores.
criterion = torch.nn.CrossEntropyLoss()
# Optimiser: plain SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.0001,
    momentum=0.5,
)

In [None]:
import numpy as np

# Training loop

epochs = 100

loss_valid = []  # validation loss, one entry per epoch
loss_train = []  # mean of the batch losses, one entry per epoch

for epoch in range(epochs):
    model.train()  # Puts the model in training mode
    epoch_loss_train = []
    for features, targets in train_loader:
        optimizer.zero_grad()  # Clear gradients left over from the previous step
        # Call the module itself rather than .forward() so registered hooks run
        output = model(features)
        loss = criterion(output, targets)
        loss.backward()  # Back-propagate the loss to get the parameter gradients
        optimizer.step()  # Update the parameters from those gradients
        epoch_loss_train.append(loss.item())
    # Training loss for the epoch: average over its batches
    loss_train.append(sum(epoch_loss_train) / len(epoch_loss_train))

    # Per epoch, get validation performance
    model.eval()  # Puts the model in evaluation mode
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time without changing the computed values.
    with torch.no_grad():
        valid_hyp = model(valid_x)
        loss_valid.append(criterion(valid_hyp, valid_y).item())
        c = torch.argmax(valid_hyp, dim=1)  # predicted class per validation row
    valid_accuracy = (c == valid_y).sum().item() / valid_x.shape[0]
    print('Epoch', epoch, 'train_loss', loss_train[-1], 'valid_loss', loss_valid[-1],
          'validation accuracy:', valid_accuracy)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_loss_curves(epochs, loss_train, loss_valid):
    """Plot the training and validation loss against the epoch axis.

    Args:
        epochs: Sequence of x-axis values, one per recorded loss.
        loss_train: Training loss values.
        loss_valid: Validation loss values.
    """
    curves = (
        ('Train Loss', loss_train),
        ('Validation Loss', loss_valid),
    )
    for curve_label, curve_values in curves:
        plt.plot(epochs, curve_values, label=curve_label)
    plt.xlabel('Epochs')
    plt.ylabel('Cross Entropy Loss')
    plt.legend()
    plt.show()

# x-axis: epoch numbers 1..epochs
epoch_numbers = np.arange(1, epochs + 1)
plot_loss_curves(epoch_numbers, loss_train, loss_valid)

In [None]:
# Validation accuracy of the final epoch: share of rows whose highest-scoring
# class equals the true label.
c = valid_hyp.argmax(dim=1)
n_correct = (c == valid_y).sum().item()
valid_acc = n_correct / valid_y.shape[0]
valid_acc

# Making changes to basic MLP

In [None]:
# The loaded frame has no 'type' column (its columns are the measurements plus
# 'quality'), so df['type'] raises a KeyError. Inspect how the samples are
# distributed over the quality scores instead.
df['quality'].value_counts().sort_index()

In [None]:
# Add steps

# Optional: Implementing the MLP

In [None]:
# Add steps