In [57]:
# Import all the necessary libraries
import torch
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [127]:
# Defining Multi-Layered Perceptron (MLP) Model
class mlp():
  """Multi-layer perceptron classifier implemented directly on raw tensors.

  Hidden layers use ReLU; the output layer uses softmax. Parameters are plain
  tensors created with ``requires_grad=True`` and are updated by per-sample
  stochastic gradient descent in ``fit``.

  Attributes:
    input_size: number of input features.
    output_size: number of output classes.
    hidden_size: sequence with the width of each hidden layer (may be empty).
    hidden_layers: number of hidden layers.
    weights: list of weight matrices, one per layer, shaped (fan_in, fan_out).
    bias: list of bias vectors, one per layer, shaped (fan_out,).
  """

  def __init__(self, input_size, output_size, hidden_size=(1,), random_seed=None):
    """Create and randomly initialise the network parameters.

    Args:
      input_size: number of input features.
      output_size: number of output classes.
      hidden_size: sequence of hidden-layer widths; an empty sequence gives a
        single linear + softmax layer, and a bare int is treated as one
        hidden layer of that width.
      random_seed: if not None, passed to ``torch.manual_seed`` (this seeds
        PyTorch's global generator) before the parameters are drawn.
    """
    if random_seed is not None:
      torch.manual_seed(random_seed)

    # Accept a bare int as shorthand for a single hidden layer.
    if isinstance(hidden_size, int):
      hidden_size = (hidden_size,)

    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size = hidden_size
    self.hidden_layers = len(hidden_size)
    self.weights = []
    self.bias = []

    # Initialise weights and biases with requires_grad=True so autograd tracks
    # them. Chaining all layer widths in one sequence also covers the case of
    # no hidden layers. Draw order (W then b, layer by layer) is kept stable so
    # a given seed always yields the same initial parameters.
    layer_sizes = (self.input_size, *self.hidden_size, self.output_size)
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
      self.weights.append(torch.randn(fan_in, fan_out, requires_grad=True))
      self.bias.append(torch.randn(fan_out, requires_grad=True))

  def _logits(self, x):
    """Run x through every layer and return the pre-softmax output scores."""
    for w, b in zip(self.weights[:-1], self.bias[:-1]):
      x = self.relu(self.linear(x, w, b))
    return self.linear(x, self.weights[-1], self.bias[-1])

  def forward(self, x):
    """Return the class probabilities (softmax over the last dim) for x."""
    return self.softmax(self._logits(x))

  # Defining Linear Function
  def linear(self, x, w, b):
    """Affine map ``x @ w + b``."""
    return x @ w + b

  # Defining ReLU Activation Function
  def relu(self, x):
    """Element-wise max(x, 0)."""
    return torch.max(x, torch.zeros_like(x))

  # Defining Softmax Activation Function
  def softmax(self, x):
    """Softmax over the last dimension.

    The row maximum is subtracted first so that ``exp`` cannot overflow.
    """
    shifted = x - torch.max(x, dim=-1, keepdim=True)[0]
    exps = torch.exp(shifted)
    return exps / exps.sum(dim=-1, keepdim=True)

  def _log_softmax(self, x):
    """Log of the softmax over the last dimension, computed via log-sum-exp.

    After shifting by the maximum the sum of exponentials is at least 1, so
    the logarithm is always finite, unlike ``log(softmax(x))`` which yields
    ``-inf`` when a probability underflows to zero.
    """
    shifted = x - torch.max(x, dim=-1, keepdim=True)[0]
    return shifted - torch.log(torch.exp(shifted).sum(dim=-1, keepdim=True))

  def fit(self, X, y, epochs=100, lr=0.01):
    """Train with per-sample SGD on the cross-entropy loss.

    Args:
      X: samples indexed along the first dimension; ``X[i]`` is one sample.
      y: class indices, ``y[i]`` being the target class of ``X[i]``.
      epochs: number of full passes over the data.
      lr: learning rate used for every parameter update.

    Raises:
      ValueError: if X and y do not contain the same number of samples.
    """
    n = X.shape[0]
    if len(y) != n:
      raise ValueError(f"X has {n} samples but y has {len(y)} labels")

    for _ in range(epochs):
      # Using SGD to update weights and bias: one update per sample
      for i in range(n):
        # Cross-entropy loss taken from the log-softmax of the logits, so a
        # saturated softmax cannot produce log(0) and NaN gradients.
        log_probs = self._log_softmax(self._logits(X[i]))
        loss = -log_probs[y[i]]

        loss.backward()  # back-propagate gradients to weights and biases

        with torch.no_grad():  # the update itself must not be tracked
          for param in self.weights + self.bias:
            param -= lr * param.grad  # gradient-descent step
            param.grad.zero_()  # reset, otherwise gradients accumulate

  def predict(self, X):
    """Return a long tensor holding the most probable class index per sample."""
    with torch.no_grad():
      labels = [self.forward(x).argmax().item() for x in X]
      return torch.tensor(labels, dtype=torch.long)

  def predict_proba(self, X):
    """Return per-sample class probabilities rounded to two decimals.

    Because of the rounding, a row is not guaranteed to sum to exactly one.
    An empty X yields an empty tensor of shape (0, output_size).
    """
    with torch.no_grad():
      probs = [self.forward(x) for x in X]
      if not probs:
        # torch.stack cannot handle an empty list
        return torch.empty((0, self.output_size))
      return torch.round(torch.stack(probs) * 100) / 100

In [128]:
# Fetch the wine dataset: X holds the features, y the class labels
wine = load_wine()
X = wine.data
y = wine.target

# Hold out 20% of the samples for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise features; the scaler is fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [129]:
# Shapes of the four splits, in the order: X_train, y_train, X_test, y_test
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((142, 13), (142,), (36, 13), (36,))

In [130]:
# Display the held-out class labels
y_test

array([0, 0, 2, 0, 1, 0, 1, 2, 1, 2, 0, 2, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 2, 2, 2, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0])

In [131]:
# Convert the training split to tensors: float32 features, int64 class labels.
# torch.as_tensor keeps this cell safe to re-run: an input that is already a
# tensor of the requested dtype is returned as-is instead of triggering the
# copy-construct warning that torch.tensor(tensor) emits.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.long)

In [132]:
# Convert the test split to tensors: float32 features, int64 class labels.
# torch.as_tensor keeps this cell safe to re-run: an input that is already a
# tensor of the requested dtype is returned as-is instead of triggering the
# copy-construct warning that torch.tensor(tensor) emits.
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.long)

In [133]:
# Initialising model: layer widths at both ends are read from the data instead
# of being hard-coded, so the cell stays correct if the dataset changes.
model_wine = mlp(
    input_size=X_train.shape[1],         # one input per feature column
    hidden_size=(8, 4),
    output_size=len(wine.target_names),  # one output per class
    random_seed=0,
)
model_wine.fit(X_train, y_train, epochs=1000, lr=0.005)  # Training model

In [134]:
# Predict a class index for every test sample, then show the predictions
# next to the true labels for a side-by-side comparison
y_pred = model_wine.predict(X_test)
y_pred, y_test

(tensor([0, 0, 2, 0, 1, 0, 1, 2, 1, 2, 0, 2, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 2,
         2, 2, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0]),
 tensor([0, 0, 2, 0, 1, 0, 1, 2, 1, 2, 0, 2, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 2,
         2, 2, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0]))

In [135]:
# Number of test samples whose predicted class differs from the true class
torch.ne(y_pred, y_test).sum()

tensor(0)

In [136]:
# Per-class probabilities for every test sample
# (predict_proba rounds them to two decimal places)
y_pred_proba = model_wine.predict_proba(X_test)
y_pred_proba

tensor([[1.0000, 0.0000, 0.0000],
        [1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.0000],
        [1.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [1.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 0.0000, 1.0000],
        [0.0100, 0.9900, 0.0000],
        [0.0000, 0.0000, 1.0000],
        [1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.0000],
        [0.8900, 0.1100, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [1.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [1.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [1.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 0.0000, 1.0000],
        [0.0000, 0.0000, 1.0000],
        [0.0000, 0.0000, 1.0000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [0.0000, 1.0000, 0.0000],
        [1.000

In [137]:
# Sanity check: every row of class probabilities should add up to one
torch.sum(y_pred_proba, dim=-1)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])