# Training a Classifier on the *Salammbô* Dataset with PyTorch
Author: Pierre Nugues

We use three classes: French, English, and German

We first need to import some modules

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

### Reading the dataset
We can read the data from a file with the svmlight format or directly create numpy arrays

In [2]:
# Feature matrix: one sample per row, two integer counts per sample.
# (Presumably per-chapter character counts from the Salammbô corpus -- TODO confirm
# the exact meaning of each column.)
# Rows 0-14 receive label 0 and rows 15-29 receive label 1 in the `y` vector.
X = np.array([
    # samples 0-14
    [35680, 2217], [42514, 2761], [15162, 990],
    [35298, 2274], [29800, 1865], [40255, 2606],
    [74532, 4805], [37464, 2396], [31030, 1993],
    [24843, 1627], [36172, 2375], [39552, 2560],
    [72545, 4597], [75352, 4871], [18031, 1119],
    # samples 15-29
    [36961, 2503], [43621, 2992], [15694, 1042],
    [36231, 2487], [29945, 2014], [40588, 2805],
    [75255, 5062], [37709, 2643], [30899, 2126],
    [25486, 1784], [37497, 2641], [40398, 2766],
    [74105, 5047], [76725, 5312], [18317, 1215],
])

We add German data and we adjust `y`

In [3]:
# German samples, laid out like the rows of X (two integer counts per sample).
# NOTE(review): the rows [29800, 1865], [31030, 1993] and [72545, 4597] are
# identical to rows that X already contains under label 0 -- confirm they are
# genuine German measurements and not copy/paste leftovers.
X_de = np.array([
    [37599, 1771], [44565, 2116], [16156, 715],
    [37697, 1804], [29800, 1865], [42606, 2146],
    [78242, 3813], [40341, 1955], [31030, 1993],
    [26676, 1346], [39250, 1902], [41780, 2106],
    [72545, 4597], [79195, 3988], [19020, 928],
])

# Append the German rows below the rows already in X.
X = np.vstack((X, X_de))

# One label per row, in row order: fifteen 0s, fifteen 1s, fifteen 2s
# (2 marks the German rows appended above).
y = np.repeat(np.array([0, 1, 2]), 15)

In [4]:
# Random permutation of the sample indices, used to shuffle X and y together.
# A locally seeded generator makes the order identical on every
# Restart & Run All; the size comes from the labels instead of a literal 45.
rng = np.random.default_rng(1337)
indices = rng.permutation(len(y))
indices

array([40, 19, 34, 12,  3, 29, 10, 20, 35, 41,  2, 13, 31, 39, 18, 26, 25,
       38, 23, 44, 30, 32, 36,  5, 24, 33, 43,  9, 16, 27, 28,  6, 14,  7,
        1, 11,  0,  8, 37, 21, 22, 15, 42,  4, 17])

In [5]:
# Reorder features and labels with the same permutation so that every row
# of X stays aligned with its label in y.
X, y = X[indices], y[indices]
y

array([2, 1, 2, 0, 0, 1, 0, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2,
       2, 0, 1, 2, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 2, 0,
       1])

## Scaling the Data
Scaling and normalizing the inputs usually matters a great deal with neural networks. We use sklearn transformers, which expose two main methods: `fit()` and `transform()`.

### Normalizing

In [6]:
from sklearn.preprocessing import Normalizer

# Rescale each sample (row) to unit norm; Normalizer uses the L2 norm by default.
normalizer = Normalizer()
normalizer.fit(X)
X_norm = normalizer.transform(X)
X_norm[:4]

array([[0.99882795, 0.0484018 ],
       [0.99774592, 0.06710504],
       [0.99804736, 0.06246169],
       [0.9979983 , 0.06324072]])

### Standardizing

In [7]:
from sklearn.preprocessing import StandardScaler

# Centre every column on zero and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_norm)
X_scaled = scaler.transform(X_norm)
X_scaled[:4]

array([[ 1.58883716, -1.60257285],
       [-0.73955946,  0.72879853],
       [-0.09088476,  0.15000209],
       [-0.19645636,  0.24710915]])

In [8]:
# Convert the NumPy arrays to PyTorch tensors: float32 features for the
# linear layers, int64 class indices for the cross-entropy loss.
X_scaled = torch.tensor(X_scaled, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)
y

tensor([2, 1, 2, 0, 0, 1, 0, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 0,
        1, 2, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 2, 0, 1])

## Creating a Model

We set a seed to have reproducible results

In [9]:
# Seed both NumPy and PyTorch. The layer weights and the DataLoader shuffling
# draw from PyTorch's generator, which np.random.seed() does not affect.
np.random.seed(1337)
torch.manual_seed(1337)

We create a classifier equivalent to a logistic regression. With PyTorch, the cross-entropy loss applies the softmax to the outputs itself, so we do not add an activation in the last layer.

The outputs are then called, rather improperly, logits. For a clarification on this terrible terminology, see here: https://stackoverflow.com/questions/41455101/what-is-the-meaning-of-the-word-logits-in-tensorflow/52111173#52111173 

In [10]:
class Model(nn.Module):
    """Single linear layer, i.e. a multinomial logistic regression.

    The layer returns raw class scores (logits); the softmax is left to the
    loss function or to the caller.

    Args:
        input_dim: number of input features per sample.
        n_classes: number of output classes. Defaults to 3, the value that
            was previously hard-coded.
    """

    def __init__(self, input_dim, n_classes=3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, n_classes)

    def forward(self, x):
        """Return the logits for a batch `x` whose last dimension is `input_dim`."""
        return self.fc1(x)

Or with one hidden layer

In [11]:
class Model2(nn.Module):
    """Feed-forward classifier with one ReLU hidden layer.

    The network returns raw class scores (logits); the softmax is left to the
    loss function or to the caller.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the hidden layer. Defaults to 10, the value
            that was previously hard-coded.
        n_classes: number of output classes. Defaults to 3, the value that
            was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim=10, n_classes=3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        """Return the logits for a batch `x` whose last dimension is `input_dim`."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

To try the network with one hidden layer, set `complex` to `True`; set it to `False` to use the plain logistic-regression model.

In [12]:
# Model selector: True builds Model2 (one hidden layer), False builds Model.
# NOTE: this name shadows Python's built-in `complex` type in the notebook namespace.
complex = True

In [13]:
# The number of input features is the number of columns of the scaled matrix.
input_dim = X_scaled.shape[1]

# Pick the architecture according to the `complex` flag.
model = Model2(input_dim) if complex else Model(input_dim)

# CrossEntropyLoss works on raw logits; plain SGD updates all model parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

## Fitting the Model

In [14]:
from torch.utils.data import DataLoader, TensorDataset

# Pair each feature row with its label, then serve the pairs one at a time
# (batch_size=1) in a freshly shuffled order at every epoch.
dataset = TensorDataset(X_scaled, y)
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)

We fit the model

In [15]:
n_epochs = 100      # number of passes over the training set
log_every = 10      # print the mean training loss every `log_every` epochs
n_samples = len(dataloader.dataset)

model.train()       # make sure the model is in training mode, even on a re-run
for epoch in range(n_epochs):
    loss_train = 0.0
    for X_scaled_batch, y_batch in dataloader:
        y_batch_pred = model(X_scaled_batch)
        loss = loss_fn(y_batch_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # `loss` is the mean over the batch: weight it by the batch size so the
        # running sum is a per-sample total for any batch_size, not only 1.
        loss_train += loss.item() * len(y_batch)
    if epoch % log_every == 0:
        print(loss_train / n_samples)
# Mean training loss of the last epoch.
print(loss_train / n_samples)

0.9317493650648329
0.5588877361267806
0.44014442722416586
0.3636842472996149
0.3173683798613234
0.28952390769393077
0.2709604480277954
0.25857085826185844
0.2482782617664068
0.23932697284294085
0.2336660644058914


### The weights

In [16]:
# Display the trained parameter tensors, keyed by layer name.
model.state_dict()

OrderedDict([('fc1.weight',
              tensor([[-3.5928e-01,  3.3271e-01],
                      [ 9.9213e-01, -9.1046e-01],
                      [ 1.3038e+00, -6.2517e-02],
                      [ 1.0074e-01,  6.1619e-01],
                      [ 1.4695e+00, -3.0734e-01],
                      [ 4.3213e-01, -7.4602e-01],
                      [-1.1247e+00,  3.5175e-01],
                      [-1.3331e+00,  1.0375e+00],
                      [ 1.0232e-01,  1.6791e-04],
                      [-1.0063e+00,  3.6245e-01]])),
             ('fc1.bias',
              tensor([-0.2754,  0.0943,  1.2864, -0.5712,  1.6248, -0.4456,  0.4527,  0.0055,
                      -0.2473,  0.0170])),
             ('fc2.weight',
              tensor([[-2.2804e-01, -7.0409e-01,  8.0370e-01,  2.7427e-01,  7.7009e-01,
                       -4.1772e-01, -1.0822e-01, -3.2071e-01, -1.5380e-03, -7.0085e-02],
                      [ 4.9562e-01, -2.2085e-01, -1.2069e+00, -2.0575e-01, -1.5458e+00,
             

## Prediction
### Probabilities

We compute the probability of each class for every sample of the training set.

In [17]:
# Switch the model to evaluation mode before predicting.
model.eval()

Model2(
  (fc1): Linear(in_features=2, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=3, bias=True)
)

The output with no activation

In [18]:
# Raw network outputs (logits) for every training sample; no softmax applied yet.
Y_pred_logits = model(X_scaled)
Y_pred_logits[:4]

tensor([[  3.9234, -12.9865,   9.7991],
        [  0.1311,   2.3797,  -2.7473],
        [  2.3955,  -3.6677,   1.3206],
        [  2.0336,  -2.6955,   0.6480]], grad_fn=<SliceBackward0>)

The probabilities

In [19]:
# Softmax over the class dimension turns each row of logits into probabilities.
Y_pred_proba = model(X_scaled).softmax(dim=-1)
Y_pred_proba[:4]

tensor([[2.7991e-03, 1.2680e-10, 9.9720e-01],
        [9.4966e-02, 8.9969e-01, 5.3394e-03],
        [7.4423e-01, 1.7318e-03, 2.5404e-01],
        [7.9428e-01, 7.0174e-03, 1.9870e-01]], grad_fn=<SliceBackward0>)

In [29]:
# True labels of the first four samples, to compare with the predicted probabilities.
y[:4]

tensor([2, 1, 2, 0])

We recompute it with matrices

In [20]:
# Parameter tensors as a list, in registration order: weight then bias of each layer.
m_params = list(model.parameters())

In [21]:
# Redo the forward pass by hand with matrix products to check the model output.
if complex:
    # hidden layer: ReLU(X W1^T + b1), then output layer: H W2^T + b2
    hidden = torch.relu(X_scaled @ m_params[0].T + m_params[1])
    logits_manual = hidden @ m_params[2].T + m_params[3]
else:
    # single layer: X W^T + b
    logits_manual = X_scaled @ m_params[0].T + m_params[1]
print(torch.softmax(logits_manual, dim=-1)[:4])

tensor([[2.7991e-03, 1.2680e-10, 9.9720e-01],
        [9.4966e-02, 8.9969e-01, 5.3394e-03],
        [7.4423e-01, 1.7318e-03, 2.5404e-01],
        [7.9428e-01, 7.0174e-03, 1.9870e-01]], grad_fn=<SliceBackward0>)


### Classes

In [22]:
# The predicted class is the index of the highest probability in each row.
y_pred = Y_pred_proba.argmax(dim=-1)
y_pred

tensor([2, 1, 0, 0, 0, 1, 0, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 0, 1, 2, 2, 2, 2, 0,
        1, 2, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 0, 0, 1])

## Loss
We recompute the loss

For one observation

In [23]:
# Cross-entropy of the first sample, computed by PyTorch from its logits.
loss_fn(Y_pred_logits[0], y[0])

tensor(0.0028, grad_fn=<NllLossBackward0>)

In [24]:
# Same value by hand: minus the log of the probability given to the true class.
-Y_pred_proba[0, y[0]].log()

tensor(0.0028, grad_fn=<NegBackward0>)

For the dataset

In [25]:
# Cross-entropy over the whole training set (CrossEntropyLoss averages over samples by default).
loss_fn(Y_pred_logits, y)

tensor(0.2286, grad_fn=<NllLossBackward0>)

In [26]:
# Same value by hand: pick the true-class probability of every sample,
# take the log, average, and negate.
-Y_pred_proba[torch.arange(len(y)), y].log().mean()

tensor(0.2286, grad_fn=<NegBackward0>)

## Evaluation

With sklearn

In [27]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the predictions against the true labels.
# classification_report returns a plain string, hence the print().
print(classification_report(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91        15
           1       1.00      1.00      1.00        15
           2       1.00      0.80      0.89        15

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.94      0.93      0.93        45



We computed the accuracy from the training set. This is not a good practice. We should use a dedicated test set instead.