In [1]:
import torch, pickle, argparse
import numpy as np, pytorch_lightning as pl, matplotlib.pyplot as plt, eagerpy as ep
from models import ConvNet
from data_loader import load_train_data, load_test_data
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from tqdm.notebook import tqdm

In [2]:
# Pickled results of an earlier experiment run.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
filename = 'first_tests.pickle'

with open(filename, 'rb') as file:
    loaded = pickle.load(file)

# The first entry of the first record is arranged on a 3x6x3x2 grid; the cell at
# [2, -1, 1, 1] is a mapping that is unpacked into the hyper-parameter namespace.
data = np.reshape(np.array(loaded[0][0]), (3, 6, 3, 2))
hparams = argparse.Namespace(**data[2, -1, 1, 1])


TEST_PATH = "s2_mnist.gz"
TRAIN_PATH = "s2_mnist_train_dwr_20000.gz"

train_data = load_train_data(TRAIN_PATH)
test_data = load_test_data(TEST_PATH)

# Slicing a dataset with [:] yields a sequence whose entry 0 is used as the
# images and entry 1 as the labels.
train_images, train_labels = train_data[:][0], train_data[:][1]
test_images, test_labels = test_data[:][0], test_data[:][1]


# Build the network and switch it to evaluation mode (eval() returns the module itself).
model = ConvNet(hparams, train_data, test_data)
model = model.eval()
trainer = pl.Trainer(gpus=1, logger=False)

# Restore the weights of the stored checkpoint and remember the score it was saved with.
best_model = torch.load('checkpoints/epoch=17_v6.ckpt')
model.load_state_dict(best_model['state_dict'])
orig_model_acc = best_model['checkpoint_callback_best_model_score']

# Drop the dense part: with Identity in its place the model presumably returns
# the output of the convolutional stack -- confirm against ConvNet.forward.
model.dense = torch.nn.Identity()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


In [3]:
# Display the module tree to check that `dense` has been replaced by Identity.
model

ConvNet(
  (loss_function): CrossEntropyLoss()
  (conv): Sequential(
    (0): Conv2d(1, 12, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(12, 15, kernel_size=(5, 5), stride=(1, 1))
    (3): ReLU()
    (4): Conv2d(15, 16, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): Conv2d(16, 85, kernel_size=(7, 7), stride=(2, 2))
    (7): ReLU()
    (8): Conv2d(85, 141, kernel_size=(5, 5), stride=(1, 1))
    (9): ReLU()
    (10): Conv2d(141, 191, kernel_size=(3, 3), stride=(1, 1))
    (11): ReLU()
    (12): Conv2d(191, 1100, kernel_size=(3, 3), stride=(2, 2))
    (13): ReLU()
  )
  (dense): Identity()
)

In [3]:
# Number of training images and the chunk size used when passing them through the network.
n_samples = len(train_images)
bs = 10000

In [4]:
# Push the training images through the network chunk by chunk.
# - torch.no_grad() stops autograd from recording a graph for every chunk, so the
#   outputs carry no gradient history (as they did after .detach()) and far less
#   memory is needed.
# - The chunks are collected in a list and concatenated once, instead of growing
#   the result with torch.cat inside the loop (which re-copies it every time).
# - range(0, n_samples, bs) also covers a trailing partial chunk, so the features
#   always line up one-to-one with train_labels.
with torch.no_grad():
    train_feature_chunks = [
        model(train_images[start:start + bs])
        for start in tqdm(range(0, n_samples, bs))
    ]

output_conv_train = torch.cat(train_feature_chunks, dim=0)

  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Features of the whole test set in one forward pass. Running under no_grad means
# no autograd graph is built; the result has no gradient history, exactly like a
# detached tensor.
with torch.no_grad():
    output_conv_test = model(test_images)

In [20]:
# Linear classifier that is trained incrementally on the extracted features.
# NOTE(review): no random_state is passed, so results are presumably not
# reproducible between runs -- consider fixing a seed.
clf = SGDClassifier()

In [12]:
# Mini-batch settings for the incremental SGD training.
bs = 100
n_samples = len(train_images)

# Early-stopping settings.
# NOTE(review): the visible call to train_SGD_clf does not pass these three
# values, so the function's own defaults apply there -- confirm this is intended.
max_iterations = 100
patience = 5
eps = 0

# Every epoch walks over n_samples // bs full batches, so bs has to divide n_samples.
assert not n_samples % bs, f'The batch size ({bs}) must be a divisor of number of samples ({n_samples}).'

Also train the classifiers on the test data only (done in the last two code cells below).

In [19]:
def train_SGD_clf(clf, output_conv_train, train_labels, output_conv_test, test_labels, bs,
                  patience=10, eps=0., max_iterations=100):
    """Train ``clf`` with mini-batch ``partial_fit`` and early stopping.

    After every pass over the training features the classifier is scored on
    ``output_conv_test``. Whenever the score beats the best score so far by
    more than ``eps`` the current parameters are remembered; training stops
    after ``patience`` consecutive epochs without such an improvement or after
    ``max_iterations`` epochs. On return ``clf`` carries the best parameters
    seen (it is left untouched if no epoch was run).

    NOTE(review): the data passed as ``output_conv_test`` drives model
    selection, so the best score is an optimistic estimate if that data is
    also the final test set.

    Parameters
    ----------
    clf : estimator providing ``partial_fit``, ``score``, ``coef_`` and ``intercept_``
    output_conv_train, train_labels : training features and labels, sliceable
        along the first axis and of equal length
    output_conv_test, test_labels : features and labels used for scoring
    bs : int, mini-batch size; a trailing partial batch is used as well
    patience : int, number of non-improving epochs tolerated before stopping
    eps : float, minimum improvement that counts as progress
    max_iterations : int, upper bound on the number of epochs

    Returns
    -------
    list of float
        The score after each completed epoch.
    """
    classes = np.unique(train_labels)
    # Use the length of the data actually passed in, not a notebook global.
    n_train = len(output_conv_train)

    accs = []
    max_acc = -np.inf      # guarantees the first epoch is recorded as the best so far
    pat_counter = 0
    best_state = None      # (coef_, intercept_) of the best epoch

    for _ in tqdm(range(max_iterations)):

        for start in range(0, n_train, bs):
            stop = start + bs
            clf.partial_fit(output_conv_train[start:stop], train_labels[start:stop], classes=classes)

        accs.append(clf.score(output_conv_test, test_labels))

        if accs[-1] > max_acc + eps:
            max_acc = accs[-1]
            pat_counter = 0
            # The decision function depends on both arrays, so snapshot both.
            best_state = (clf.coef_.copy(), clf.intercept_.copy())
        else:
            pat_counter += 1
            if pat_counter >= patience:
                break

    # Roll back to the best epoch, whether the loop stopped early or ran out of epochs.
    if best_state is not None:
        clf.coef_ = best_state[0].copy()
        clf.intercept_ = best_state[1].copy()

    return accs

In [21]:
# Train the SGD classifier on the extracted training features and score it on the test
# features after every epoch. patience, eps and max_iterations are not passed here,
# so the defaults from the function signature are used.
accs = train_SGD_clf(clf, output_conv_train, train_labels, output_conv_test, test_labels, bs)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [25]:
# Scores recorded after each training epoch, followed by a blank line.
print(accs, '\n')
# Score of the classifier as returned by train_SGD_clf.
print(clf.score(output_conv_test, test_labels))

[0.9264, 0.9338, 0.9347, 0.9038, 0.9192, 0.8819, 0.9524, 0.9536, 0.9468, 0.9449, 0.9376, 0.9389, 0.9365, 0.9416, 0.9354, 0.9545, 0.92, 0.9462, 0.9401, 0.9435, 0.9384, 0.9443, 0.9341, 0.9418, 0.9263, 0.9401] 

0.9545


In [None]:
# set_params(**params)

In [26]:
# Fit a fresh SGD classifier on the test features and score it on those same
# features, i.e. this measures how well the features can be fitted, not how
# well the classifier generalizes.
# NOTE(review): this rebinds `clf`, replacing the classifier trained earlier.
clf = SGDClassifier().fit(output_conv_test, test_labels)  # fit() returns the estimator
clf_output = torch.Tensor(clf.predict(output_conv_test))
accuracy_score(clf_output, test_labels)

0.9868

In [4]:
# Same check with a LinearSVC: fitted on the test features and scored on those
# same features, so the value is a fitting accuracy rather than a
# generalization estimate.
linSVM = svm.LinearSVC().fit(output_conv_test, test_labels)  # fit() returns the estimator
svm_output = torch.Tensor(linSVM.predict(output_conv_test))
accuracy_score(svm_output, test_labels)



0.9999

|                    | train_samples | test_accuracy |
|--------------------|---------------|---------------|
| svm                | 10000         | 0.9377        |
| SGD                | 10000         | 0.948         |
| svm (loss='hinge') | 10000         | 0.9376        |
| SGD (tol=0.0001)   | 10000         | 0.0199        |
| SGD                | 20000         | 0.9545        |

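SGD fitted on the test data: test_accuracy = 0.9868 (scored on the same data it was fitted on, so this is a training accuracy)
linSVM fitted on the test data: test_accuracy = 0.9999 (scored on the same data it was fitted on, so this is a training accuracy)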

Should I implement standardization before the classifier?

I will use SGDClassifier. With loss='hinge', which is the default, it fits a linear SVM (i.e. one without kernels); because it is trained by stochastic gradient descent, it can be trained in mini-batches.