
This notebook is intended to explore various pseudolabelling schemes. Validation, model, data isn't hugely important here, so the cells are collapsed. Keep in mind this is a minimal example without much of the techniques discussed, and this dataset is very basic - adding augmentations, stochastic depth, etc. during training would result in better results. This is intended to be a minimal code example.

Reproducibility
To make this as fair a comparison as possible, I have seeded random weights and all pseudolabels are produced from the same set of weights.

In [12]:
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
    
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from keras import  backend as K
import tensorflow as tf
from sklearn.metrics import accuracy_score

import os
import random

Load Train and Test data and cross validation

In [13]:
def standardize(x): 
    return (x-mean_px)/std_px

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

train = pd.read_csv("digit-recognizer_train.csv")
test= pd.read_csv("digit-recognizer_test.csv")
X_train = (train.iloc[:,1:].values).astype('float32') # all pixel values
y_train = train.iloc[:,0].values.astype('int32') # only labels i.e targets digits
X_test = test.values.astype('float32')
mean_px = X_train.mean().astype(np.float32)
std_px = X_train.std().astype(np.float32) 

y_train = tf.keras.utils.to_categorical(y_train)
num_classes = y_train.shape[1]

# fix random seed for reproducibility
seed_everything(seed=42)

X_test = X_test.reshape(X_test.shape[0], 28, 28,1)
X_train = X_train.reshape(X_train.shape[0], 28, 28,1)

# cross validation
X = X_train
y = y_train
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=42)

mnist_test = pd.read_csv("mnist_test.csv")
mnist_train = pd.read_csv("mnist_train.csv")
ground_truth = pd.read_csv("digit-recognizer_sample_submission.csv")

cols = test.columns

test['dataset'] = 'test'

train['dataset'] = 'train'

dataset = pd.concat([train.drop('label', axis=1), test]).reset_index()

mnist = pd.concat([mnist_train, mnist_test]).reset_index(drop=True)
labels = mnist['label'].values
mnist.drop('label', axis=1, inplace=True)
mnist.columns = cols

idx_mnist = mnist.sort_values(by=list(mnist.columns)).index
dataset_from = dataset.sort_values(by=list(mnist.columns))['dataset'].values
original_idx = dataset.sort_values(by=list(mnist.columns))['index'].values

for i in range(len(idx_mnist)):
    if dataset_from[i] == 'test':
        ground_truth.loc[original_idx[i], 'Label'] = labels[idx_mnist[i]]
        
def get_test_acc(model):
    predictions = model.predict(X_test, verbose=0)
    predictions = np.argmax(predictions,axis=1)

    submissions = pd.DataFrame({"ImageId": list(range(1,len(predictions)+1)),
                                "Label": predictions})
    return accuracy_score(ground_truth['Label'].values, submissions['Label'].values)

In [14]:
verbosity = 0

def get_model():
    input_1 = tf.keras.layers.Input((28,28,1))
    x = tf.keras.layers.Lambda(standardize)(input_1)
    x = tf.keras.layers.Convolution2D(32,(3,3), activation='relu')(x)
    x = tf.keras.layers.BatchNormalization(axis=1)(x)
    x = tf.keras.layers.Convolution2D(32,(3,3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D()(x)
    x = tf.keras.layers.BatchNormalization(axis=1)(x)
    x = tf.keras.layers.Convolution2D(64,(3,3), activation='relu')(x)
    x = tf.keras.layers.BatchNormalization(axis=1)(x)
    x = tf.keras.layers.Convolution2D(64,(3,3), activation='relu')(x)
    x = tf.keras.layers.MaxPooling2D()(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(512, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    out = tf.keras.layers.Dense(10, activation='softmax')(x)
    model = tf.keras.Model(inputs=input_1, outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

No pseudolabelling baseline

In [15]:
ckp = tf.keras.callbacks.ModelCheckpoint(f'baseline.hdf5', monitor='val_accuracy', verbose=0,
                                         save_best_only=True, save_weights_only=True, mode='max')
model = get_model()
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32, verbose=verbosity, callbacks=[ckp])

model.load_weights('baseline.hdf5') # load best weights
no_pseudo_acc = get_test_acc(model)
print(f"No pseudolabelling accuracy: {format(no_pseudo_acc, '.5g')}")



No pseudolabelling accuracy: 0.99218


Self training
First, we train on the labelled data, then produce pseudolabels and finetune on the pseudolabels.

In [16]:
model = get_model()
model.load_weights('baseline.hdf5')

pseudolabels = model.predict(X_test, verbose=0) # create our pseudolabels
pseudolabels = np.argmax(pseudolabels,axis=1) # convert probabilities into classes
pseudolabels = tf.keras.utils.to_categorical(pseudolabels) 

model.optimizer.lr = 1e-4 # reduce learning rate since we are finetuning

ckp = tf.keras.callbacks.ModelCheckpoint(f'selftrain.hdf5', monitor='val_accuracy', verbose=0, save_best_only=True,
                                         save_weights_only=True, mode='max')
model.fit(X_test, pseudolabels, validation_data=(X_val, y_val), epochs=10, batch_size=32,
          verbose=verbosity, callbacks=[ckp])

model.load_weights('selftrain.hdf5') # load best weights
self_train_acc = get_test_acc(model)
print(f"Self training accuracy: {format(self_train_acc, '.5g')}")



Self training accuracy: 0.99218


Simultaneous training
First, we train on the labelled data, then initialize a new model and train with labelled data and pseudolabels simultaneously.

In [6]:
model = get_model()
model.load_weights('baseline.hdf5')

pseudolabels = model.predict(X_test, verbose=0) # create our pseudolabels
pseudolabels = np.argmax(pseudolabels,axis=1) # convert probabilities into classes
pseudolabels = tf.keras.utils.to_categorical(pseudolabels) 
y_combined = np.concatenate([pseudolabels, y_train]) # combine our pseudolabels with labelled data
X_combined = np.concatenate([X_test, X_train]) 



In [7]:
ckp = tf.keras.callbacks.ModelCheckpoint('simultaneous_train.hdf5', monitor='val_accuracy', verbose=0, save_best_only=True, save_weights_only=True, mode='max')

model = get_model() # reinitialize model
model.fit(X_combined, y_combined, validation_data=(X_val, y_val), epochs=20, batch_size=32, callbacks=[ckp], verbose=verbosity) # train a new model on all data together

model.load_weights('simultaneous_train.hdf5') # load best weights

simultaneous_acc = get_test_acc(model) # get test accuracy
print(f"Simultaneous training accuracy: {format(simultaneous_acc, '.5g')}")



Simultaneous training accuracy: 0.99207


Pretraining
First, we train on labelled data, then we create pseudolabels.

Next, we initialize a new model and train it on

In [8]:
model = get_model()
model.load_weights('baseline.hdf5')

pseudolabels = model.predict(X_test, verbose=0) # create our pseudolabels
pseudolabels = np.argmax(pseudolabels,axis=1) # convert probabilities into classes
pseudolabels = tf.keras.utils.to_categorical(pseudolabels) 



In [9]:
ckp = tf.keras.callbacks.ModelCheckpoint('pretrain.hdf5', monitor='val_accuracy', verbose=0, save_best_only=True, save_weights_only=True, mode='max')

print("Pretrain on pseudolabels")
model = get_model() # reinitialize model
model.fit(X_test, pseudolabels, validation_data=(X_val, y_val), epochs=15, batch_size=32, callbacks=[ckp], verbose=verbosity) # first train on pseudolabels only

print("Finetune on labelled data")
model.optimizer.lr = 1e-4 # reduce learning rate since we are finetuning
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32, callbacks=[ckp], verbose=verbosity) # finetune on labelled data

model.load_weights('pretrain.hdf5') # load best weights
pretrain_acc = get_test_acc(model) # get test accuracy
print(f"Pretraining accuracy: {pretrain_acc}")

Pretrain on pseudolabels




Finetune on labelled data
Pretraining accuracy: 0.9923928571428572


Conclusion
In my talk, I explained the use cases for various pseudolabelling methods. Even though MNIST is not a particularly complex dataset and it's not very fit for pseudolabelling, we still see an improvement over baseline. MNIST's test set is only about half the size of the train set.

In [10]:
print(f"No pseudolabelling accuracy: {no_pseudo_acc}")
print(f"Self training accuracy: {self_train_acc}")
print(f"Simultaneous training accuracy: {simultaneous_acc}")
print(f"Pretraining accuracy: {pretrain_acc}")

print("-"*30)

print(f"Percent difference from no pseudolabelling to self training: {100*(self_train_acc-no_pseudo_acc)/no_pseudo_acc}%")
print(f"Percent difference from self training to simultaneous training: {100*(simultaneous_acc-self_train_acc)/self_train_acc}%")
print(f"Percent difference from simultaneous training to pretraining: {100*(pretrain_acc-simultaneous_acc)/simultaneous_acc}%")

print("-"*30)

print(f"Percent difference from no pseudolabelling to pretraining: {100*(pretrain_acc-no_pseudo_acc)/no_pseudo_acc}%" )


No pseudolabelling accuracy: 0.9921785714285715
Self training accuracy: 0.9921785714285715
Simultaneous training accuracy: 0.9920714285714286
Pretraining accuracy: 0.9923928571428572
------------------------------
Percent difference from no pseudolabelling to self training: 0.0%
Percent difference from self training to simultaneous training: -0.010798747345308352%
Percent difference from simultaneous training to pretraining: 0.03239974080207481%
------------------------------
Percent difference from no pseudolabelling to pretraining: 0.021597494690616705%
