In [None]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.image as mpimg
import io
import json
import numpy as np
import time
import datetime
import functools
import matplotlib.pyplot as plt
import sys
import os
import random
from sklearn.metrics import accuracy_score, confusion_matrix
from noise_generate import get_noisy_labels

In [None]:
# Network input shape: 28x28 images with a single channel
input_shape = (28, 28, 1)

# Load the train/test split that ships with Keras
(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

# Convert to float32 in [0, 1] and append the channel axis -> (N, 28, 28, 1)
x_train = np.expand_dims(x_train.astype("float32") / 255, -1)
x_test = np.expand_dims(x_test.astype("float32") / 255, -1)

# Report the resulting sizes
print("x_train shape:", x_train.shape)
print(len(x_train), "train samples")
print(len(x_test), "test samples")


x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


In [None]:
# Fractions of training labels to corrupt: 5% to 65% in steps of 5 points.
# Every ratio appears exactly once: the noise loops below name each output file by
# int(noise_ratio*100), so a repeated ratio would regenerate and overwrite the same file.
noise_ratio_list = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]
# Noise mechanisms passed to get_noisy_labels; they correspond to NCAR, NAR and NNAR respectively.
noise_type_list = ['uniform', 'class-dependent', 'locally-concentrated']

In [None]:
# Create the three output directories used by the cells below (sampled indices,
# logits / soft predictions, noisy labels). exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of isdir() followed by mkdir().
for output_dir in ('fashionMNIST_sampled_data',
                   'fashionMNIST_logits_and_preds',
                   'fashionMNIST_noisy_data'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
def sample_class(x_train, y_train, x_test, y_test, train_idxes, test_idxes, classes):
    """Keep only the instances whose label is one of ``classes``.

    The train and test splits are filtered independently. Position ``i`` of
    ``x_*``/``y_*`` is paired with position ``i`` of the matching index list,
    and only the first ``len(idxes)`` positions of each split are examined.

    Args:
        x_train, y_train: training instances and their labels.
        x_test, y_test: test instances and their labels.
        train_idxes, test_idxes: one identifier per position of the
            corresponding split; the identifiers of kept rows are returned.
        classes: collection of labels to keep.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, train_idxes, test_idxes)``
        restricted to the kept rows, in their original order. The first four
        items are NumPy arrays, the last two are Python lists.
    """
    def _keep_matching(instances, labels, idxes):
        # Positions whose label is in the requested class set, in original order.
        kept_positions = [pos for pos in range(len(idxes)) if labels[pos] in classes]
        kept_instances = [instances[pos] for pos in kept_positions]
        kept_labels = [labels[pos] for pos in kept_positions]
        kept_idxes = [idxes[pos] for pos in kept_positions]
        return kept_instances, kept_labels, kept_idxes

    train_x, train_y, train_kept_idxes = _keep_matching(x_train, y_train, train_idxes)
    test_x, test_y, test_kept_idxes = _keep_matching(x_test, y_test, test_idxes)

    return (np.array(train_x), np.array(train_y),
            np.array(test_x), np.array(test_y),
            train_kept_idxes, test_kept_idxes)
  



In [None]:
def sample_dataset(x_train, y_train, x_test, y_test, frac, seed=None):
    """Draw a class-stratified random subset of the train and test splits.

    For every class, ``int(frac * class_size)`` indices are drawn without
    replacement (so the count is rounded down). The drawn indices of all
    classes are then shuffled together, and the instances and labels are
    gathered in that shuffled order.

    Args:
        x_train, y_train: training instances and their 1-D labels.
        x_test, y_test: test instances and their 1-D labels.
        frac: fraction of each class to keep; ``1`` keeps every sample (the
            order is still shuffled).
        seed: optional seed for a private ``random.Random`` generator, making
            the draw reproducible. With the default ``None`` the global
            ``random`` module state is used, as before this parameter existed.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, train_idxes, test_idxes)``:
        four NumPy arrays followed by two lists of positions into the input
        arrays, aligned row by row with the sampled arrays.
    """
    rng = random if seed is None else random.Random(seed)

    def _draw_per_class(labels):
        # Group positions by label, preserving first-seen class order.
        positions_by_class = {}
        for position, label in enumerate(labels):
            positions_by_class.setdefault(label, []).append(position)
        # Sample the same fraction from every class.
        drawn = []
        for positions in positions_by_class.values():
            drawn.extend(rng.sample(positions, int(frac * len(positions))))
        return drawn

    # Draw for both splits first and shuffle afterwards: this keeps the order of
    # random calls (sample train, sample test, shuffle train, shuffle test)
    # identical to the original implementation when the global generator is used.
    train_sampled_idxes = _draw_per_class(y_train)
    test_sampled_idxes = _draw_per_class(y_test)
    rng.shuffle(train_sampled_idxes)
    rng.shuffle(test_sampled_idxes)

    x_train_sampled = np.array([x_train[idx] for idx in train_sampled_idxes])
    y_train_sampled = np.array([y_train[idx] for idx in train_sampled_idxes])
    x_test_sampled = np.array([x_test[idx] for idx in test_sampled_idxes])
    y_test_sampled = np.array([y_test[idx] for idx in test_sampled_idxes])

    return x_train_sampled, y_train_sampled, x_test_sampled, y_test_sampled, train_sampled_idxes, test_sampled_idxes



In [None]:
# Dataset name tag. No other visible cell reads this variable; the cells below
# pass the literal 'fashionMNIST' directly.
dataset = 'fashionMNIST'

**Full Size Dataset**

10 classes

In [None]:
# Settings for the full-size, 10-class run: number of label classes, the size tag
# used in output file names, and an empty class-subset suffix.
num_classes = 10
datasize = 'full'
classStr = ''


# frac=1 keeps every sample; sample_dataset still shuffles the row order and
# returns the shuffled positions as train_full_idx / test_full_idx.
x_train_full, y_train_full, x_test_full, y_test_full, train_full_idx, test_full_idx = sample_dataset(x_train, y_train, x_test, y_test, 1)
# Saving the indices is disabled in this cell. The following cell repeats the same
# sampling call (drawing a fresh shuffle) and does save the indices.
# np.save('fashionMNIST_sampled_data/fashionMNIST_'+ str(num_classes) + 'cls' + classStr + '_train_'+ datasize +'_idx', train_full_idx)
# np.save('fashionMNIST_sampled_data/fashionMNIST_'+ str(num_classes) + 'cls' + classStr + '_test_'+ datasize +'_idx', test_full_idx)


In [None]:
# Full-size run: all 10 classes, every training and test sample.
num_classes = 10
datasize = 'full'
classStr = ''
cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name


# frac=1 keeps every sample but shuffles the order; the index lists record which
# original row each sampled row came from, so they are saved alongside the data.
x_train_full, y_train_full, x_test_full, y_test_full, train_full_idx, test_full_idx = sample_dataset(x_train, y_train, x_test, y_test, 1)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_full_idx)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_full_idx)


# Small CNN. The softmax is a separate final layer so that the output of the
# layer before it (layers[-2]) can be read off as the logits.
model_full = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32),
        layers.Dense(num_classes),
        layers.Activation('softmax')
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_full.summary()
# NOTE(review): batch_size=1 / epochs=1 differ from every other training cell in this
# notebook (batch 56-128, 20 epochs) and look like smoke-test settings — confirm
# before using the saved logits as real results.
batch_size = 1
epochs = 1   # 30
model_full.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
model_full.fit(x_train_full, y_train_full, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Test-set class probabilities and the resulting accuracy.
predict_res_full = model_full.predict(x_test_full)
y_pred_full = np.argmax(predict_res_full, axis=1)
accuracy_full = accuracy_score(y_test_full, y_pred_full)

# Split the trained network into "everything up to the logits" and the softmax layer.
model_penultimate_full = tf.keras.Model(model_full.layers[0].input, model_full.layers[-2].output)
model_last_full = model_full.layers[-1]
# predict() feeds the training set through in mini-batches; calling the model
# directly on the whole array would process all samples as one batch.
logits_train_full = model_penultimate_full.predict(x_train_full, batch_size=256)
soft_pred_train_full = np.asarray(model_last_full(logits_train_full))
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_full)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_full)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_full)


# One noisy copy of the training labels per (noise type, noise ratio) pair.
for noise_type in noise_type_list:
  for noise_ratio in noise_ratio_list:
    y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_full, y_train_full, x_test_full, y_test_full, num_classes, datasize, noise_type, noise_ratio, classStr)
    np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)


**Frac = 0.5**

10 classes

In [None]:
# 50% of the data, all 10 classes.
num_classes = 10
datasize = 'frac5'
classStr = ''
cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name


# Stratified 50% sample of both splits; the index lists map sampled rows back to
# rows of the original arrays.
x_train_frac5, y_train_frac5, x_test_frac5, y_test_frac5, train_frac5_idx, test_frac5_idx = sample_dataset(x_train, y_train, x_test, y_test, 0.5)
# All outputs go to the local fashionMNIST_* directories, which are the only ones
# this notebook creates; the former 'content/drive/MyDrive/result_new/...' targets
# are never created here, so np.save would fail on a fresh run.
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_frac5_idx)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_frac5_idx)



# Small CNN. The softmax is a separate final layer so that the output of the
# layer before it (layers[-2]) can be read off as the logits.
model_frac5 = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32),
        layers.Dense(num_classes),
        layers.Activation('softmax')
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_frac5.summary()
batch_size = 128
epochs = 20   # 30
model_frac5.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
model_frac5.fit(x_train_frac5, y_train_frac5, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Test-set class probabilities and the resulting accuracy.
predict_res_frac5 = model_frac5.predict(x_test_frac5)
y_pred_frac5 = np.argmax(predict_res_frac5, axis=1)
accuracy_frac5 = accuracy_score(y_test_frac5, y_pred_frac5)


# Split the trained network into "everything up to the logits" and the softmax layer.
model_penultimate_frac5 = tf.keras.Model(model_frac5.layers[0].input, model_frac5.layers[-2].output)
model_last_frac5 = model_frac5.layers[-1]
# predict() feeds the training set through in mini-batches; calling the model
# directly on the whole array would process all samples as one batch.
logits_train_frac5 = model_penultimate_frac5.predict(x_train_frac5, batch_size=256)
soft_pred_train_frac5 = np.asarray(model_last_frac5(logits_train_frac5))
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_frac5)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_frac5)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_frac5)


# One noisy copy of the training labels per (noise type, noise ratio) pair.
for noise_type in noise_type_list:
  for noise_ratio in noise_ratio_list:
    y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_frac5, y_train_frac5, x_test_frac5, y_test_frac5, num_classes, datasize, noise_type, noise_ratio, classStr)
    np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)

**Frac = 0.4 and 0.3**

5 classes 

In [None]:
# Network input shape: 28x28 images with a single channel
input_shape = (28, 28, 1)

# Load the train/test split again, replacing any earlier x_train / y_train / x_test / y_test
(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

# Convert to float32 in [0, 1] and append the channel axis -> (N, 28, 28, 1)
x_train = np.expand_dims(x_train.astype("float32") / 255, -1)
x_test = np.expand_dims(x_test.astype("float32") / 255, -1)

# Report the resulting sizes
print("x_train shape:", x_train.shape)
print(len(x_train), "train samples")
print(len(x_test), "test samples")


x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


In [None]:
# Sanity check: compare one saved noisy-label file with the clean labels of the
# same sampled rows and print the fraction of labels that differ.
# NOTE(review): no cell visible in this notebook writes a 10-class 'frac3' index
# or noisy-label file — confirm where these two inputs come from.
classes = [0,1,2,3,4,5,6,7,8,9]
# class id -> encoded label (position of the class in `classes`)
label_map = {class_id: position for position, class_id in enumerate(classes)}

y_true_idx = np.load('fashionMNIST_sampled_data/fashionMNIST_10cls_train_frac3_idx.npy')
y_true = y_train[y_true_idx]   # index-array selection yields a copy, so y_train is not modified below
y_noise = np.load('fashionMNIST_noisy_data/' + 'fashionMNIST' + '_10cls' + '_frac3' + '_' + 'uniform' + '_65.npy')

# encode the clean labels in place
for position in range(y_true.shape[0]):
  y_true[position] = label_map[y_true[position]]

# count positions where the noisy label disagrees with the clean one
cnt = sum(1 for position in range(len(y_true)) if y_true[position] != y_noise[position])
print(cnt/len(y_true))

0.65


In [None]:
num_classes = 5

# Five-class subsets to evaluate; classStr_list[k] is the file-name suffix for classes_list[k].
classes_list = [[0,2,4,6,8], [0,3,5,6,8], [1,2,4,7,9], [1,3,4,7,8], [1,3,5,7,9]]
classStr_list = ['02468', '03568', '12479', '13478', '13579']

# Size tag used in file names, paired with the per-class fraction drawn by
# sample_dataset before the class filter is applied.
datasize_list = ['frac4', 'frac3']
class_frac_list = [4/5, 3/5]

batch_size = 128
epochs = 20   # 30

# Every loop below uses its own variable name. The original cell reused `i` for the
# inner label loops, which only worked because `classes` / `classStr` were read
# before `i` was overwritten.
for classes, classStr in zip(classes_list, classStr_list):
  cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name

  for datasize, class_frac in zip(datasize_list, class_frac_list):
    # Draw class_frac of every class, then keep only the rows of the chosen classes.
    # NOTE: the *_full and *_frac5_5cls variable names are kept from the original
    # cell; here they hold the class_frac sample and its five-class subset.
    x_train_full, y_train_full, x_test_full, y_test_full, train_full_idx, test_full_idx = sample_dataset(x_train, y_train, x_test, y_test, class_frac)
    x_train_frac5_5cls, y_train_frac5_5cls, x_test_frac5_5cls, y_test_frac5_5cls, train_frac5_5cls_idx, test_frac5_5cls_idx = sample_class(x_train_full, y_train_full, x_test_full, y_test_full, train_full_idx, test_full_idx, classes)
    np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_frac5_5cls_idx)
    np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_frac5_5cls_idx)

    # encode the labels: original class id -> position in `classes` (0 .. num_classes-1)
    label_map = {class_id: position for position, class_id in enumerate(classes)}
    y_train_frac5_5cls = np.array([label_map[label] for label in y_train_frac5_5cls], dtype=y_train_frac5_5cls.dtype)
    y_test_frac5_5cls = np.array([label_map[label] for label in y_test_frac5_5cls], dtype=y_test_frac5_5cls.dtype)

    # Small CNN. The softmax is a separate final layer so that the output of the
    # layer before it (layers[-2]) can be read off as the logits.
    model_frac5_5cls = keras.Sequential(
        [
            keras.Input(shape=input_shape),
            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Flatten(),
            layers.Dense(32),
            layers.Dense(num_classes),
            layers.Activation('softmax')
        ]
    )

    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model_frac5_5cls.summary()
    model_frac5_5cls.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
    model_frac5_5cls.fit(x_train_frac5_5cls, y_train_frac5_5cls, batch_size=batch_size, epochs=epochs, validation_split=0.2)

    # Test-set class probabilities and the resulting accuracy.
    predict_res_frac5_5cls = model_frac5_5cls.predict(x_test_frac5_5cls)
    y_pred_frac5_5cls = np.argmax(predict_res_frac5_5cls, axis=1)
    accuracy_frac5_5cls = accuracy_score(y_test_frac5_5cls, y_pred_frac5_5cls)

    # Split the trained network into "everything up to the logits" and the softmax layer.
    model_penultimate_frac5_5cls = tf.keras.Model(model_frac5_5cls.layers[0].input, model_frac5_5cls.layers[-2].output)
    model_last_frac5_5cls = model_frac5_5cls.layers[-1]
    # predict() feeds the training set through in mini-batches; calling the model
    # directly on the whole array would process all samples as one batch.
    logits_train_frac5_5cls = model_penultimate_frac5_5cls.predict(x_train_frac5_5cls, batch_size=256)
    soft_pred_train_frac5_5cls = np.asarray(model_last_frac5_5cls(logits_train_frac5_5cls))
    np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_frac5_5cls)
    np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_frac5_5cls)
    np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_frac5_5cls)

    # One noisy copy of the training labels per (noise type, noise ratio) pair.
    for noise_type in noise_type_list:
      for noise_ratio in noise_ratio_list:
        y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_frac5_5cls, y_train_frac5_5cls, x_test_frac5_5cls, y_test_frac5_5cls, num_classes, datasize, noise_type, noise_ratio, classStr)
        np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)

**Frac = 0.2**


10 classes

In [None]:
# Run settings read by the next cell: class count, size tag for file names,
# and the class-subset suffix (empty here).
num_classes, datasize, classStr = 10, 'frac2', ''

In [None]:
# Reads num_classes, datasize and classStr from the settings cell above.
cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name

# Stratified 20% sample of both splits; the index lists map sampled rows back to
# rows of the original arrays.
x_train_frac2, y_train_frac2, x_test_frac2, y_test_frac2, train_frac2_idx, test_frac2_idx = sample_dataset(x_train, y_train, x_test, y_test, 0.2)
# Saved to the local directory like the logits and noisy labels below; the former
# 'content/drive/MyDrive/result_new/...' target is never created by this notebook.
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_frac2_idx)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_frac2_idx)


# Small CNN. The softmax is a separate final layer so that the output of the
# layer before it (layers[-2]) can be read off as the logits.
model_frac2 = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32),
        layers.Dense(num_classes),
        layers.Activation('softmax')
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_frac2.summary()
batch_size = 56
epochs = 20   # 30
model_frac2.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
model_frac2.fit(x_train_frac2, y_train_frac2, batch_size=batch_size, epochs=epochs, validation_split=0.2)


# Test-set class probabilities and the resulting accuracy.
predict_res_frac2 = model_frac2.predict(x_test_frac2)
y_pred_frac2 = np.argmax(predict_res_frac2, axis=1)
accuracy_frac2 = accuracy_score(y_test_frac2, y_pred_frac2)


# Split the trained network into "everything up to the logits" and the softmax layer.
model_penultimate_frac2 = tf.keras.Model(model_frac2.layers[0].input, model_frac2.layers[-2].output)
model_last_frac2 = model_frac2.layers[-1]
# predict() feeds the training set through in mini-batches; calling the model
# directly on the whole array would process all samples as one batch.
logits_train_frac2 = model_penultimate_frac2.predict(x_train_frac2, batch_size=256)
soft_pred_train_frac2 = np.asarray(model_last_frac2(logits_train_frac2))
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_frac2)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_frac2)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_frac2)


# One noisy copy of the training labels per (noise type, noise ratio) pair.
for noise_type in noise_type_list:
  for noise_ratio in noise_ratio_list:
    y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_frac2, y_train_frac2, x_test_frac2, y_test_frac2, num_classes, datasize, noise_type, noise_ratio, classStr)
    np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)


5 classes 

In [None]:
# Run settings read by the next cell. The file-name suffix and the class count are
# derived from the class list so the three values cannot drift apart.
classes = [0,2,4,6,8] # alternatives: [0,3,5,6,8], [1,2,4,7,9], [1,3,4,7,8], [1,3,5,7,9]
classStr = ''.join(str(class_id) for class_id in classes) # '02468'; alternatives: '03568', '12479', '13478', '13579'
num_classes = len(classes)
datasize = 'frac2'

In [None]:
# Reads num_classes, datasize, classStr and classes from the settings cell above.
cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name

# Draw 40% of every class, then keep only the rows of the chosen classes.
x_train_frac4, y_train_frac4, x_test_frac4, y_test_frac4, train_frac4_idx, test_frac4_idx = sample_dataset(x_train, y_train, x_test, y_test, 0.4)
x_train_frac2_5cls, y_train_frac2_5cls, x_test_frac2_5cls, y_test_frac2_5cls, train_frac2_5cls_idx, test_frac2_5cls_idx = sample_class(x_train_frac4, y_train_frac4, x_test_frac4, y_test_frac4, train_frac4_idx, test_frac4_idx, classes)
# Saved to the local directory like the logits and noisy labels below; the former
# 'content/drive/MyDrive/result_new/...' target is never created by this notebook.
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_frac2_5cls_idx)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_frac2_5cls_idx)
# encode the labels: original class id -> position in `classes` (0 .. num_classes-1)
label_map = {class_id: position for position, class_id in enumerate(classes)}
y_train_frac2_5cls = np.array([label_map[label] for label in y_train_frac2_5cls], dtype=y_train_frac2_5cls.dtype)
y_test_frac2_5cls = np.array([label_map[label] for label in y_test_frac2_5cls], dtype=y_test_frac2_5cls.dtype)


# Small CNN. The softmax is a separate final layer so that the output of the
# layer before it (layers[-2]) can be read off as the logits.
model_frac2_5cls = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32),
        layers.Dense(num_classes),
        layers.Activation('softmax')
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_frac2_5cls.summary()
batch_size = 128
epochs = 20   # 30
model_frac2_5cls.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
model_frac2_5cls.fit(x_train_frac2_5cls, y_train_frac2_5cls, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Test-set class probabilities and the resulting accuracy.
predict_res_frac2_5cls = model_frac2_5cls.predict(x_test_frac2_5cls)
y_pred_frac2_5cls = np.argmax(predict_res_frac2_5cls, axis=1)
accuracy_frac2_5cls = accuracy_score(y_test_frac2_5cls, y_pred_frac2_5cls)


# Split the trained network into "everything up to the logits" and the softmax layer.
model_penultimate_frac2_5cls = tf.keras.Model(model_frac2_5cls.layers[0].input, model_frac2_5cls.layers[-2].output)
model_last_frac2_5cls = model_frac2_5cls.layers[-1]
# predict() feeds the training set through in mini-batches; calling the model
# directly on the whole array would process all samples as one batch.
logits_train_frac2_5cls = model_penultimate_frac2_5cls.predict(x_train_frac2_5cls, batch_size=256)
soft_pred_train_frac2_5cls = np.asarray(model_last_frac2_5cls(logits_train_frac2_5cls))
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_frac2_5cls)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_frac2_5cls)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_frac2_5cls)

# One noisy copy of the training labels per (noise type, noise ratio) pair.
for noise_type in noise_type_list:
  for noise_ratio in noise_ratio_list:
    y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_frac2_5cls, y_train_frac2_5cls, x_test_frac2_5cls, y_test_frac2_5cls, num_classes, datasize, noise_type, noise_ratio, classStr)
    np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)

2 classes

In [None]:
# Run settings read by the next cell. The file-name suffix and the class count are
# derived from the class list so the three values cannot drift apart.
classes = [2,4] # alternatives: [2,6], [4,6], [0,6], [1,9], [4,7], [4,5], [0,7]
classStr = ''.join(str(class_id) for class_id in classes) # '24'; alternatives: '26', '46', '06', '19', '47', '45', '07'
num_classes = len(classes)
datasize = 'frac2'

In [None]:
# Reads num_classes, datasize, classStr and classes from the settings cell above.
cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name

# Take every sample (frac=1, shuffled), then keep only the rows of the chosen classes.
x_train_full, y_train_full, x_test_full, y_test_full, train_full_idx, test_full_idx = sample_dataset(x_train, y_train, x_test, y_test, 1)
x_train_frac2_2cls, y_train_frac2_2cls, x_test_frac2_2cls, y_test_frac2_2cls, train_frac2_2cls_idx, test_frac2_2cls_idx = sample_class(x_train_full, y_train_full, x_test_full, y_test_full, train_full_idx, test_full_idx, classes)
# Saved to the local directory like the logits and noisy labels below; the former
# 'content/drive/MyDrive/result_new/...' target is never created by this notebook.
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_frac2_2cls_idx)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_frac2_2cls_idx)
# encode the labels: original class id -> position in `classes` (0 .. num_classes-1)
label_map = {class_id: position for position, class_id in enumerate(classes)}
y_train_frac2_2cls = np.array([label_map[label] for label in y_train_frac2_2cls], dtype=y_train_frac2_2cls.dtype)
y_test_frac2_2cls = np.array([label_map[label] for label in y_test_frac2_2cls], dtype=y_test_frac2_2cls.dtype)


# Small CNN. The softmax is a separate final layer so that the output of the
# layer before it (layers[-2]) can be read off as the logits.
model_frac2_2cls = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32),
        layers.Dense(num_classes),
        layers.Activation('softmax')
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_frac2_2cls.summary()
batch_size = 128
epochs = 20   # 30
model_frac2_2cls.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
model_frac2_2cls.fit(x_train_frac2_2cls, y_train_frac2_2cls, batch_size=batch_size, epochs=epochs, validation_split=0.2)


# Test-set class probabilities and the resulting accuracy.
predict_res_frac2_2cls = model_frac2_2cls.predict(x_test_frac2_2cls)
y_pred_frac2_2cls = np.argmax(predict_res_frac2_2cls, axis=1)
accuracy_frac2_2cls = accuracy_score(y_test_frac2_2cls, y_pred_frac2_2cls)


# Split the trained network into "everything up to the logits" and the softmax layer.
model_penultimate_frac2_2cls = tf.keras.Model(model_frac2_2cls.layers[0].input, model_frac2_2cls.layers[-2].output)
model_last_frac2_2cls = model_frac2_2cls.layers[-1]
# predict() feeds the training set through in mini-batches; calling the model
# directly on the whole array would process all samples as one batch.
logits_train_frac2_2cls = model_penultimate_frac2_2cls.predict(x_train_frac2_2cls, batch_size=256)
soft_pred_train_frac2_2cls = np.asarray(model_last_frac2_2cls(logits_train_frac2_2cls))
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_frac2_2cls)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_frac2_2cls)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_frac2_2cls)


# One noisy copy of the training labels per (noise type, noise ratio) pair.
for noise_type in noise_type_list:
  for noise_ratio in noise_ratio_list:
    y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_frac2_2cls, y_train_frac2_2cls, x_test_frac2_2cls, y_test_frac2_2cls, num_classes, datasize, noise_type, noise_ratio, classStr)
    np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)

**Frac = 0.1**

10 classes

In [None]:
# Run settings read by the next cell: class count, size tag for file names,
# and the class-subset suffix (empty here).
num_classes, datasize, classStr = 10, 'frac1', ''

In [None]:
# Reads num_classes, datasize and classStr from the settings cell above.
cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name

# Stratified 10% sample of both splits; the index lists map sampled rows back to
# rows of the original arrays.
x_train_frac1, y_train_frac1, x_test_frac1, y_test_frac1, train_frac1_idx, test_frac1_idx = sample_dataset(x_train, y_train, x_test, y_test, 0.1)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_frac1_idx)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_frac1_idx)



# Small CNN. The softmax is a separate final layer so that the output of the
# layer before it (layers[-2]) can be read off as the logits.
model_frac1 = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32),
        layers.Dense(num_classes),
        layers.Activation('softmax')
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_frac1.summary()
batch_size = 56
epochs = 20   # 30
model_frac1.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
model_frac1.fit(x_train_frac1, y_train_frac1, batch_size=batch_size, epochs=epochs, validation_split=0.2)


# Test-set class probabilities and the resulting accuracy.
predict_res_frac1 = model_frac1.predict(x_test_frac1)
y_pred_frac1 = np.argmax(predict_res_frac1, axis=1)
accuracy_frac1 = accuracy_score(y_test_frac1, y_pred_frac1)

# Split the trained network into "everything up to the logits" and the softmax layer.
model_penultimate_frac1 = tf.keras.Model(model_frac1.layers[0].input, model_frac1.layers[-2].output)
model_last_frac1 = model_frac1.layers[-1]
# predict() feeds the training set through in mini-batches; calling the model
# directly on the whole array would process all samples as one batch.
logits_train_frac1 = model_penultimate_frac1.predict(x_train_frac1, batch_size=256)
soft_pred_train_frac1 = np.asarray(model_last_frac1(logits_train_frac1))
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_frac1)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_frac1)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_frac1)


# One noisy copy of the training labels per (noise type, noise ratio) pair.
# The file name now includes classStr like every other cell; with classStr == ''
# the resulting name is unchanged.
for noise_type in noise_type_list:
  for noise_ratio in noise_ratio_list:
    y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_frac1, y_train_frac1, x_test_frac1, y_test_frac1, num_classes, datasize, noise_type, noise_ratio, classStr)
    np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)


5 classes 

In [None]:
# Run settings read by the next cell. The file-name suffix and the class count are
# derived from the class list so the three values cannot drift apart.
classes = [0,2,4,6,8] # alternatives: [0,3,5,6,8], [1,2,4,7,9], [1,3,4,7,8], [1,3,5,7,9]
classStr = ''.join(str(class_id) for class_id in classes) # '02468'; alternatives: '03568', '12479', '13478', '13579'
num_classes = len(classes)
datasize = 'frac1'

In [None]:
# Reads num_classes, datasize, classStr and classes from the settings cell above.
cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name

# Draw 20% of every class, then keep only the rows of the chosen classes.
# NOTE: this rebinds the x_train_frac2 / y_train_frac2 / ... names to a fresh sample.
x_train_frac2, y_train_frac2, x_test_frac2, y_test_frac2, train_frac2_idx, test_frac2_idx = sample_dataset(x_train, y_train, x_test, y_test, 0.2)
x_train_frac1_5cls, y_train_frac1_5cls, x_test_frac1_5cls, y_test_frac1_5cls, train_frac1_5cls_idx, test_frac1_5cls_idx = sample_class(x_train_frac2, y_train_frac2, x_test_frac2, y_test_frac2, train_frac2_idx, test_frac2_idx, classes)
# Saved to the local directory like the logits and noisy labels below; the former
# 'content/drive/MyDrive/result_new/...' target is never created by this notebook.
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_frac1_5cls_idx)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_frac1_5cls_idx)
# encode the labels: original class id -> position in `classes` (0 .. num_classes-1)
label_map = {class_id: position for position, class_id in enumerate(classes)}
y_train_frac1_5cls = np.array([label_map[label] for label in y_train_frac1_5cls], dtype=y_train_frac1_5cls.dtype)
y_test_frac1_5cls = np.array([label_map[label] for label in y_test_frac1_5cls], dtype=y_test_frac1_5cls.dtype)


# Small CNN. The softmax is a separate final layer so that the output of the
# layer before it (layers[-2]) can be read off as the logits.
model_frac1_5cls = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32),
        layers.Dense(num_classes),
        layers.Activation('softmax')
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_frac1_5cls.summary()
batch_size = 128
epochs = 20   # 30
model_frac1_5cls.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
model_frac1_5cls.fit(x_train_frac1_5cls, y_train_frac1_5cls, batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Test-set class probabilities and the resulting accuracy.
predict_res_frac1_5cls = model_frac1_5cls.predict(x_test_frac1_5cls)
y_pred_frac1_5cls = np.argmax(predict_res_frac1_5cls, axis=1)
accuracy_frac1_5cls = accuracy_score(y_test_frac1_5cls, y_pred_frac1_5cls)


# Split the trained network into "everything up to the logits" and the softmax layer.
model_penultimate_frac1_5cls = tf.keras.Model(model_frac1_5cls.layers[0].input, model_frac1_5cls.layers[-2].output)
model_last_frac1_5cls = model_frac1_5cls.layers[-1]
# predict() feeds the training set through in mini-batches; calling the model
# directly on the whole array would process all samples as one batch.
logits_train_frac1_5cls = model_penultimate_frac1_5cls.predict(x_train_frac1_5cls, batch_size=256)
soft_pred_train_frac1_5cls = np.asarray(model_last_frac1_5cls(logits_train_frac1_5cls))
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_frac1_5cls)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_frac1_5cls)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_frac1_5cls)


# One noisy copy of the training labels per (noise type, noise ratio) pair.
for noise_type in noise_type_list:
  for noise_ratio in noise_ratio_list:
    y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_frac1_5cls, y_train_frac1_5cls, x_test_frac1_5cls, y_test_frac1_5cls, num_classes, datasize, noise_type, noise_ratio, classStr)
    np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)



2 classes  

In [None]:
# Run settings read by the next cell. The file-name suffix and the class count are
# derived from the class list so the three values cannot drift apart.
classes = [2,4] # alternatives: [2,6], [4,6], [0,6], [1,9], [4,7], [4,5], [0,7]
classStr = ''.join(str(class_id) for class_id in classes) # '24'; alternatives: '26', '46', '06', '19', '47', '45', '07'
num_classes = len(classes)
datasize = 'frac1'

In [None]:
# Reads num_classes, datasize, classStr and classes from the settings cell above.
cls_tag = str(num_classes) + 'cls' + classStr   # class part shared by every output file name

# Draw 50% of every class, then keep only the rows of the chosen classes.
# NOTE: this rebinds the x_train_frac5 / y_train_frac5 / ... names to a fresh sample.
x_train_frac5, y_train_frac5, x_test_frac5, y_test_frac5, train_frac5_idx, test_frac5_idx = sample_dataset(x_train, y_train, x_test, y_test, 0.5)
x_train_frac1_2cls, y_train_frac1_2cls, x_test_frac1_2cls, y_test_frac1_2cls, train_frac1_2cls_idx, test_frac1_2cls_idx = sample_class(x_train_frac5, y_train_frac5, x_test_frac5, y_test_frac5, train_frac5_idx, test_frac5_idx, classes)
# Saved to the local directory like the logits and noisy labels below; the former
# 'content/drive/MyDrive/result_new/...' target is never created by this notebook.
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_train_' + datasize + '_idx', train_frac1_2cls_idx)
np.save('fashionMNIST_sampled_data/fashionMNIST_' + cls_tag + '_test_' + datasize + '_idx', test_frac1_2cls_idx)
# encode the labels: original class id -> position in `classes` (0 .. num_classes-1)
label_map = {class_id: position for position, class_id in enumerate(classes)}
y_train_frac1_2cls = np.array([label_map[label] for label in y_train_frac1_2cls], dtype=y_train_frac1_2cls.dtype)
y_test_frac1_2cls = np.array([label_map[label] for label in y_test_frac1_2cls], dtype=y_test_frac1_2cls.dtype)


# Small CNN. The softmax is a separate final layer so that the output of the
# layer before it (layers[-2]) can be read off as the logits.
model_frac1_2cls = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dense(32),
        layers.Dense(num_classes),
        layers.Activation('softmax')
    ]
)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_frac1_2cls.summary()
batch_size = 128
epochs = 20   # 30
model_frac1_2cls.compile(loss=tf.keras.losses.sparse_categorical_crossentropy, optimizer="adam", metrics=["accuracy"])
model_frac1_2cls.fit(x_train_frac1_2cls, y_train_frac1_2cls, batch_size=batch_size, epochs=epochs, validation_split=0.2)


# Test-set class probabilities and the resulting accuracy.
predict_res_frac1_2cls = model_frac1_2cls.predict(x_test_frac1_2cls)
y_pred_frac1_2cls = np.argmax(predict_res_frac1_2cls, axis=1)
accuracy_frac1_2cls = accuracy_score(y_test_frac1_2cls, y_pred_frac1_2cls)


# Split the trained network into "everything up to the logits" and the softmax layer.
model_penultimate_frac1_2cls = tf.keras.Model(model_frac1_2cls.layers[0].input, model_frac1_2cls.layers[-2].output)
model_last_frac1_2cls = model_frac1_2cls.layers[-1]
# predict() feeds the training set through in mini-batches; calling the model
# directly on the whole array would process all samples as one batch.
logits_train_frac1_2cls = model_penultimate_frac1_2cls.predict(x_train_frac1_2cls, batch_size=256)
soft_pred_train_frac1_2cls = np.asarray(model_last_frac1_2cls(logits_train_frac1_2cls))
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_logits_train', logits_train_frac1_2cls)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_train', soft_pred_train_frac1_2cls)
np.save('fashionMNIST_logits_and_preds/fashionMNIST_' + datasize + '_' + cls_tag + '_soft_pred_test', predict_res_frac1_2cls)


# One noisy copy of the training labels per (noise type, noise ratio) pair.
for noise_type in noise_type_list:
  for noise_ratio in noise_ratio_list:
    y_train_noisy, probs = get_noisy_labels('fashionMNIST', x_train_frac1_2cls, y_train_frac1_2cls, x_test_frac1_2cls, y_test_frac1_2cls, num_classes, datasize, noise_type, noise_ratio, classStr)
    np.save('fashionMNIST_noisy_data/fashionMNIST_' + cls_tag + '_' + datasize + '_' + noise_type + '_' + str(int(noise_ratio*100)), y_train_noisy)
