# 📚 **Import Libraries**

In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
import tensorflow_datasets as tfds
from matplotlib.pyplot import figure
from tensorflow.keras.layers.experimental import preprocessing
import seaborn as sns
from keras.datasets import mnist
import os.path
import gzip
import lzma
import torch
import codecs

QMNIST是MNIST的扩展，额外提供了50000个测试数据，包括6万个训练数据，6万个测试数据，总共有120000个数据。QMNIST的训练数据和MNIST的训练数据是一致的，测试数据前10000条和MNIST的测试数据是一致的。
不过，kaggle上提供的[QMNIST数据](https://www.kaggle.com/fedesoriano/qmnist-the-extended-mnist-dataset-120k-images)是整体120000个数据，并没有区分为训练数据和测试数据，而且，Digit Recognizer提供的MNIST训练数据为42000条，测试数据为28000条。使用QMNIST额外数据来提升训练结果，需要将Digit Recognizer的测试数据从QMNIST中剔除。 但是kaggle QMNIST的数据顺序和MNIST的数据顺序不一致，所以需要使用原始QMNIST数据。
首先是比较分析kaggle Digit Recognizer 测试数据对应原始MNIST位置并标记，然后从QMNIST原始数据集中删除。
最终训练数据集从42000扩充到92000，有助于提升训练结果。

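QMNIST is an extension of MNIST that provides 50000 additional test images: it contains 60000 training images and 60000 test images, 120000 in total. QMNIST's training data matches MNIST's training data, and its first 10000 test images match MNIST's test data.
However, the [QMNIST data](https://www.kaggle.com/fedesoriano/qmnist-the-extended-mnist-dataset-120k-images) provided on Kaggle is a single set of 120000 images that is not split into training and test data. In addition, the Digit Recognizer competition provides 42000 MNIST images for training and 28000 for testing. To use the additional QMNIST data to improve training results, the Digit Recognizer test images must first be removed from QMNIST. Because the Kaggle QMNIST data is not in the same order as MNIST, the original QMNIST files are used instead: the Digit Recognizer test images are located and marked within the original MNIST data, and the rows at the same positions are then removed from the original QMNIST data. The resulting training set grows from 42000 to 92000 images, which helps improve the training results.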

In [None]:
# Random seeds
def set_seed(seed=0):
    """Seed the NumPy and TensorFlow global random number generators.

    Args:
        seed: Value handed unchanged to both libraries (defaults to 0).
    """
    # NumPy first, then TensorFlow.
    for seed_fn in (np.random.seed, tf.random.set_seed):
        seed_fn(seed)


set_seed()

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input/'):
    for name in files:
        print(os.path.join(folder, name))


In [None]:
## Kaggle Digit Recognizer test set; these rows have to be removed from the
## 120000-image QMNIST data.
test_data = pd.read_csv('../input/digit-recognizer/test.csv')
print(test_data.shape)
## Original MNIST train and test splits as shipped with Keras.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
# Flatten every image into a single row of 28*28 values.
train_images, test_images = (
    split.reshape(-1, 28 * 28) for split in (train_images, test_images)
)
print(train_images.shape, test_images.shape)


In [None]:
## All original MNIST images: train rows followed by test rows.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); like append, it
# keeps each frame's own row labels.
all_images = pd.concat([pd.DataFrame(train_images), pd.DataFrame(test_images)])
# Call head() so the first rows are printed rather than the bound method.
print(all_images.head())

In [None]:
# All MNIST labels as one flat array: train labels followed by test labels.
all_labels = np.concatenate((np.ravel(train_labels), np.ravel(test_labels)))


In [None]:
# Count plot of the MNIST labels (train and test labels combined).
sns.countplot(x=np.append(train_labels,test_labels))

In [None]:
## Helper functions for reading the raw QMNIST files
def open_maybe_compressed_file(path):
    """Open ``path`` for binary reading, decompressing by file suffix.

    Args:
        path: File name; ``.gz`` is opened with gzip, ``.xz`` with lzma,
            anything else with the built-in ``open``.

    Returns:
        A binary file object opened in ``'rb'`` mode.
    """
    suffix_openers = (('.gz', gzip.open), ('.xz', lzma.open))
    for suffix, opener in suffix_openers:
        if path.endswith(suffix):
            return opener(path, 'rb')
    return open(path, 'rb')
    
def get_int(b):
    """Decode a big-endian unsigned integer from a bytes-like object.

    Args:
        b: The raw bytes to decode, most significant byte first.

    Returns:
        The decoded non-negative ``int``. An empty input decodes to 0, so a
        truncated header surfaces as a failed header check in the caller
        instead of an "invalid literal for int()" error here.
    """
    # int.from_bytes does directly what the hex-string round trip emulated.
    return int.from_bytes(b, 'big')

def read_idx2_int(path):
    """Read a two-dimensional big-endian int32 IDX file into a tensor.

    The first 12 bytes are read as three big-endian integers: a magic number
    that must equal ``12*256 + 2``, the number of rows and the number of
    columns. Everything after the header is parsed as ``>i4`` values.

    Args:
        path: File to read; opened through ``open_maybe_compressed_file``.

    Returns:
        A ``torch`` int64 tensor viewed as ``(length, width)``.

    Raises:
        ValueError: If the magic number is not ``12*256 + 2``.
    """
    with open_maybe_compressed_file(path) as f:
        data = f.read()
    expected_magic = 12 * 256 + 2
    magic = get_int(data[:4])
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if magic != expected_magic:
        raise ValueError(
            'unexpected IDX magic number %d in %r (expected %d)'
            % (magic, path, expected_magic)
        )
    length = get_int(data[4:8])
    width = get_int(data[8:12])
    parsed = np.frombuffer(data, dtype=np.dtype('>i4'), offset=12)
    # astype('i4') copies the big-endian values into a native int32 array
    # before the buffer is handed to torch.
    return torch.from_numpy(parsed.astype('i4')).view(length, width).long()

def read_idx3_ubyte(path):
    """Read a three-dimensional unsigned-byte IDX file into a tensor.

    The first 16 bytes are read as four big-endian integers: a magic number
    that must equal ``8*256 + 3``, the item count, the row count and the
    column count. Everything after the header is parsed as ``uint8``.

    Args:
        path: File to read; opened through ``open_maybe_compressed_file``.

    Returns:
        A ``torch`` uint8 tensor viewed as ``(length, num_rows, num_cols)``.

    Raises:
        ValueError: If the magic number is not ``8*256 + 3``.
    """
    with open_maybe_compressed_file(path) as f:
        data = f.read()
    expected_magic = 8 * 256 + 3
    magic = get_int(data[:4])
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if magic != expected_magic:
        raise ValueError(
            'unexpected IDX magic number %d in %r (expected %d)'
            % (magic, path, expected_magic)
        )
    length = get_int(data[4:8])
    num_rows = get_int(data[8:12])
    num_cols = get_int(data[12:16])
    parsed = np.frombuffer(data, dtype=np.uint8, offset=16)
    return torch.from_numpy(parsed).view(length, num_rows, num_cols)


In [None]:
## Load the raw QMNIST train and test files, then stack train followed by
## test for the images and for the labels.
qmnist_train_data = read_idx3_ubyte('/kaggle/input/qmnist/qmnist-train-images-idx3-ubyte')
qmnist_X_data = qmnist_train_data.numpy()
qmnist_test_data = read_idx3_ubyte('/kaggle/input/qmnist/qmnist-test-images-idx3-ubyte')
qmnist_t_data = qmnist_test_data.numpy()
qmnist_train_label = read_idx2_int('/kaggle/input/qmnist/qmnist-train-labels-idx2-int')
qmnist_y_train = qmnist_train_label.numpy()
qmnist_y_train = pd.DataFrame(qmnist_y_train)

# One row per image, 28*28 pixel columns.
qtd = pd.DataFrame(qmnist_t_data.reshape(-1, 28 * 28))
qxd = pd.DataFrame(qmnist_X_data.reshape(-1, 28 * 28))
# pd.concat replaces DataFrame.append (removed in pandas 2.0); row labels of
# both frames are kept as they are, so later positional slicing is unchanged.
qxd = pd.concat([qxd, qtd])

qmnist_test_label = read_idx2_int('/kaggle/input/qmnist/qmnist-test-labels-idx2-int')
qmnist_y_test = qmnist_test_label.numpy()
qmnist_y_test = pd.DataFrame(qmnist_y_test)
qmnist_y = pd.concat([qmnist_y_train, qmnist_y_test])



In [None]:
# Display the stacked QMNIST label frame (train rows followed by test rows).
qmnist_y

In [None]:
# Count plot of the values in the first column of the QMNIST label frame.
sns.countplot(x=qmnist_y.iloc[:,0].values)

In [None]:
## Visual check that QMNIST and MNIST hold the same image at the same position
import matplotlib.pyplot as plt
from random import randint

# randint includes its upper bound, so stop at the last valid MNIST row;
# the previous bound of 70000 could index one row past the end of all_images.
random_num = randint(0, len(all_images) - 1)

fig = plt.figure(figsize=(2, 2), facecolor='blue')
# Left: the MNIST image at the chosen position.
f = fig.add_subplot(2, 2, 1)
img = np.asarray(all_images.iloc[random_num, 0:].values.reshape((28, 28)) / 255)
plt.imshow(img, cmap='gray')
# Right: the QMNIST image at the same position.
f = fig.add_subplot(2, 2, 2)
img = np.asarray(qxd.iloc[random_num, 0:].values.reshape((28, 28)) / 255)
plt.imshow(img, cmap='gray')

plt.show()

In [None]:
# Put the Kaggle test rows first and the MNIST rows after them, so that
# duplicated() (which flags every repeat of an earlier row) marks the MNIST
# rows that also occur in the Kaggle test set.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
merage_images = pd.concat([pd.DataFrame(test_data.values), all_images])
check = merage_images.duplicated()

In [None]:
# Number of rows flagged as repeats of an earlier row.
check.sum()

In [None]:
## Pick out the QMNIST rows that sit at the positions of the flagged MNIST rows
# Positions 28000..97999 of the mask are the 70000 MNIST rows
# (assumes test.csv contributed the first 28000 rows — TODO confirm).
check = check.iloc[28000:98000]
# Apply the mask to the first 70000 QMNIST rows, which are compared
# position by position with MNIST.
qmnist_test = qxd.iloc[0:70000].to_numpy()[check.to_numpy()]


In [None]:
# Label rows for the same masked positions among the first 70000 QMNIST rows.
y_qmnist_test = qmnist_y.iloc[0:70000].to_numpy()[check.to_numpy()]

In [None]:
# Shapes of the held-out images and of their label rows.
for selected in (qmnist_test, y_qmnist_test):
    print(selected.shape)

In [None]:
## Invert the mask to keep the training rows, i.e. everything that is not
## test data.
check = ~check
print(check.sum())
X_qmnist = pd.DataFrame(qxd[0:70000].values[check])
# Add the remaining QMNIST rows (position 70000 onward). pd.concat replaces
# DataFrame.append (removed in pandas 2.0) and, like append, keeps the row
# labels of both parts unchanged.
X_qmnist = pd.concat([X_qmnist, qxd[70000:]])
print(X_qmnist)

In [None]:
# Label rows for the kept training images, built the same way as the image
# frame so both end up with identical row labels.
y_qmnist = pd.DataFrame(qmnist_y[0:70000].values[check])
# pd.concat replaces DataFrame.append (removed in pandas 2.0) and keeps the
# row labels of both parts unchanged.
y_qmnist = pd.concat([y_qmnist, qmnist_y[70000:]])
print(y_qmnist)

In [None]:
# Print the training label frame again for inspection.
print(y_qmnist)

In [None]:
# Training table: first label column on the left, pixel columns to its right.
train = pd.concat(
    [y_qmnist.iloc[:, 0], X_qmnist],
    axis="columns",
)

In [None]:
# Display the assembled training table.
train

In [None]:
# Save the training table to train.csv without the row index.
train.to_csv('train.csv', index=False)

In [None]:
# Display the label rows of the held-out images.
y_qmnist_test

In [None]:
# Display only the first column of the held-out label rows.
pd.DataFrame(y_qmnist_test).iloc[:,0]

In [None]:
# Validation table: first label column on the left, pixel columns to its right.
val = pd.concat(
    [pd.DataFrame(y_qmnist_test).iloc[:, 0], pd.DataFrame(qmnist_test)],
    axis="columns",
)

In [None]:
# Save the validation table to val.csv without the row index.
val.to_csv('val.csv', index=False)

In [None]:
# Training pixels as float32, divided by 255.
X = X_qmnist.to_numpy(dtype="float32") / 255
print(X.shape)

In [None]:
# Turn each row back into a 28x28 image with one trailing channel axis.
X = np.reshape(X, (-1, 28, 28, 1))
print(X.shape)


In [None]:
# Preview val.csv from the QMNIST input directory.
# NOTE(review): the val.csv written by this notebook is saved under the
# relative name 'val.csv', not under this path — confirm which file is meant.
pd.read_csv("/kaggle/input/qmnist/val.csv")

In [None]:
# The first label column is the training target.
first_label_column = y_qmnist.columns[0]
y = y_qmnist[first_label_column]
print(y)

In [None]:
# One-hot encode the training labels.
# tf.keras.utils.to_categorical is the public API; the keras.utils.np_utils
# module used before is no longer importable in current Keras releases.
y2 = tf.keras.utils.to_categorical(y)

# ***Defining the model and adding callback.***

In [None]:
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,f1_score
# NOTE(review): this reloads MNIST and overwrites the flattened arrays made
# earlier; nothing below in this notebook reads them — confirm it is needed.
(train_images,train_labels),(test_images,test_labels) = mnist.load_data()
# Scale the held-out images exactly like the training and submission inputs
# (float32 divided by 255); they were previously passed to the models as raw
# 0-255 values.
X_val = qmnist_test.reshape(-1, 28, 28, 1).astype("float32") / 255
y_val = y_qmnist_test[:, 0]
# tf.keras.utils.to_categorical replaces keras.utils.np_utils, which is no
# longer importable in current Keras releases.
y_val2 = tf.keras.utils.to_categorical(y_val)

The callback will stop the training when the training loss (`monitor='loss'`)
has not improved for 30 consecutive epochs.

The restore_best_weights=True will take the model back to its best fit. 

In [None]:
# Early stopping on the training loss with a patience of 30 epochs; the best
# weights are restored when training stops.
early_stopping_options = dict(
    monitor='loss',
    patience=30,
    verbose=1,
    restore_best_weights=True,
)
callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)




In [None]:

def _build_cnn():
    """Return one untrained Sequential CNN.

    Layer order: three random augmentation layers, four Conv2D stages (48, 96,
    192 and 384 filters, each followed by batch normalisation and dropout, the
    second and third also by 2x2 max pooling), then Flatten, Dense(32),
    dropout and a 10-way softmax output.
    """
    layers = tf.keras.layers

    def batch_norm():
        # A fresh BatchNormalization layer with the settings shared by all stages.
        return layers.BatchNormalization(momentum=0.9, epsilon=1e-5, gamma_initializer="uniform")

    stack = [
        # Random augmentation
        preprocessing.RandomTranslation(height_factor=0.05, width_factor=0.05, fill_mode='constant'),
        preprocessing.RandomRotation(factor=0.05, fill_mode='constant'),
        preprocessing.RandomZoom(height_factor=(-0.05, 0.05), width_factor=(-0.05, 0.05), fill_mode='constant'),
        # Stage 1
        layers.Conv2D(48, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        batch_norm(),
        layers.Dropout(0.4),
        # Stage 2
        layers.Conv2D(96, (3, 3), activation='relu'),
        batch_norm(),
        layers.MaxPooling2D(2, 2),
        layers.Dropout(0.4),
        # Stage 3
        layers.Conv2D(192, (3, 3), activation='relu'),
        batch_norm(),
        layers.MaxPooling2D(2, 2),
        layers.Dropout(0.4),
        # Stage 4
        layers.Conv2D(384, (3, 3), activation='relu'),
        batch_norm(),
        layers.Dropout(0.4),
        # Classifier head
        layers.Flatten(),
        layers.Dense(32),
        layers.Dropout(0.4),
        layers.Dense(10, activation='softmax'),
    ]
    return tf.keras.models.Sequential(stack)


# Ten independently initialised copies of the same architecture.
models = []
for i in range(10):
    model = _build_cnn()
    models.append(model)

In [None]:
from sklearn.model_selection import StratifiedKFold

historys = []
y = np.array(y)

# One stratified fold per model.
kfold = StratifiedKFold(n_splits=len(models), random_state=0, shuffle=True)

# enumerate() advances the model index with every fold. Before, the index
# stayed at 0, so every fold retrained and overwrote model 0 and the other
# models were never fitted. The fold variables are named train_idx/val_idx so
# they no longer overwrite the `train` DataFrame.
for i, (train_idx, val_idx) in enumerate(kfold.split(X, y)):

    models[i].compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

    history = models[i].fit(X[train_idx],
                            y[train_idx],
                            epochs=80,
                            batch_size=128,
                            validation_data=(X[val_idx], y[val_idx]),
                            verbose=1,
                            callbacks=[callback])
    historys.append(history)

    # Save the architecture as JSON ...
    model_json = models[i].to_json()
    with open('./model' + str(i) + '.json', 'w') as file:
        file.write(model_json)
    # ... and the trained weights next to it.
    models[i].save_weights('./model' + str(i) + '.h5')

    #test_loss, test_acc = model.evaluate(test_images, test_labels)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score,  precision_score
styles=[':','-.','--','-',':','-.','--','-',':','-.','--','-',':','-.','--','-']
# One name per model actually built, instead of a fixed count of 15.
names = ['model' + str(i) for i in range(len(models))]

# PLOT ACCURACIES
plt.figure(figsize=(15, 5))
for i, history in enumerate(historys):
    # Cycle through the line styles so any number of models can be drawn.
    plt.plot(history.history['val_accuracy'], linestyle=styles[i % len(styles)])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(names, loc='upper left')
axes = plt.gca()
# Zoom in on the top of the accuracy range.
axes.set_ylim([0.98, 1])
plt.show()


In [None]:
def _print_metrics_row(label, predicted, n_params):
    """Print one '|'-separated row of metrics for ``predicted`` against y_val.

    Columns: label, number of wrong predictions, parameter count, accuracy,
    macro recall, macro precision and macro F1 (scores rounded to 6 places).
    """
    print('|', label,
          '|', np.sum(predicted != y_val),
          '|', n_params,
          '|', round(accuracy_score(y_val, predicted), 6),
          '|', round(recall_score(y_val, predicted, average='macro'), 6),
          '|', round(precision_score(y_val, predicted, average='macro'), 6),
          '|', round(f1_score(y_val, predicted, average='macro'), 6),
          '|',)


# Sum of the per-class probabilities predicted by all models.
results = np.zeros((X_val.shape[0], 10))
for i, member in enumerate(models):
    y_pred = member.predict(X_val)
    results = results + y_pred
    y_pred = y_pred.argmax(1)
    _print_metrics_row(names[i], y_pred, member.count_params())

# Ensemble prediction: the class with the largest summed probability.
results = np.argmax(results, axis=1)
y_pred = results
# The parameter count shown in the summary row is that of the last model.
_print_metrics_row('summary', y_pred, models[i].count_params())


# **Prepare and send the submission to the output directory**

In [None]:
# Reload the Kaggle test set, scale it by 1/255 as float32 and reshape it to
# 28x28 images with one channel axis.
test_data = pd.read_csv('../input/digit-recognizer/test.csv')
test_data = (np.array(test_data, dtype=np.float32) / 255).reshape(-1, 28, 28, 1)




In [None]:
# ENSEMBLE PREDICTIONS AND SUBMIT
# Sum the per-class probabilities of every model that exists. The previous
# fixed range(15) indexed past the end of the model list.
results = np.zeros((test_data.shape[0], 10))
for member in models:
    results = results + member.predict(test_data)
# The class with the largest summed probability wins.
results = np.argmax(results, axis=1)
results = pd.Series(results, name="Label")
# ImageId counts from 1 and follows the number of predictions rather than a
# hard-coded 28000.
image_ids = pd.Series(range(1, len(results) + 1), name="ImageId")
submission = pd.concat([image_ids, results], axis=1)
submission.to_csv("submission.csv", index=False)

In [None]:
# Read the submission file back to check what was written.
pd.read_csv('submission.csv')

# **This is to remove a submission from the output directory**

In [None]:
#os.remove("/kaggle/working/submission.csv")