In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import os
import cv2
from PIL import Image as ImagePIL

from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D, Dropout, BatchNormalization, Input
from tensorflow.keras.callbacks import EarlyStopping

%matplotlib inline

In [None]:
def preprocess(array):
    """
    Converts the supplied array to float32 and scales its values by 1/255.

    The shape of the array is left unchanged; only the dtype and the value
    range are altered.
    """

    as_float = array.astype("float32")
    scaled = as_float / 255.0
    return scaled


def noise(array, noise_factor=0.4):
    """
    Adds random Gaussian noise to each image in the supplied array.

    Args:
        array: NumPy array of image data (presumably already scaled to
            [0, 1] - the result is clipped to that range either way).
        noise_factor: Multiplier applied to the standard-normal noise before
            it is added to `array`. Defaults to 0.4, the value that used to
            be hard-coded here.

    Returns:
        A new array with the same shape as `array`, clipped to [0.0, 1.0].
    """

    # One standard-normal sample per element; scaling happens afterwards so
    # that noise_factor=0 returns the (clipped) input unchanged.
    gaussian = np.random.normal(loc=0.0, scale=1.0, size=array.shape)
    noisy_array = array + noise_factor * gaussian

    return np.clip(noisy_array, 0.0, 1.0)


def display(array1, array2):
    """
    Shows ten randomly chosen images from `array1` (top row) above the images
    at the same indices in `array2` (bottom row).
    """

    n = 10

    # The same random indices are used for both arrays so that each column
    # shows a matching pair.
    indices = np.random.randint(len(array1), size=n)
    images1 = array1[indices, :]
    images2 = array2[indices, :]

    plt.figure(figsize=(20, 4))
    for column, pair in enumerate(zip(images1, images2)):
        # row 0 -> image from array1, row 1 -> image from array2
        for row, image in enumerate(pair):
            ax = plt.subplot(2, n, column + 1 + row * n)
            plt.imshow(image)
            plt.gray()
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    plt.show()

In [None]:
# Google Colab only: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used throughout this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
#Do this one only when we need clean images
def process_image(path):
    """
    Contrast-enhances and denoises one image, then saves the result.

    The image is read with OpenCV, CLAHE is applied to the L channel of its
    LAB representation, the result is denoised with non-local means, and the
    output is written to the current working directory as
    ``'Clean_' + <file name of path>``.

    Args:
        path: Path of the image file to process.

    Returns:
        The constant 1 (kept for compatibility with existing callers).

    Raises:
        FileNotFoundError: if OpenCV cannot read `path`.
        OSError: if the cleaned image cannot be written.
    """
    img = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise FileNotFoundError("Could not read image: " + str(path))

    # Increase contrast: equalize only the lightness channel so that the
    # colour channels are left untouched.
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    l_channel, a_channel, b_channel = cv2.split(lab)

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced_l = clahe.apply(l_channel)

    enhanced_lab = cv2.merge((enhanced_l, a_channel, b_channel))
    enhanced_img = cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2BGR)

    # OpenCV non-local-means denoising; the positional arguments are
    # h=10, hColor=10, templateWindowSize=7, searchWindowSize=21.
    denoised = cv2.fastNlMeansDenoisingColored(enhanced_img, None, 10, 10, 7, 21)

    # Derive the output name from the argument itself rather than from a
    # global loop variable, so the function works for any caller.
    out_name = 'Clean_' + os.path.basename(path)
    if not cv2.imwrite(out_name, denoised):
        raise OSError("Could not write image: " + out_name)

    return 1

In [None]:
# Generate clean images: run process_image on every file name in train_img.
# NOTE(review): train_img is only assigned in cells further down this
# notebook, so one of those cells has to be executed before this one.
# NOTE(review): the path prefix here is the OCR_Dataset folder itself, while
# the later cells list train_img from a `train` directory - confirm the files
# really live directly under OCR_Dataset.
for f in train_img:
    process_image( '/content/drive/MyDrive/OCR_summer/OCR_Dataset/'+ f)

In [None]:
# Kaggle training set
os.chdir("/content/drive/MyDrive/OCR_summer/OCR_Dataset")

# Noisy inputs, cleaned targets, and images used for inference.
train_dir = "/content/drive/MyDrive/OCR_summer/train"
Clean_train_dir = "/content/drive/MyDrive/OCR_summer/train_cleaned"
test_img_dir = "/content/drive/MyDrive/OCR_summer/OCR_Dataset/train_demo"

# File names are sorted so that the noisy and cleaned listings line up by
# position (presumably the two folders use matching file names - TODO confirm).
train_img = os.listdir(train_dir)
train_img.sort()

Clean_train_img = os.listdir(Clean_train_dir)
Clean_train_img.sort()

test_img = os.listdir(test_img_dir)
test_img.sort()

In [None]:
# OCR training set
# These assignments rebind train_dir / Clean_train_dir / test_img_dir and the
# three listings, so whichever dataset cell ran last decides what is loaded.
train_dir = "/content/drive/MyDrive/OCR_summer/OCR_Dataset/train"
Clean_train_dir = "/content/drive/MyDrive/OCR_summer/OCR_Dataset/Clean_train"
test_img_dir = "/content/drive/MyDrive/OCR_summer/OCR_Dataset/train_demo"

# Sorted listings so that noisy and cleaned files pair up by position.
train_img = os.listdir(train_dir)
train_img.sort()

Clean_train_img = os.listdir(Clean_train_dir)
Clean_train_img.sort()

test_img = os.listdir(test_img_dir)
test_img.sort()

In [None]:
# Size (in pixels) that every image is resized to by sizedown_image.
IMG_WIDTH = 540
IMG_HEIGHT = 420

# prepare function
def sizedown_image(path):
    """
    Loads an image and converts it to a normalized grayscale array.

    The image is read with OpenCV, resized to IMG_WIDTH x IMG_HEIGHT,
    converted from BGR to grayscale and divided by 255.

    Args:
        path: Path of the image file to load.

    Returns:
        float32 NumPy array of shape (IMG_HEIGHT, IMG_WIDTH, 1).

    Raises:
        FileNotFoundError: if OpenCV cannot read `path`.
    """
    img = cv2.imread(path)
    # cv2.imread returns None on failure; np.asarray would silently turn that
    # into a NaN scalar and the error would only surface inside cv2.resize.
    if img is None:
        raise FileNotFoundError("Could not read image: " + str(path))

    img = np.asarray(img, dtype="float32")
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img / 255.0
    # Add the trailing channel axis.
    img = np.reshape(img, (IMG_HEIGHT, IMG_WIDTH, 1))

    return img

In [None]:
# Upper bound on how many files are read from each directory. The previous
# counter check (`i <= 75`) admitted 76 files, so that count is kept here.
MAX_IMAGES_PER_SET = 76


def _load_images(directory, filenames, limit=MAX_IMAGES_PER_SET):
    """Return sizedown_image() of the first `limit` files of `filenames` in `directory`."""
    return [sizedown_image(directory + '/' + name) for name in filenames[:limit]]


train = _load_images(train_dir, train_img)
train_cleaned = _load_images(Clean_train_dir, Clean_train_img)
test = _load_images(test_img_dir, test_img)

# Inputs and targets are paired by position, so the two lists must have the
# same length; fail here rather than later inside train_test_split.
assert len(train) == len(train_cleaned), (
    "noisy/clean image counts differ: %d vs %d" % (len(train), len(train_cleaned))
)

# Stack the per-image arrays into single batch arrays.
X_train = np.asarray(train)
Y_train = np.asarray(train_cleaned)
test = np.asarray(test)

In [None]:
# Hold out 10% of the image pairs for validation. random_state pins the
# shuffle so that the split is the same on every run.
# NOTE: this cell rebinds X_train / Y_train to the reduced training split, so
# re-running it without first re-running the loading cell splits the already
# reduced data again.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=42
)

In [None]:
# Website encoder, part 1: build and compile the convolutional autoencoder.

from tensorflow.keras import layers
from tensorflow.keras.models import Model

# Must match the size that the images are resized to when they are loaded.
IMG_WIDTH = 540
IMG_HEIGHT = 420

# Named `inputs` rather than `input` so the Python builtin is not shadowed.
inputs = layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 1))

# Encoder: two conv + 2x2 max-pool stages, each halving height and width.
x = layers.Conv2D(32, (3, 3), activation="relu", padding="same")(inputs)
x = layers.MaxPooling2D((2, 2), padding="same")(x)
x = layers.Conv2D(16, (3, 3), activation="relu", padding="same")(x)
x = layers.MaxPooling2D((2, 2), padding="same")(x)

# Decoder: two stride-2 transposed convolutions undo the two poolings, then
# a single-filter sigmoid convolution produces the output image in [0, 1].
x = layers.Conv2DTranspose(16, (3, 3), strides=2, activation="relu", padding="same")(x)
x = layers.Conv2DTranspose(32, (3, 3), strides=2, activation="relu", padding="same")(x)
x = layers.Conv2D(1, (3, 3), activation="sigmoid", padding="same")(x)

# Autoencoder
autoencoder = Model(inputs, x)
autoencoder.compile(optimizer="adam", loss="binary_crossentropy")
autoencoder.summary()

In [None]:
# Website encoder, part 2: train the autoencoder on the noisy -> clean pairs.
# Training stops early once the monitored loss has gone 10 epochs without
# improving.
# NOTE(review): monitor='loss' watches the training loss even though
# validation data is supplied - confirm that 'val_loss' was not intended.
callback = EarlyStopping(patience=10, monitor='loss')
history = autoencoder.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    batch_size=128,
    epochs=100,
    shuffle=True,
    callbacks=[callback],
)

In [None]:
# Restore previously saved weights into the autoencoder (this notebook never
# defines a variable called `model`; the network is `autoencoder`).
# NOTE(review): the path is relative to the current working directory, and the
# checkpoint must come from a network with the same architecture - confirm
# that './model4/epochs80' is the intended checkpoint.
autoencoder.load_weights('./model4/epochs80')

In [None]:
# Check model outcome: plot the training curves recorded by fit().
epoch_loss = history.history['loss']
epoch_val_loss = history.history['val_loss']
# MAE is only present in the history when the model was compiled with an
# 'mae' metric, so look it up defensively instead of assuming it exists.
epoch_mae = history.history.get('mae')
epoch_val_mae = history.history.get('val_mae')
has_mae = epoch_mae is not None and epoch_val_mae is not None

# One panel for the loss, plus a second one for MAE when it is available.
n_panels = 2 if has_mae else 1

plt.figure(figsize=(20,6))
plt.subplot(1, n_panels, 1)
plt.plot(range(0,len(epoch_loss)), epoch_loss, 'b-', linewidth=2, label='Train Loss')
plt.plot(range(0,len(epoch_val_loss)), epoch_val_loss, 'r-', linewidth=2, label='Val Loss')
plt.title('Evolution of loss on train & validation datasets over epochs')
plt.legend(loc='best')

if has_mae:
    plt.subplot(1, n_panels, 2)
    plt.plot(range(0,len(epoch_mae)), epoch_mae, 'b-', linewidth=2, label='Train MAE')
    plt.plot(range(0,len(epoch_val_mae)), epoch_val_mae, 'r-', linewidth=2,label='Val MAE')
    plt.title('Evolution of MAE on train & validation datasets over epochs')
    plt.legend(loc='best')

plt.show()

In [None]:
# Save the model weights.
# Switch to the dataset folder first so that the relative checkpoint path
# below resolves inside it.
os.chdir('/content/drive/MyDrive/OCR_summer/OCR_Dataset') 
# Only the weights are written here (save_weights), not the whole model.
autoencoder.save_weights('./model5/epochs50')

In [None]:
# Process a single image with the trained model and save the result.
# predict() expects a batch, so the one preprocessed image is wrapped in a
# list before being converted to an array (adds the leading batch axis).
figure = np.asarray(
    [sizedown_image('/content/drive/MyDrive/OCR_summer/OCR_Dataset/train/IMG_0845.JPG')]
)

abc = autoencoder.predict(figure)

os.chdir('/content/drive/MyDrive/OCR_summer/OCR_Dataset') 

# The network output is a float image in [0, 1] (sigmoid). Convert it to
# 8-bit explicitly instead of relying on imwrite's implicit conversion.
cleaned = np.clip(np.rint(abc[0] * 255), 0, 255).astype(np.uint8)

# cv2.imwrite reports failure through its return value, not an exception.
if not cv2.imwrite('web_Cleaned_IMG_0845.JPG', cleaned):
    raise OSError('Could not write web_Cleaned_IMG_0845.JPG')