# Libraries

In [1]:
%matplotlib inline

import cv2
import numpy as np
import glob
import os
import re
import shutil
import imageio

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.svm import SVC
from sklearn.externals import joblib

from pprint import pprint
from skimage import img_as_ubyte
from matplotlib import pyplot as plt
from matplotlib import image as mpimg
from google.colab.patches import cv2_imshow
from PIL import Image



In [2]:
!pip install opencv-contrib-python



# Image resizing

In [3]:
def printSigners(cleanSignsDirectory = "/content/sign_data_cleaned"):
    columns = 4
    rows = 8

    for type in ["test", "train"]:
        mainDirectory = cleanSignsDirectory + "/" + type
        signers = [f for f in os.listdir(mainDirectory) if re.search(r'[0-9]{3}$', f)]
        for signer in signers:
            i = 1
            fig = plt.figure(figsize=(35, 35))
            fig.suptitle('Signer ' + signer + ":", fontsize=40)

            for signatureType in ["forged", "original"]:
                currDir = mainDirectory + "/" + signer + "/" + signatureType
                for image in os.listdir(currDir):
                    imagePath = currDir + "/" + image
                    img = np.array(Image.open(imagePath))
                    fig.add_subplot(rows, columns, i, title = (signatureType + ":"))
                    plt.imshow(img)
                    i = i+1

            plt.subplots_adjust(wspace=0, hspace=0.2)    
            plt.show()

def normalizeImage(im, min_size = 256, threshold = 0):
    kernel = np.ones((2,2),np.uint8)
    loadedImage = Image.open(im).convert('LA')

    outputSize = 300, 300
    x, y = loadedImage.size
    size = max(min_size, x, y)
    newImage = Image.new('RGBA', (size, size), "white")
    newImage.paste(loadedImage, (int((size - x) / 2), int((size - y) / 2)))

    newImage.thumbnail(outputSize, Image.BILINEAR) # https://pillow.readthedocs.io/en/3.1.x/reference/Image.html#PIL.Image.Image.thumbnail

    # Convert to CV2 image
    cv2Image = np.array(newImage) 

    # Remove noise
    # https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_imgproc/py_morphological_ops/py_morphological_ops.html#opening
    cv2Image = cv2.morphologyEx(cv2Image, cv2.MORPH_OPEN, kernel)

    # Close strokes
    # https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_imgproc/py_morphological_ops/py_morphological_ops.html#closing
    cv2Image = cv2.morphologyEx(cv2Image, cv2.MORPH_CLOSE, kernel)

    # Median Filter
    # https://docs.opencv.org/2.4/doc/tutorials/imgproc/gausian_median_blur_bilateral_filter/gausian_median_blur_bilateral_filter.html
    # cv2Image = cv2.medianBlur(cv2Image, 5)

    # Perform binary thresholding on the image with T = 125
    # The concept of thresholding is quite simple. Ppixel values can be any
    # value between 0 to 255. Let's say we wish to convert an image into a
    # binary image i.e. assign a pixel either a value of 0 or 1. To do this, we
    # can perform thresholding. For instance, if the Threshold (T) value is 125,
    # then all pixels with values greater than 125 would be assigned a value 
    # of 1, and all pixels with values lesser than or equal to that would be
    # assigned a value of 0. 
    # https://stackabuse.com/introduction-to-image-processing-in-python-with-opencv/
    if (threshold != 0):
        r, cv2Image = cv2.threshold(cv2Image, threshold, 255, cv2.THRESH_BINARY)

    return cv2Image

# GIF from threshold

In [None]:
imagesForGif = []
imageGifPath = '/content/threshold.gif'
imageGifImagesPath = '/content/gif'
font = cv2.FONT_HERSHEY_SIMPLEX

if (os.path.isdir(imageGifImagesPath)):
    shutil.rmtree(imageGifImagesPath)

os.mkdir(imageGifImagesPath)

for threshold in range(120, 254):
    image = normalizeImage(im = "/content/saturdays-ai-datasets/sign_data/test/049/12_049.png", threshold = threshold)
    textThreshold = str(threshold).zfill(3)
    path = imageGifImagesPath + "/" + textThreshold + ".PNG"
    
    cv2.putText(image, textThreshold + "/254", (5,30), font, 1, (0,0,0), 2, cv2.LINE_AA)
    cv2.imwrite(path, image)
 
# Create the frames
frames = []
imgs = sorted(glob.glob(imageGifImagesPath + "/*.PNG"), key=os.path.realpath)
for i in imgs:
    new_frame = Image.open(i)
    frames.append(new_frame)
 
# Save into a GIF file that loops forever
frames[0].save(imageGifPath, format='GIF',
               append_images=frames[1:],
               save_all=True,
               duration=0.5, loop=0)

print("Your GIF image is on: " + imageGifPath)

Your GIF image is on: /content/threshold.gif


# KNN
https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9

- **Precission**: Precision talks about how precise/accurate your model is out of those predicted positive, how many of them are actual positive. Precision is a good measure to determine, when the costs of False Positive is high

- **Recall**: Recall actually calculates how many of the Actual Positives our model capture through labeling it as Positive (True Positive). We use Recall to select our best model when there is a high cost associated with False Negative. For instance, in fraud detection or sick patient detection. If a fraudulent transaction (Actual Positive) is predicted as non-fraudulent (Predicted Negative), the consequence can be very bad for the bank.

- **F1**: F1 Score might be a better measure to use if we need to seek a balance between Precision and Recall AND there is an uneven class distribution (large number of Actual Negatives).

En nuestro caso, buscamos evitar al máximo los falsos negativos, ya que significaría que hemos marcado como válida una firma falsa, sin opción de revisión manual por parte de un experto, por esa razón nos centramos en la métrica de **Recall** al ejecutar nuestro modelo KNN.


In [5]:
def knn(pathList):
    j=0
    hu_moments_train = []
    hu_moments_test = []
    labels_train = []
    labels_test = []
    kernel = np.ones((2,2),np.uint8)

    for filename in pathList:
        split = (filename[1:]).split("/")
        train = True if split[2] == "train" else False
        forged = True if split[4] == "forged" else False

        if train: # Train
            if forged:
                labels_train.append(1)
            else:
                labels_train.append(0)
            img = cv2.imread(filename ,cv2.IMREAD_GRAYSCALE)
            dilate = cv2.dilate(img, kernel, iterations=1)
            #print(cv2.HuMoments(cv2.moments(img)).flatten())
            hu_moments_train.append(cv2.HuMoments(cv2.moments(img)).flatten())

        else: # Test
            if forged:
                labels_test.append(1)
            else:
                labels_test.append(0)
            img = cv2.imread(filename,cv2.IMREAD_GRAYSCALE)
            dilate = cv2.dilate(img, kernel, iterations=1)
            #print(cv2.HuMoments(cv2.moments(img)).flatten())
            hu_moments_test.append(cv2.HuMoments(cv2.moments(img)).flatten())

    hu_moments_train = np.asarray(hu_moments_train)
    hu_moments_test = np.asarray(hu_moments_test)
    accuracy = []
    precission = []
    recall = []
    label = []
    k = [i for i in range(1, 20)]

    for i in k:
        neigh = KNeighborsClassifier(n_neighbors=i)
        neigh.fit(hu_moments_train, labels_train)

        predict = neigh.predict(hu_moments_test)

        accuracyScore = accuracy_score(predict,labels_test)
        accuracy.append(accuracyScore)

        precissionScore = precision_score(predict,labels_test)
        precission.append(precissionScore)

        recallScore = recall_score(predict,labels_test)
        recall.append(recallScore)

        label.append(i)

    return [
            np.max(recall),
            accuracy[recall.index(np.max(recall))],
            precission[recall.index(np.max(recall))],
            label[recall.index(np.max(recall))]
    ]

# Load images from Github

In [6]:
!git clone https://github.com/jgmullor/saturdays-ai-datasets

Cloning into 'saturdays-ai-datasets'...
remote: Enumerating objects: 118, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (117/117), done.[K
remote: Total 4295 (delta 1), reused 117 (delta 1), pack-reused 4177[K
Receiving objects: 100% (4295/4295), 475.42 MiB | 15.42 MiB/s, done.
Resolving deltas: 100% (5/5), done.
Checking out files: 100% (6672/6672), done.


# Cleaning and resizing images

In [7]:
def cleanAndResizeImages(cleanSignsDirectory, threshold = 0):
    # Clean ouput directory from previous builds and mkdir again
    originalSignsPathList = []
    forgedSignsPathList = []

    if (os.path.isdir(cleanSignsDirectory)):
        shutil.rmtree(cleanSignsDirectory)

    os.mkdir(cleanSignsDirectory)

    # Clean test and train directories
    for dataset in ["sign_data", "cedar-signatures-dataset-organized"]:
        for dataType in ["test", "train"]:
            mainDirectory = "/content/saturdays-ai-datasets/" + dataset + "/" + dataType
            outputDirectory = cleanSignsDirectory + "/" + dataType
            if (not os.path.isdir(outputDirectory)):
                os.mkdir(outputDirectory)

            intSigner = 0
            signers = [f for f in os.listdir(mainDirectory) if re.search(r'[0-9]{1,3}$', f)]
            #signers = [f for f in os.listdir(mainDirectory)]
            for signer in signers:
                # print ("> Running in " + dataset + " (" + dataType + " - " + signer + ")")
                intSigner = intSigner + 1;
                # Create directories if required
                originalSignsDir = mainDirectory + "/" + signer
                originalSigns = os.listdir(originalSignsDir);

                forgedSignsDir = originalSignsDir + "_forg"
                forgedSigns = os.listdir(forgedSignsDir);

                signerDirectory = outputDirectory + "/" + str(intSigner)
                originalSignsTargetDir = signerDirectory + "/" + "original"
                forgedSignsTargetDir = signerDirectory + "/" + "forged"

                if (not os.path.isdir(signerDirectory)):
                    os.mkdir(signerDirectory)

                if (not os.path.isdir(originalSignsTargetDir)):
                    os.mkdir(originalSignsTargetDir)

                if (not os.path.isdir(forgedSignsTargetDir)):
                    os.mkdir(forgedSignsTargetDir)

                # Do the cleanup
                for originalSign in originalSigns:
                    imagePath = originalSignsDir + "/" + originalSign
                    imageSavePath = originalSignsTargetDir + "/" + originalSign

                    image = normalizeImage(imagePath, threshold=threshold)
                    cv2.imwrite(imageSavePath, image)
                    originalSignsPathList.append(imageSavePath)

                for forgedSign in forgedSigns:
                    imagePath = forgedSignsDir + "/" + forgedSign
                    imageSavePath = forgedSignsTargetDir + "/" + forgedSign

                    image = normalizeImage(imagePath, threshold=threshold)
                    cv2.imwrite(imageSavePath, image)
                    forgedSignsPathList.append(imageSavePath)
        
    return np.concatenate((originalSignsPathList, forgedSignsPathList), axis=None)

# Image cleaning

In [8]:
images = cleanAndResizeImages(cleanSignsDirectory = "/content/sign_data_cleaned")

#KNN

In [9]:
result = knn(images)
print (result)

[0.7138211382113822, 0.7577821011673151, 0.857421875, 8]


# Plot Signers signatures

In [None]:
printSigners(cleanSignsDirectory = "/content/sign_data_cleaned")

# Test multiple thresholds to look for best accuracy

In [None]:
print ("recall,accuracy,precission,neighbours")
for threshold in range(120, 254):
    images = cleanAndResizeImages(cleanSignsDirectory = "/content/sign_data_cleaned", threshold = threshold)
    result = [threshold] + knn(images)
    print (result)

recall,accuracy,precission,neighbours
[120, 0.5773584905660377, 0.541828793774319, 0.298828125, 6]
[121, 0.5653710247349824, 0.5379377431906615, 0.3125, 6]
[122, 0.5605095541401274, 0.5389105058365758, 0.34375, 12]
[123, 0.5571776155717761, 0.5476653696498055, 0.447265625, 1]
[124, 0.5826086956521739, 0.5389105058365758, 0.26171875, 2]
[125, 0.528344671201814, 0.5262645914396887, 0.455078125, 1]
[126, 0.5339366515837104, 0.5165369649805448, 0.23046875, 2]
[127, 0.5365853658536586, 0.5252918287937743, 0.34375, 32]
[128, 0.5170842824601367, 0.5165369649805448, 0.443359375, 1]
[129, 0.5225653206650831, 0.5204280155642024, 0.4296875, 11]
[130, 0.5393518518518519, 0.5350194552529183, 0.455078125, 9]
[131, 0.5668202764976958, 0.5301556420233463, 0.240234375, 2]
[132, 0.5155925155925156, 0.5165369649805448, 0.484375, 7]
[133, 0.5154929577464789, 0.5126459143968871, 0.357421875, 8]
[134, 0.49074074074074076, 0.4961089494163424, 0.310546875, 6]
[135, 0.4905263157894737, 0.49319066147859925, 0.4