In [3]:
import numpy as np
import cv2 as cv
import os
from sklearn.neighbors import KNeighborsClassifier
from skimage.feature import hog
import matplotlib.pyplot as plt
import re

In [4]:
# Directory whose image files are used for training (listed by hogTreinamento
# and getTrainingLabels).
treinamentoDiretorio = "CAPTCHA-10k/treinamento"
# Directory with the label files: one ".txt" per image, named after the image
# file with its extension replaced.
labelsDiretorio = "CAPTCHA-10k/labels10k/"
# Directory whose image files are used for validation (listed by validation).
validacaoDiretorio = "CAPTCHA-10k/validacao"

In [265]:
# Column indices where each image is cut into vertical strips; returna_regioes
# additionally prepends column 5 and appends the image width.
regioes = [35, 65, 95, 125, 155]
# Number of pixels every strip is expected to have after being padded to a
# multiple of 16 (checked in hogTreinamento and validation).
tamanhoDaRegiao = 2048

In [266]:
def padding_img(img: np.ndarray, multiply_of: int):
    """Pad a 2-D image so both dimensions become multiples of ``multiply_of``.

    Args:
        img: Two-dimensional array; its shape is unpacked into (rows, columns).
        multiply_of: Each dimension is rounded up to the next multiple of this
            value.

    Returns:
        The array produced by ``np.pad`` with ``mode='maximum'``. Extra rows
        are added at the bottom and extra columns at the right; a dimension
        that already is a multiple receives no padding.
    """
    rows, cols = img.shape

    # (-n) % m is the distance from n up to the next multiple of m, and 0 when
    # n already is a multiple.
    extra_rows = (-rows) % multiply_of
    extra_cols = (-cols) % multiply_of

    return np.pad(img, ((0, extra_rows), (0, extra_cols)), mode='maximum')

In [267]:
def returna_regioes(img, regioes):
    """Split an image into consecutive vertical strips.

    The cut points are column 5, every value in ``regioes`` and the image
    width (``img.shape[1]``). Each pair of neighbouring cut points yields one
    strip that spans all rows.

    Args:
        img: Array indexed as ``img[rows, columns]``.
        regioes: List of column indices. It is concatenated with other lists,
            so it has to be a ``list``; the caller's list is left untouched.

    Returns:
        A list with ``len(regioes) + 1`` column slices of ``img``.
    """
    limites = [5] + regioes + [img.shape[1]]
    return [img[:, inicio:fim] for inicio, fim in zip(limites, limites[1:])]


In [268]:
def hogTreinamento():
    """Compute one HOG descriptor per region for every training image.

    Every entry returned by ``os.listdir(treinamentoDiretorio)`` is read as a
    grayscale image, split with ``returna_regioes(img, regioes)`` and each
    region is described with ``skimage.feature.hog``.

    Returns:
        A list of HOG feature vectors, in directory-listing order, with the
        regions of each image kept in left-to-right order.

    Raises:
        ValueError: If an entry of the training directory cannot be read as
            an image.
    """
    features = []

    for file in os.listdir(treinamentoDiretorio):
        filePath = os.path.join(treinamentoDiretorio, file)
        img = cv.imread(filePath, cv.IMREAD_GRAYSCALE)
        if img is None:
            # cv.imread reports failure by returning None. Fail loudly rather
            # than skip the file: getTrainingLabels walks the same listing, so
            # skipping here would shift every following label.
            raise ValueError(f"Could not read training image: {filePath}")

        for regiao in returna_regioes(img, regioes):
            # The padded copy is only used for this size check.
            # NOTE(review): the descriptor is computed on the unpadded region,
            # exactly as before (the old loop rebound its loop variable, so the
            # padded image never reached hog()). Keeping that preserves
            # compatibility with predictCaptha, which does not pad either.
            if padding_img(regiao, 16).size != tamanhoDaRegiao:
                print("Erro todas as regioes precisam ter o mesmo tamanho")

            features.append(
                hog(
                    regiao,
                    orientations=9,
                    pixels_per_cell=(8, 8),
                    cells_per_block=(2, 2),
                    block_norm='L2-Hys',
                    feature_vector=True,
                    channel_axis=None,
                )
            )

    return features



In [269]:
# Training matrix: the list of HOG descriptors returned by hogTreinamento,
# converted to a NumPy array.
features = np.asarray(hogTreinamento())

In [270]:

def getTrainingLabels():
    """Return the training labels, one character per entry.

    For every entry returned by ``os.listdir(treinamentoDiretorio)`` the label
    file with the same base name and a ``.txt`` extension is read from
    ``labelsDiretorio``. Only its first six characters are kept, and each
    character becomes one label.

    Returns:
        A NumPy array of single-character strings, in directory-listing order.

    Raises:
        OSError: If a label file cannot be opened or read.
    """
    labelsArr = []

    for file in os.listdir(treinamentoDiretorio):
        # splitext removes the extension whatever its length.
        labelPATH = os.path.join(labelsDiretorio, os.path.splitext(file)[0] + '.txt')

        # The context manager closes the file even when read() raises.
        with open(labelPATH, 'r') as labelFILE:
            label = labelFILE.read()

        # Slicing never fails on short strings, so no length test is needed.
        labelsArr.extend(label[:6])

    return np.asarray(labelsArr)


In [271]:
# Per-character training labels, built from the same directory listing that
# hogTreinamento iterates over.
labelsArr = getTrainingLabels()

In [272]:
# k-nearest-neighbours classifier (k = 25) fitted on the per-region HOG
# descriptors and their single-character labels.
neigh = KNeighborsClassifier(n_neighbors=25)
neigh.fit(features, labelsArr)

# Validation


In [273]:
def predictCaptha(model, imgPath):
    """Predict the characters of a single image.

    The image is read in grayscale, split with
    ``returna_regioes(img, regioes)``, and a HOG descriptor is computed for
    each region (no padding is applied).

    Args:
        model: Fitted estimator exposing ``predict``.
        imgPath: Path of the image file to classify.

    Returns:
        Whatever ``model.predict`` returns for the stacked descriptors, one
        row per region in left-to-right order.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv.imread(imgPath, cv.IMREAD_GRAYSCALE)
    if img is None:
        # cv.imread returns None instead of raising on a missing/invalid file.
        raise ValueError(f"Could not read image: {imgPath}")

    descritores = np.asarray([
        hog(
            regiao,
            orientations=9,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            block_norm='L2-Hys',
            feature_vector=True,
            channel_axis=None,
        )
        for regiao in returna_regioes(img, regioes)
    ])
    return model.predict(descritores)
    


In [275]:
def validation(model):
    """Run the model over the validation directory.

    Every entry returned by ``os.listdir(validacaoDiretorio)`` is read as a
    grayscale image, split with ``returna_regioes(img, regioes)`` and
    described with HOG. Its label file (same base name, ``.txt`` extension, in
    ``labelsDiretorio``) is read in the same pass, so descriptors and labels
    always come from one directory listing.

    Args:
        model: Fitted estimator exposing ``predict``.

    Returns:
        A tuple ``(predict, labelsArr)``: the output of ``model.predict`` for
        all region descriptors, and the list of expected characters (at most
        the first six characters of each label file). The two are not checked
        against each other for length.

    Raises:
        ValueError: If an entry of the validation directory cannot be read as
            an image.
        OSError: If a label file cannot be opened or read.
    """
    features = []
    labelsArr = []

    for file in os.listdir(validacaoDiretorio):
        filePath = os.path.join(validacaoDiretorio, file)
        img = cv.imread(filePath, cv.IMREAD_GRAYSCALE)
        if img is None:
            # cv.imread returns None on failure; skipping the file silently
            # would hide a broken dataset, so stop with a clear message.
            raise ValueError(f"Could not read validation image: {filePath}")

        for regiao in returna_regioes(img, regioes):
            # The padded copy is only used for this size check; the descriptor
            # is computed on the unpadded region, as it was before (the old
            # loop discarded the padded image by rebinding its loop variable).
            if padding_img(regiao, 16).size != tamanhoDaRegiao:
                print("Erro todas as regioes precisam ter o mesmo tamanho")

            features.append(
                hog(
                    regiao,
                    orientations=9,
                    pixels_per_cell=(8, 8),
                    cells_per_block=(2, 2),
                    block_norm='L2-Hys',
                    feature_vector=True,
                    channel_axis=None,
                )
            )

        labelPATH = os.path.join(labelsDiretorio, os.path.splitext(file)[0] + '.txt')
        # The context manager closes the file even when read() raises.
        with open(labelPATH, 'r') as labelFILE:
            label = labelFILE.read()

        labelsArr.extend(label[:6])

    predict = model.predict(np.asarray(features))

    return (predict, labelsArr)

# Predicted characters and expected characters for the validation directory,
# using the KNN model fitted above.
predict, labels = validation(neigh)

