In [1]:
#import cv2
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pylab as pl
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import MiniBatchKMeans
from preprocess import *
from collections import Counter
import joblib
import cv2
# for faster execution using gpu
import cupy as cp
import numpy as np
import multiprocessing
%load_ext autoreload
%autoreload 2

In [2]:
# Load the page images of the four font families. A, B, C and D are later mapped to
# the class labels 0, 1, 2 and 3 when `train_labels` is built.
# load_Dataset comes from the star-imported local `preprocess` module; presumably it
# expands the glob pattern and returns a list of images — verify in preprocess.py.
A = load_Dataset("../fonts-dataset/Scheherazade New/*.jpeg")
B= load_Dataset("../fonts-dataset/Lemonada/*.jpeg")
C= load_Dataset("../fonts-dataset/Marhey/*.jpeg")
D= load_Dataset("../fonts-dataset/IBM Plex Sans Arabic/*.jpeg")

832
832
816
816


In [3]:
# Hold out 20% of each font's pages for evaluation. A fixed random_state makes the
# split (and therefore every accuracy reported below) reproducible across kernel
# restarts; without it each run evaluates on a different random subset.
# NOTE: A-D are rebound to their training subsets, so re-running this cell without
# reloading the data would split the already-reduced lists again.
A, A_TEST = train_test_split(A, test_size=0.2, random_state=42)
B, B_TEST = train_test_split(B, test_size=0.2, random_state=42)
C, C_TEST = train_test_split(C, test_size=0.2, random_state=42)
D, D_TEST = train_test_split(D, test_size=0.2, random_state=42)

In [4]:
def _extract_words(images):
    """Run the `preprocess` pipeline on every image and pool the results.

    For each image the calls are, in order: threshold_image -> assure_white_bg
    (both on the original image), denoise (on the original image), and finally
    get_words_grey(binary, denoised, True). Everything get_words_grey returns
    (word crops, judging by its name) is appended to a single flat list.
    """
    pooled = []
    for img in images:
        # The binary mask is computed from the raw image, before denoising.
        binr = threshold_image(img)
        binr = assure_white_bg(binr)
        img = denoise(img)
        pooled.extend(get_words_grey(binr, img, True))
    return pooled


# One flat list of words per font; the printed sizes are the per-class sample counts.
A_PROCESSED = _extract_words(A)
print("A: ", len(A_PROCESSED))
B_PROCESSED = _extract_words(B)
print("B: ", len(B_PROCESSED))
C_PROCESSED = _extract_words(C)
print("C: ", len(C_PROCESSED))
D_PROCESSED = _extract_words(D)
print("D: ", len(D_PROCESSED))

A:  41086
B:  53558
C:  41762
D:  46844


In [5]:
def lbp_calculated_pixel(img, x, y):
    """Compute normalised 8-neighbour LBP codes at the given pixel position(s).

    Parameters
    ----------
    img : 2-D array indexed as ``img[row, col]``.
    x, y : row and column index. Either scalars or equally shaped integer
        arrays (e.g. from ``np.meshgrid``); every position needs a full 3x3
        neighbourhood inside ``img``.

    Returns
    -------
    1-D float array with one value per requested position. Each neighbour
    contributes a 1-bit when it is ``>=`` the centre; bits are weighted
    clockwise starting at the top-left neighbour (weight 128) and ending at the
    left neighbour (weight 1). The codes are then divided by the sum of all
    codes of this call so that they add up to 1. If that sum is 0 (every centre
    is strictly greater than all its neighbours) zeros are returned instead of
    the NaNs a 0/0 division would produce.
    """
    center = img[x, y]
    # Clockwise from top-left; the first offset becomes the most significant bit.
    offsets = ((-1, -1), (-1, 0), (-1, 1), (0, 1), (1, 1), (1, 0), (1, -1), (0, -1))
    neighbours = np.array([img[x + dx, y + dy] for dx, dy in offsets])
    # (8, ...) -> (n_positions, 8): one row of bits per pixel.
    bits = (neighbours >= center).reshape((8, -1)).T.astype(int)
    weights = 2 ** np.arange(8)[::-1]
    codes = np.sum(bits * weights, axis=1)
    total = np.sum(codes)
    if total == 0:
        # Guard the normalisation: NaN features would be rejected by the classifier.
        return np.zeros(codes.shape, dtype=float)
    return codes / total

In [6]:
def lbp_feature_vector(img):
    """Return the flattened LBP descriptor of a 2-D image.

    The descriptor holds one value per interior pixel (the one-pixel border is
    skipped because it has no complete 3x3 neighbourhood), in row-major order.
    """
    pixels = np.array(img)
    n_rows, n_cols = pixels.shape
    interior_rows = np.arange(1, n_rows - 1)
    interior_cols = np.arange(1, n_cols - 1)
    # 'ij' indexing keeps the grids in (row, col) orientation.
    row_grid, col_grid = np.meshgrid(interior_rows, interior_cols, indexing='ij')
    return lbp_calculated_pixel(pixels, row_grid, col_grid).flatten()

In [7]:
def prepare_data(images):
    """Map every image in `images` to its LBP feature vector, preserving order."""
    return [lbp_feature_vector(image) for image in images]

In [8]:
# One integer label per training word: 0 for A, 1 for B, 2 for C, 3 for D, laid out
# in the same order in which the four lists are concatenated later on.
class_sizes = [len(A_PROCESSED), len(B_PROCESSED), len(C_PROCESSED), len(D_PROCESSED)]
train_labels = np.array([label for label, size in enumerate(class_sizes) for _ in range(size)])

In [9]:
# Replace each list of word images by its list of LBP feature vectors. prepare_data
# returns one vector per input in the same order, so `train_labels` (built from the
# list lengths) stays aligned.
# NOTE: the names are rebound in place, so this cell is not idempotent — a second run
# would pass 1-D feature vectors to lbp_feature_vector, which unpacks a 2-D shape.
A_PROCESSED= prepare_data(A_PROCESSED)
B_PROCESSED= prepare_data(B_PROCESSED)
C_PROCESSED= prepare_data(C_PROCESSED)
D_PROCESSED= prepare_data(D_PROCESSED)

In [10]:
# Spot-check the descriptor shape of one hard-coded sample per font. The indices
# assume each list has at least that many entries.
print("A: ", A_PROCESSED[100].shape)
print("B: ", B_PROCESSED[5380].shape)
print("C: ", C_PROCESSED[30].shape)
print("D: ", D_PROCESSED[204].shape)

A:  (3844,)
B:  (3844,)
C:  (3844,)
D:  (3844,)


In [11]:
# Stack all descriptors (A, then B, C, D — the order train_labels was built in)
# into one array with a row per training word.
combined_list = np.array([*A_PROCESSED, *B_PROCESSED, *C_PROCESSED, *D_PROCESSED])

In [12]:
# Drop the per-font descriptor lists (four separate empty lists) so the kernel can
# release them; the data now lives in combined_list.
A_PROCESSED, B_PROCESSED, C_PROCESSED, D_PROCESSED = [], [], [], []

In [13]:
# Number of rows (training words) in the stacked descriptor array.
print(combined_list.shape[0])

183250


In [14]:
# 5-nearest-neighbour classifier over the word descriptors, searched with a ball tree
# and parallelised over all available cores (n_jobs=-1).
knn_classifier = KNeighborsClassifier(
    algorithm='ball_tree',
    n_neighbors=5,
    n_jobs=-1,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
knn_classifier.fit(combined_list, train_labels)

In [15]:
# joblib.dump(knn_classifier, 'knn_classifier_model.joblib')

In [16]:
def _extract_words_per_image(images):
    """Run the `preprocess` pipeline on every image, keeping results grouped.

    Same call sequence as for the training data (threshold_image ->
    assure_white_bg on the raw image, denoise on the raw image, then
    get_words_grey(binary, denoised, True)), but the return value of
    get_words_grey is appended as ONE entry per image instead of being
    flattened, so predictions can later be aggregated per page.
    """
    grouped = []
    for img in images:
        # The binary mask is computed from the raw image, before denoising.
        binr = threshold_image(img)
        binr = assure_white_bg(binr)
        img = denoise(img)
        grouped.append(get_words_grey(binr, img, True))
    return grouped


# One entry per held-out page; the printed sizes are page counts, not word counts.
A_TPROCESSED = _extract_words_per_image(A_TEST)
print("A: ", len(A_TPROCESSED))
B_TPROCESSED = _extract_words_per_image(B_TEST)
print("B: ", len(B_TPROCESSED))
C_TPROCESSED = _extract_words_per_image(C_TEST)
print("C: ", len(C_TPROCESSED))
D_TPROCESSED = _extract_words_per_image(D_TEST)
print("D: ", len(D_TPROCESSED))


A:  167
B:  167
C:  164
D:  164


In [17]:
# Page-level accuracy for font A (label 0): classify every word of a page and let the
# majority label decide. `counts` is the running number of correctly labelled pages
# and is reused by the following evaluation cells.
counts = 0
page_no = 0
for words in A_TPROCESSED:
    descriptors = np.array(prepare_data(words))
    if len(descriptors) == 0:
        # Page without any extracted word: skipped, but still part of the denominator.
        continue
    if descriptors.ndim == 1:
        descriptors = descriptors.reshape(1, -1)
    preds = knn_classifier.predict(descriptors)
    print("iteration: ", page_no)
    print(preds)
    most_common_label = Counter(preds).most_common(1)[0][0]
    counts += int(most_common_label == 0)
    print(counts)
    page_no += 1
print("A: ", counts*100/len(A_TPROCESSED))
    

iteration:  0
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]
1
iteration:  1
[0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 3 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0
 0 0 2 2 0 0 0 3 0 0 0 0 0 0 0 0 0 3 0 2 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0]
2
iteration:  2
[0 0 0 0 0 0 3 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
3
iteration:  3
[0 0 0 0 0 0 0 0 0 0 3 3 0 0 3 0 0 0 0 3 0 0 0 0 0 2 1 0 0 2 0 3 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 3 3 0 0 0 0 0 0 0 3 3 0 0 3 0 0 0 0 0 0 3 0 0 3 0 0 0 0
 0 3 0 0 3 0]
4
iteration:  4
[0 0 3 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0
 0 0 0 0 0 0 0 0 3 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 3 0 3 0 0 0 3 0 0 0 0 1 0
 0 0 3 0 0 0 0 3 0 3 0 3 0 0 0 0]
5
iteration:  5
[0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0
 0 0]
6
iteration:  6
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [18]:
# Page-level evaluation for font B (label 1). `counts` is deliberately NOT reset: it
# carries the number of correctly classified A pages over from the previous cell, so
# the figure printed at the end is the cumulative accuracy over the A and B pages.
i = 0
for words in B_TPROCESSED:
    DESC = prepare_data(words)
    DESC = np.array(DESC)
    if len(DESC) == 0:
        # No words extracted from this page: nothing to vote on. The page still counts
        # in the denominator below, i.e. it is scored as a miss.
        continue
    if len(DESC.shape) == 1:
        DESC = DESC.reshape(1, -1)
    preds = knn_classifier.predict(DESC)
    print("iteration: ", i)
    print(preds)
    # Majority vote over the per-word predictions decides the page label.
    preds_count = Counter(preds)
    most_common_label = preds_count.most_common(1)[0][0]
    if most_common_label == 1:
        counts += 1
    print(counts)
    i += 1
# Divide by the actual number of A and B test pages; the classes are not guaranteed
# to have the same number of pages, so 2 * len(A_TPROCESSED) would be wrong.
print("A+B: ", counts*100/(len(A_TPROCESSED) + len(B_TPROCESSED)))
    

iteration:  0
[1 3 1 0 1 1 0 3 3 1 1 1 1 1 0 0 0 3 0 1 1 2 0 1 1 1 0 3 0 1 0 1 1 0 1 1 1
 1 0 1 1 1 3 2 0 3 0 1 1 1 1]
167
iteration:  1
[1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 0 0 1
 1 1 3 1 1 1 3 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
168
iteration:  2
[1 1 1 1 1 0 1 1 1 0 1 0 0 0 3 0 1 3 2 1 1 1 2 0 0 1 1 1 0 0 3 1 1 0 1 0 0
 0 1 1 0 0 1 2 1 0 0 0 0 0 1 0 0 2 1 0 1 0 0 0 0 1 1 1 0 0 1 0 0 1 3 0 0 0
 1 1 3 0 3 0 2 0 3 0 0 0 1 0 0]
168
iteration:  3
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 0 1 1 1 3 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 3 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
169
iteration:  4
[1 3 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 3 1 1 1 0 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1]
170
iteration:  5
[1 1 1 1 1 1 1 1 1 1 2 1 3 1 1 0 1 0 1 1 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0
 3 1 1 1]
171
itera

In [19]:
# Page-level evaluation for font C (label 2). `counts` is deliberately NOT reset: it
# already holds the correctly classified A and B pages, so the figure printed at the
# end is the cumulative accuracy over the A, B and C pages.
i = 0
for words in C_TPROCESSED:
    DESC = prepare_data(words)
    DESC = np.array(DESC)
    if len(DESC) == 0:
        # No words extracted from this page: nothing to vote on. The page still counts
        # in the denominator below, i.e. it is scored as a miss.
        continue
    if len(DESC.shape) == 1:
        DESC = DESC.reshape(1, -1)
    preds = knn_classifier.predict(DESC)
    print("iteration: ", i)
    print(preds)
    # Majority vote over the per-word predictions decides the page label.
    preds_count = Counter(preds)
    most_common_label = preds_count.most_common(1)[0][0]
    if most_common_label == 2:
        counts += 1
    print(counts)
    i += 1
# Divide by the actual number of pages evaluated so far; the test sets differ in size
# between fonts, so 3 * len(A_TPROCESSED) would overstate the denominator.
pages_so_far = len(A_TPROCESSED) + len(B_TPROCESSED) + len(C_TPROCESSED)
print("A+B+C: ", counts*100/pages_so_far)
    

iteration:  0
[0 2 2 2 2 2 1 0 0 2 2 3 0 2 0]
299
iteration:  1
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2
 3 2 2 2 2 3 2 2 3 2 2 0 3 2 2 2 2 2 3 2 2 2 2]
300
iteration:  2
[0 2 0 2 0 3 3 2 0 0 0 2 0 0 1 0 0]
300
iteration:  3
[0 0 2 2 2 3 0 3 0 2 3 3]
300
iteration:  4
[2 2 2 2 3 2 2 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 2 3 2 2 2 0 2 2 2 2 0 2 2 2 2
 2 0 2 0 2 2 2 2 0 0 2 2 0 2 2 2 2 3 2 0 2 3 0 2 2 2 2 3 2 2 2 2 2 2 1 0 0
 3 2 2 2 3 2 2 2 3 2 2 3 0 0 2]
301
iteration:  5
[2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 0 2 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2
 2]
302
iteration:  6
[0 2 0 2 2 2 2 3 2 2 2 2 3 0 0 2 2 2 2 2 2 2 2 2 0 2 2 3 2 2 2 3 2 2 2 2 2
 2 2 0 2 3 2 2 2 2 2 2 3 2 2 2 2 2 0 2 2 3 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 2
 2 2 2 2 2 3 2 2 2 3 2 2 3 2 2 0 0 0 2 2 2 0 2 2 2 3 0 2 3 2 2 2 2 2 0 2]
303
iteration:  7
[2 0 0 2 2 3 0 2 2 2 2 3 3 2 2 2 2 0 2 3 3 2 2 2 2 0 2 2 2 0 2 0 0 2 2 3 1
 2 3 2 2 2 2 3 2 2 2 2]
304
iteration:  8
[2 2 2 3 2 2 2 2 2 2 0 2 2 2 3 2 2 2 2 2 

In [20]:
# Page-level evaluation for font D (label 3). `counts` is deliberately NOT reset: it
# already holds the correctly classified A, B and C pages, so the figure printed at
# the end is the overall accuracy across all four fonts.
i = 0
for words in D_TPROCESSED:
    DESC = prepare_data(words)
    DESC = np.array(DESC)
    if len(DESC) == 0:
        # No words extracted from this page: nothing to vote on. The page still counts
        # in the denominator below, i.e. it is scored as a miss.
        continue
    if len(DESC.shape) == 1:
        DESC = DESC.reshape(1, -1)
    preds = knn_classifier.predict(DESC)
    print("iteration: ", i)
    print(preds)
    # Majority vote over the per-word predictions decides the page label.
    preds_count = Counter(preds)
    most_common_label = preds_count.most_common(1)[0][0]
    if most_common_label == 3:
        counts += 1
    print(counts)
    i += 1
# Divide by the actual total number of test pages; the test sets differ in size
# between fonts, so 4 * len(A_TPROCESSED) would understate the overall accuracy.
total_pages = len(A_TPROCESSED) + len(B_TPROCESSED) + len(C_TPROCESSED) + len(D_TPROCESSED)
print("A+B+C+D: ", counts*100/total_pages)
    

iteration:  0
[3 3 3 3 3 1 3 3 3]
403
iteration:  1
[2 1 1 3 3 0 0 0 3 2 0 3 3 0 0 0]
403
iteration:  2
[3 3 3 3 3 3 0 0 3 3 3 3 3 0 3 3 3 0 0 3 0 3 3 3 3 0 3 3 3 0 3 0 3 3 3 0 3
 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 0 0 3 3 3 3 0 0 0 0 3 3 3 3 0
 3 3 0 3 3 3 3 3 3 3 3 3 0 0 0 3 3 3 3]
404
iteration:  3
[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3]
405
iteration:  4
[3 3 3 3 3 3 3 0 1 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3]
406
iteration:  5
[3 3 3 3 3 3 3 1 3 3 3 0 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 0 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 0 3 0 3 3 0 3 3 3 3 3 3 0 3 3 3 2 0 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3
 0 3 3 3 0]
407
iteration:  6
[0 3 0 3 3 3 3 3 3 3 3 3 3 3 3 0 3 0 3 3 3 3 3 3 3 2 3 0 3 3 3 3 3 3 3 3 3
 3 0 3 3 3 0 3 1 3 3 3 3 3 3 3 3 3 3 3]
408
iteration:  7
[3 0 0 3 3 3 1 3 3 3 3 3 3 0 3 3 3 3 0 3 3 3 0 0 3 3 0 3 3 3 0 3 0 2 0 3 3
 3 0 3 0 3