In [73]:
import cv2
import numpy as np
import pylab as pl
from preprocess import *
from scipy.stats import linregress
from collections import Counter
import pandas as pd
import seaborn as sns
import joblib
import pickle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from classifiers_manager import *
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from joblib import parallel_backend
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearnex import patch_sklearn
patch_sklearn()


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# SIFT FEATURES

In [19]:
A = load_Dataset("../../fonts-dataset/Scheherazade New/*.jpeg")
B = load_Dataset("../../fonts-dataset/Lemonada/*.jpeg")
C = load_Dataset("../../fonts-dataset/Marhey/*.jpeg")
D = load_Dataset("../../fonts-dataset/IBM Plex Sans Arabic/*.jpeg")

1000
1000
1000
1000


In [20]:
A, A_TEST = train_test_split(A, test_size=0.2)
B, B_TEST = train_test_split(B, test_size=0.2)
C, C_TEST = train_test_split(C, test_size=0.2)
D, D_TEST = train_test_split(D, test_size=0.2)

In [21]:
A_PROCESSED = []
for img in A:
    img=denoise(img)
    words=get_lines(img)
    A_PROCESSED.extend(words)
print("A: ", len(A_PROCESSED))
B_PROCESSED = []
for img in B:
    img=denoise(img)
    words=get_lines(img)
    B_PROCESSED.extend(words)
print("B: ", len(B_PROCESSED))
C_PROCESSED = []
for img in C:
    img=denoise(img)
    words=get_lines(img)
    C_PROCESSED.extend(words)
print("C: ", len(C_PROCESSED))
D_PROCESSED = []
for img in D:
    img=denoise(img)
    words=get_lines(img)
    D_PROCESSED.extend(words)
print("D: ", len(D_PROCESSED))

A:  8000
B:  8000
C:  8000
D:  8000


In [22]:
A.clear()
B.clear()
C.clear()
D.clear()

In [23]:
A_TPROCESSED = []
for img in A_TEST:
    img=denoise(img)
    words=get_lines(img)
    A_TPROCESSED.append(words)
print("A: ", len(A_TPROCESSED))
B_TPROCESSED = []
for img in B_TEST:
    img=denoise(img)
    words=get_lines(img)
    B_TPROCESSED.append(words)
print("B: ", len(B_TPROCESSED))
C_TPROCESSED = []
for img in C_TEST:
    bimg=denoise(img)
    words=get_lines(img)
    C_TPROCESSED.append(words)
print("C: ", len(C_TPROCESSED))
D_TPROCESSED = []
for img in D_TEST:
    img=denoise(img)
    words=get_lines(img)
    D_TPROCESSED.append(words)
print("D: ", len(D_TPROCESSED))


A:  200
B:  200
C:  200
D:  200


In [24]:
A_TEST.clear()
B_TEST.clear()
C_TEST.clear()
D_TEST.clear()

In [25]:
def extract_sift_features(image):
    sift = cv2.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(image, None)
    return descriptors

In [26]:
def prepare_data(images):
    features = []
    for image in images:
        descriptors = extract_sift_features(image)
        if descriptors  is not None:
            features.append(descriptors)
    return features

In [27]:
def create_codebook(features, num_clusters, batch_size):
    # Create a MiniBatchKMeans clustering object
    kmeans = MiniBatchKMeans(n_clusters=num_clusters, batch_size=batch_size, random_state=0)

    # Fit the clustering model to the SIFT features
    kmeans.fit(features)

    return kmeans

In [28]:
def compute_bovw_representation(features, codebook):
    num_clusters = codebook.n_clusters
    bovw_representation = []

    for image_features in features:
        if len(image_features) > 0:
            # Assign each feature to a cluster
            image_features=image_features.reshape(-1,128)
            cluster_assignments = codebook.predict(image_features)

            # Create a histogram of cluster frequencies
            histogram = np.bincount(cluster_assignments, minlength=num_clusters)

            # Normalize the histogram
            histogram = histogram / np.sum(histogram)

            bovw_representation.append(histogram)
        else:
            # Handle cases where no features were detected
            bovw_representation.append(np.zeros(num_clusters))

    return bovw_representation

In [29]:
train_labels= np.array([0]*len(A_PROCESSED) + [1]*len(B_PROCESSED) + [2]*len(C_PROCESSED) + [3]*len(D_PROCESSED))

In [30]:
A_PROCESSED= prepare_data(A_PROCESSED)
B_PROCESSED= prepare_data(B_PROCESSED)
C_PROCESSED= prepare_data(C_PROCESSED)
D_PROCESSED= prepare_data(D_PROCESSED)

In [31]:
print("A: ", A_PROCESSED[10].shape)
print("B: ", B_PROCESSED[530].shape)
print("C: ", C_PROCESSED[320].shape)
print("D: ", D_PROCESSED[2034].shape)

A:  (149, 128)
B:  (256, 128)
C:  (429, 128)
D:  (442, 128)


In [32]:
combined_list = A_PROCESSED + B_PROCESSED + C_PROCESSED + D_PROCESSED
concatenated_train_sift_features = np.vstack(combined_list)
print(concatenated_train_sift_features.shape)

(8142807, 128)


In [33]:
print(len(combined_list[1][0]))

128


In [34]:
train_labels= np.array([0]*len(A_PROCESSED) + [1]*len(B_PROCESSED) + [2]*len(C_PROCESSED) + [3]*len(D_PROCESSED))

In [35]:
num_clusters=200
batch_size=1000
codebook = create_codebook(concatenated_train_sift_features, 200, 1000)
print(codebook)



MiniBatchKMeans(batch_size=1000, n_clusters=200, random_state=0)


In [36]:
joblib.dump(codebook, "codebook.pkl")

['codebook.pkl']

In [37]:
train_bovw_features = compute_bovw_representation(combined_list, codebook)

In [38]:
print(len(train_bovw_features))
print(len(train_labels))

23799
23799


In [39]:
A_PROCESSED.clear()
B_PROCESSED.clear()
C_PROCESSED.clear()
D_PROCESSED.clear()

In [40]:
train_bovw_features = np.array(train_bovw_features)
train_labels = np.array(train_labels)

## Model

In [76]:
classifiers_manager = ClassifiersManager()
classifiers_manager.train(Classifiers.svm,train_bovw_features,train_labels)



In [77]:
svm_model=None
with open('SVM.pkl', 'rb') as file:
    svm_model = pickle.load(file)

# Now svm_model contains the loaded SVM model
print(svm_model)

SVC(C=100, gamma=10, kernel='poly')


In [78]:
counts = 0 
i =0
for words in A_TPROCESSED:
    DESC= prepare_data(words)
    if(len(DESC)==0):
        continue
    bovw_features = compute_bovw_representation(DESC, codebook)
    bovw_features = np.array(bovw_features)
    # if len(DESC) == 1:
    #     DESC=DESC.reshape(1,-1)
    predictions = svm_model.predict(bovw_features)
    print(predictions)
    print("iteration: ",i)
    preds_count= Counter(predictions)
    most_common_label = preds_count.most_common(1)[0][0]
    print(most_common_label)
    if most_common_label == 0:
        counts+=1
    print(counts)
    i+=1
print("A: ", counts*100/len(A_TPROCESSED))


[2 0 0 0 0 0 2]
iteration:  0
0
1
[0 0 0 0 0]
iteration:  1
0
2
[0 0 0 0 2 0 0]
iteration:  2
0
3
[0 0 0 0 0 0 0 0 0 0]
iteration:  3
0
4
[0 0 0 0 0]
iteration:  4
0
5
[0 0 0 0 0]
iteration:  5
0
6
[0 0 0 0 3 0 0 0 0 0]
iteration:  6
0
7
[0 0 0 0 0 0 0 0 0 0]
iteration:  7
0
8
[0 0 0 0 0 0 0 0 0 0]
iteration:  8
0
9
[2 0 0 0 0 2 0 0 0 0]
iteration:  9
0
10
[0 0]
iteration:  10
0
11
[0 2 0 0 0]
iteration:  11
0
12
[0 0 0 0 0]
iteration:  12
0
13
[0 0 0 0 0]
iteration:  13
0
14
[0 0 0 0 0 0 0 0 0 0]
iteration:  14
0
15
[0 0 0 0 0 0 0 0 0 0]
iteration:  15
0
16
[0 0 0 0 0 0]
iteration:  16
0
17
[0 0 0 0 0]
iteration:  17
0
18
[0 0 0 0 0]
iteration:  18
0
19
[0 0 0 0 0 0 0 0 0 0]
iteration:  19
0
20
[0 0 0 0 0 0 0 0 0 0]
iteration:  20
0
21
[0 0 0 0 0 0 0 0 0 0]
iteration:  21
0
22
[0 0 0 0 0]
iteration:  22
0
23
[0 0 0 0 0]
iteration:  23
0
24
[0 0 0 0 0 0 0]
iteration:  24
0
25
[0 0 0 0 0 0 0 0 0 0]
iteration:  25
0
26
[0 0 0 0 0 0 0 0 0 0]
iteration:  26
0
27
[0 0 0 0 0 0 0 0 0]
iterati

In [79]:
i =0
for words in B_TPROCESSED:
    DESC= prepare_data(words)
    if(len(DESC)==0):
        continue
    bovw_features = compute_bovw_representation(DESC, codebook)
    bovw_features = np.array(bovw_features)
    # if len(DESC) == 1:
    #     DESC=DESC.reshape(1,-1)
    predictions = svm_model.predict(bovw_features)
    print(predictions)
    print("iteration: ",i)
    preds_count= Counter(predictions)
    most_common_label = preds_count.most_common(1)[0][0]
    print(most_common_label)
    if most_common_label == 1:
        counts+=1
    print(counts)
    i+=1
print("A: ", counts*100/(len(A_TPROCESSED)*2))


[3 1 1 1 1 1 1 1 1 1]
iteration:  0
1
197
[1 1 1 1 1 1 1 1 1]
iteration:  1
1
198
[1 1 1 1 1 2 1 1 1 1]
iteration:  2
1
199
[1 1 1 1 1]
iteration:  3
1
200
[1 1 1 1 1]
iteration:  4
1
201
[1 1 1 1 1]
iteration:  5
1
202
[1 1 1 1 1 1 1 1 1 1]
iteration:  6
1
203
[1 1 1 1 1]
iteration:  7
1
204
[1 1 1 1 1]
iteration:  8
1
205
[1 1 1 1 1]
iteration:  9
1
206
[1 1 1 1 1]
iteration:  10
1
207
[1 1 1 1 1 1 1 1 1 1]
iteration:  11
1
208
[3 1 1 1 1 3 1]
iteration:  12
1
209
[1 1 1 1 1 1 1 1 1 1]
iteration:  13
1
210
[1 1 1 1 1 1 1 1 1]
iteration:  14
1
211
[1 1 1 1 1 1 1 1 1 1]
iteration:  15
1
212
[1 1 1 1 1 1 1 1 1 1]
iteration:  16
1
213
[1 1 1 1 1 1 1 1 1 1]
iteration:  17
1
214
[1 1 1 1 1 1 1 1 1 1]
iteration:  18
1
215
[1 1 1 1 1 1 1 1 1 1]
iteration:  19
1
216
[1 1 1 1 1 1 1 1 1 1]
iteration:  20
1
217
[1 1 1 1 1 1 1 1 1]
iteration:  21
1
218
[1 1 1 1 1]
iteration:  22
1
219
[1 1 1 1 1 1 1 1 1 1]
iteration:  23
1
220
[1 1 1 1 1]
iteration:  24
1
221
[1 1 1 1 1 1 1 1 1 1]
iteration:  25


In [80]:
i =0
for words in C_TPROCESSED:
    DESC= prepare_data(words)
    if(len(DESC)==0):
        continue
    bovw_features = compute_bovw_representation(DESC, codebook)
    bovw_features = np.array(bovw_features)
    # if len(DESC) == 1:
    #     DESC=DESC.reshape(1,-1)
    predictions = svm_model.predict(bovw_features)
    print(predictions)
    print("iteration: ",i)
    preds_count= Counter(predictions)
    most_common_label = preds_count.most_common(1)[0][0]
    print(most_common_label)
    if most_common_label == 2:
        counts+=1
    print(counts)
    i+=1
print("A: ", counts*100/(len(A_TPROCESSED)*3))


[2 2 2 2 2]
iteration:  0
2
396
[2 2 2 2 2]
iteration:  1
2
397
[2 2 2 2 2 2 2 2 2 2]
iteration:  2
2
398
[2 2 2 2 2 2 2 2 2 2]
iteration:  3
2
399
[1 1 1 1 1 1 1 1 1 1]
iteration:  4
1
399
[2 2 2 2 2 2 2 2 2 2]
iteration:  5
2
400
[2 2 2 2 2 1 2 2 2 2]
iteration:  6
2
401
[2 2 2 2 2]
iteration:  7
2
402
[2 2 2 2 2 2 2 2 2 2]
iteration:  8
2
403
[2 2 2 2 2 2 2 2 2 2]
iteration:  9
2
404
[2 2 2 2 2]
iteration:  10
2
405
[2 2 2 2 2 0 2 2]
iteration:  11
2
406
[2 2 2 2 2 2 2 2 2 2]
iteration:  12
2
407
[2 2 2 2 2]
iteration:  13
2
408
[2 2 2 2 2]
iteration:  14
2
409
[2 2 2 2 2]
iteration:  15
2
410
[2 2 2 2 2]
iteration:  16
2
411
[0 2 2 2 2]
iteration:  17
2
412
[2 2 2 2 2]
iteration:  18
2
413
[2 2 2 2 2 2 2 2 2 2]
iteration:  19
2
414
[1 1 1 1 1 1 1 1 1 1]
iteration:  20
1
414
[2 2 2 2 2]
iteration:  21
2
415
[2 2 2 2 2 2 2 2 2 2]
iteration:  22
2
416
[2 2 2 2 2 2 3 2 2 3]
iteration:  23
2
417
[2 2 2 2 2]
iteration:  24
2
418
[3 2 2 2 2]
iteration:  25
2
419
[2 2 2 2 2 2 2 2 2 2]
iter

In [81]:
i =0
for words in D_TPROCESSED:
    DESC= prepare_data(words)
    if(len(DESC)==0):
        continue
    bovw_features = compute_bovw_representation(DESC, codebook)
    bovw_features = np.array(bovw_features)
    # if len(DESC) == 1:
    #     DESC=DESC.reshape(1,-1)
    predictions = svm_model.predict(bovw_features)
    print(predictions)
    print("iteration: ",i)
    preds_count= Counter(predictions)
    most_common_label = preds_count.most_common(1)[0][0]
    print(most_common_label)
    if most_common_label == 3:
        counts+=1
    print(counts)
    i+=1
print("A: ", counts*100/(len(A_TPROCESSED)*4))


[3 3 3 3 3 3 3 3 3 3]
iteration:  0
3
580
[2 3 3 3 1 3 3 3 3 3]
iteration:  1
3
581
[3 3 3 3 3 3 3 3 3]
iteration:  2
3
582
[3 3 3 3 3]
iteration:  3
3
583
[3 3 3 3 3 3 3 3 3 3]
iteration:  4
3
584
[3 3 3 3 3]
iteration:  5
3
585
[3 3 3 3 3 3 3 3 3 3]
iteration:  6
3
586
[3 3 3 3 3 3 3 3 3 3]
iteration:  7
3
587
[3 3 3 3 3]
iteration:  8
3
588
[3 3 3 3 3]
iteration:  9
3
589
[3 3 3 3 3 3 3 3 3 3]
iteration:  10
3
590
[3 3 3 3 3 3 3]
iteration:  11
3
591
[3 3 3 3 3]
iteration:  12
3
592
[3 3 3 3 3 3]
iteration:  13
3
593
[3 3 2]
iteration:  14
3
594
[3 3 3 3 3]
iteration:  15
3
595
[3 3 3 3 3 3 3 3 3 3]
iteration:  16
3
596
[3 3 3 3 3]
iteration:  17
3
597
[3 3 3 3 3]
iteration:  18
3
598
[3 3 3 3 3]
iteration:  19
3
599
[3 3 3 3 3 3 3 3 3 3]
iteration:  20
3
600
[3 3 3 3 3 3 3 3 3 3]
iteration:  21
3
601
[3 3 3 3 3 3 3 3 3 3]
iteration:  22
3
602
[3 3 3 3 3 3 3 3 3 3]
iteration:  23
3
603
[3 3 3 3 3 3 3 3 3 3]
iteration:  24
3
604
[3 2 3 3 3]
iteration:  25
3
605
[3 3 3 3 3 1 3 3 1]
it