-
Notifications
You must be signed in to change notification settings - Fork 1
/
helper.py
352 lines (241 loc) · 10.1 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
from os import remove
from os import listdir
from os.path import isfile
from os.path import join
from datetime import date
from datetime import datetime
import calendar
import pickle
import fitz
from fuzzywuzzy import fuzz
from re import match
from pandas import DataFrame
from random import randint
from PIL import Image
from cv2 import imread
from cv2 import imwrite
from cv2 import rectangle
from init import IMAGE_FOLDER
from init import IMAGE_MAX_SIZE
from init import PDF_FOLDER
from init import PDF_MAX_SIZE
from init import PREDICTIONS_FOLDER
def validate_form_data(form_data: dict) -> list:
    """Return the names of the form fields that have no usable value.

    A field is reported when its value equals the empty string or ``None``.
    Names are returned in the dict's iteration order.
    """
    blank_values = ('', None)
    return [name for name, value in form_data.items() if value in blank_values]
def get_end_of_month(date_input: date) -> datetime:
    """Return a datetime (at midnight) for the last day of ``date_input``'s month."""
    year, month = date_input.year, date_input.month
    # monthrange yields (weekday of the 1st, number of days in the month).
    _, last_day = calendar.monthrange(year, month)
    return datetime(year, month, last_day)
def convert_and_resize(image: Image.Image, instance_id: str, document_type: str) -> tuple[str, str]:
    """Shrink an image to fit within IMAGE_MAX_SIZE and save it as a PNG.

    Note that ``thumbnail`` resizes ``image`` in place, so the object passed
    in by the caller is modified.

    Args:
        image: The PIL image to resize and store.
        instance_id: Identifier used as the first part of the file name.
        document_type: Label used as the second part of the file name.

    Returns:
        A ``(file name, file path)`` pair for the PNG written to IMAGE_FOLDER.
    """
    # Resize in place while maintaining the aspect ratio
    image.thumbnail(IMAGE_MAX_SIZE)
    output_filename = '{}_{}.png'.format(instance_id, document_type)
    output_filepath = join(IMAGE_FOLDER, output_filename)
    # Save the resized image
    image.save(output_filepath)
    return output_filename, output_filepath
def draw_bounding_box_on_image(image_path: str, image_name: str, bounding_boxes: list, model_type: str):
    """Draw every bounding box onto an image and save the annotated copy.

    Each entry of ``bounding_boxes`` is a dict read through the keys 'x', 'y',
    'w' and 'h'. One random colour is chosen per call and used for all boxes.
    The result of ``imread`` is used without validation.

    Returns the path of the file written to PREDICTIONS_FOLDER, named
    ``<model_type>_<image_name>``.
    """
    canvas = imread(image_path)
    # Three random channel values, drawn once so all boxes share one colour
    outline_colour = tuple(randint(0, 255) for _ in range(3))
    for box in bounding_boxes:
        left, top = box.get('x'), box.get('y')
        box_width, box_height = box.get('w'), box.get('h')
        # Corners are top-left and bottom-right; line thickness is 2
        rectangle(canvas, (left, top), (left + box_width, top + box_height), outline_colour, 2)
    target_path = join(PREDICTIONS_FOLDER, '{}_{}'.format(model_type, image_name))
    imwrite(target_path, canvas)
    return target_path
def write_dict_to_pickle(dictionary: dict, model_type: str, name: str) -> str:
    """Pickle a dictionary into PREDICTIONS_FOLDER and return the file path.

    Args:
        dictionary: The object to serialise.
        model_type: Prefix of the output file name.
        name: Suffix of the output file name (before the ``.pkl`` extension).

    Returns:
        Path of the written ``<model_type>_<name>.pkl`` file.
    """
    file_path = join(PREDICTIONS_FOLDER, '{}_{}.pkl'.format(model_type, name))
    # The with-statement closes the handle; no explicit close() is required.
    with open(file_path, 'wb') as file:
        pickle.dump(dictionary, file)
    return file_path
def read_pickle_file(file_path: str):
    """Load and return the object stored in a pickle file.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.
    """
    # SECURITY: unpickling can execute arbitrary code. Only use this on files
    # written by this application, never on files supplied by a user.
    with open(file_path, 'rb') as file:
        # The with-statement closes the handle; no explicit close() is required.
        return pickle.load(file)
def convert_coordinates(polygon_coordinates: list) -> list:
    """Convert polygons into axis-aligned boxes described by x, y, w and h.

    Each polygon is a sequence of points whose first two items are read as
    x and y. The result holds one dict per polygon with integer 'x', 'y'
    (minimum corner), 'w' and 'h' (extent) values.
    """

    def to_box(points) -> dict:
        xs = [pt[0] for pt in points]
        ys = [pt[1] for pt in points]
        left, top = min(xs), min(ys)
        span_x = max(xs) - left
        span_y = max(ys) - top
        return {'x': int(left), 'y': int(top), 'w': int(span_x), 'h': int(span_y)}

    return [to_box(points) for points in polygon_coordinates]
def read_ocr_results(ocr_results: list) -> dict:
    """Split OCR result entries into parallel lists.

    Every entry is read as ``entry[0]`` (bounding box) and ``entry[1]``, whose
    first two items are the text and its score. The returned dict has the
    keys 'bbox', 'text' and 'score', each holding a list in input order.
    """
    boxes, words, confidences = [], [], []
    for entry in ocr_results:
        boxes.append(entry[0])
        recognised = entry[1]
        words.append(recognised[0])
        confidences.append(recognised[1])
    return {'bbox': boxes, 'text': words, 'score': confidences}
def cut_image(image_path: str, file_name: str) -> tuple[str, str, str, str]:
    """Resize an image and split it into a top half and a bottom half.

    The image is shrunk to fit PDF_MAX_SIZE (aspect ratio preserved), cut
    horizontally at ``height // 2`` and both halves are saved as PNG files in
    IMAGE_FOLDER.

    Args:
        image_path: Path of the image to split.
        file_name: Base name for the outputs (``<file_name>_1.png`` for the
            top half, ``<file_name>_2.png`` for the bottom half).

    Returns:
        ``(top path, bottom path, top file name, bottom file name)``.
    """
    top_image_name = '{}_1.png'.format(file_name)
    bottom_image_name = '{}_2.png'.format(file_name)
    top_image_path = join(IMAGE_FOLDER, top_image_name)
    bottom_image_path = join(IMAGE_FOLDER, bottom_image_name)

    # Open through a context manager so the source file handle is released
    # even when resizing, cropping or saving fails.
    with Image.open(image_path) as image:
        # Resize the image while maintaining the aspect ratio
        image.thumbnail(PDF_MAX_SIZE)
        width, height = image.size
        middle = height // 2
        # Crop boxes are (left, upper, right, lower)
        image.crop((0, 0, width, middle)).save(top_image_path)
        image.crop((0, middle, width, height)).save(bottom_image_path)

    return top_image_path, bottom_image_path, top_image_name, bottom_image_name
def save_pdf_file(instance_id: str, model_type: str, uploaded_file) -> tuple[str, str]:
    """Write an uploaded PDF into PDF_FOLDER.

    Args:
        instance_id: Identifier used as the first part of the file name.
        model_type: Label used as the second part of the file name.
        uploaded_file: Object whose ``getbuffer()`` returns the PDF bytes.

    Returns:
        ``(save path, file name without extension)``.
    """
    file_name = '{}_{}'.format(instance_id, model_type)
    save_path = join(PDF_FOLDER, '{}.pdf'.format(file_name))
    # The with-statement closes the handle; no explicit close() is required.
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return save_path, file_name
def convert_from_pdf_to_png(path_to_pdf: str, file_name: str, zoom: float = 5):
    """Render the first page of a PDF to a PNG file in IMAGE_FOLDER.

    Args:
        path_to_pdf: Path of the PDF document to open.
        file_name: Base name (without extension) of the PNG to write.
        zoom: Scale factor applied to both axes when rendering; defaults to
            the previously hard-coded value of 5.

    Returns:
        ``(pages, output_path)`` where ``pages`` is the list of page objects
        obtained by iterating the document and ``output_path`` is the PNG path.
        NOTE(review): the document is closed before ``pages`` is returned, so
        callers presumably only count the pages - confirm before using them.
    """
    output_path = join(IMAGE_FOLDER, '{}.png'.format(file_name))
    doc = fitz.open(path_to_pdf)
    try:
        mat = fitz.Matrix(zoom, zoom)
        pages = list(doc)
        # Only the first page is rendered
        page = doc.load_page(0)
        pix = page.get_pixmap(matrix=mat)
        pix.save(output_path)
    finally:
        # Release the document even when loading or rendering fails
        doc.close()
    return pages, output_path
def get_row_average_mid_point(row: list) -> float:
    """Return the average vertical mid-point of a four-point box.

    ``row`` holds four points whose second item is the y value. The mid-point
    between points 0 and 3 (treated as the left edge) and the mid-point
    between points 1 and 2 (treated as the right edge) are averaged.
    """

    def midpoint(start, end):
        return start + ((end - start) / 2)

    left_mid = midpoint(row[0][1], row[3][1])
    right_mid = midpoint(row[1][1], row[2][1])
    return (left_mid + right_mid) / 2
def in_box_range(y_value: float, box: list) -> bool:
    """Tell whether ``y_value`` lies within the vertical span of a four-point box.

    The span starts at the smaller y of points 0 and 1 and ends at the larger
    y of points 2 and 3; both limits are inclusive.
    """
    upper_limit = min(box[0][1], box[1][1])
    lower_limit = max(box[2][1], box[3][1])
    return bool(upper_limit <= y_value <= lower_limit)
def restructured_detected_text(bounding_boxes: list, box_mid_points: list, text_detected) -> DataFrame:
    """Group detected words into rows and return them as a DataFrame.

    For every word, the indices of all boxes whose vertical span contains that
    word's mid-point form a candidate row. Duplicate candidates are dropped,
    the largest candidates are considered first, and a candidate is kept only
    when none of its words already belongs to a kept row. Kept rows are
    ordered by their first word index and each becomes one DataFrame row of
    the corresponding texts.
    """
    # One candidate row per word: every box index sharing that word's line
    candidate_rows = [
        tuple(
            other_index
            for other_index, other_box in enumerate(bounding_boxes)
            if in_box_range(box_mid_points[anchor_index], other_box)
        )
        for anchor_index in range(len(bounding_boxes))
    ]

    # Tuples are hashable, so a set removes duplicate candidates;
    # the longest candidates are then tried first
    candidate_rows = sorted(list(set(candidate_rows)), key=len, reverse=True)

    accepted_rows, used_indices = [], set()
    for candidate in candidate_rows:
        # Skip a candidate that overlaps with a row accepted earlier
        if any(index in used_indices for index in candidate):
            continue
        accepted_rows.append(candidate)
        used_indices.update(candidate)

    # Order rows by their first word index
    accepted_rows.sort(key=lambda group: group[0])

    table = [[text_detected[index] for index in group] for group in accepted_rows]
    return DataFrame(table)
def get_national_id_detected_text(attribute: str, text_df: DataFrame) -> str | None:
result = text_df[text_df.iloc[:, 0] == attribute]
if not result.empty:
return result.iloc[0, 1]
return None
def get_national_id_detected_features(results: dict) -> list:
    """Report which expected national-ID features were detected.

    Args:
        results: Detection output; its 'classes' entry holds the detected
            class names. A missing or ``None`` entry is treated as "nothing
            detected" instead of raising ``TypeError``.

    Returns:
        One ``[class name, mark]`` pair per expected class, in a fixed order,
        where the mark is '✅' when the class was detected and '❌' otherwise.
    """
    detected_classes = results.get('classes')
    if detected_classes is None:
        detected_classes = ()
    expected_classes = ['court-of-arms', 'fingerprint', 'seal', 'zimbabwe-bird']
    return [
        [class_name, '✅' if class_name in detected_classes else '❌']
        for class_name in expected_classes
    ]
def get_kyc_document_detected_text(search_term_list: list, detected_text_list: list, detected_bboxes_list: list):
    """Return the bounding boxes of detected texts that contain a search term.

    Results are grouped by search term (in the order given) and, within a
    term, follow the order of ``detected_text_list``. A text containing
    several search terms contributes its box once per matching term.
    """
    # First gather the indices of every (term, text) substring match
    matching_indices = [
        text_index
        for term in search_term_list
        for text_index, text in enumerate(detected_text_list)
        if term in text
    ]
    # Then map each index to its bounding box
    return [detected_bboxes_list[text_index] for text_index in matching_indices]
def check_text_similarity(text_1: str, text_2: str, similarity_threshold: float = 70) -> str:
    """Return '✅' when two texts are similar enough, otherwise '❌'.

    Similarity is the score from ``fuzz.ratio``; the texts count as similar
    when that score is greater than or equal to ``similarity_threshold``.
    """
    is_similar = fuzz.ratio(text_1, text_2) >= similarity_threshold
    return '✅' if is_similar else '❌'
def convert_date_format(date_string: str, desired_format: str) -> str:
    """Re-format a 'YYYY-MM-DD' date string.

    ``desired_format`` is a pattern using the placeholders 'yyyy', 'mm' and
    'dd' (for example 'dd/mm/yyyy'); every other character is kept as is.
    """
    # Translate the placeholders into strftime directives, in this order
    placeholder_directives = (('yyyy', '%Y'), ('mm', '%m'), ('dd', '%d'))
    strftime_pattern = desired_format
    for placeholder, directive in placeholder_directives:
        strftime_pattern = strftime_pattern.replace(placeholder, directive)
    parsed_date = datetime.strptime(date_string, '%Y-%m-%d')
    return parsed_date.strftime(strftime_pattern)
def get_list_unique_elements(elements_list: list) -> list:
    """Return the distinct elements of a list, keeping first-seen order.

    Elements are compared by equality through list membership, so unhashable
    elements such as lists are supported.
    """
    seen = []
    for item in elements_list:
        if item in seen:
            continue
        seen.append(item)
    return seen
def delete_files_by_session_id(session_id: str, folder_path: str):
    """Delete the files in a folder whose name contains ``session_id``.

    Matching is a plain substring test on the file name, so a session id that
    is a substring of another session's id also matches that session's files.
    Directories are never removed.

    Args:
        session_id: Text that must occur in the file name. An empty value is
            ignored, because the empty string occurs in every name and would
            otherwise delete every file in the folder.
        folder_path: Folder whose direct entries are examined.
    """
    if not session_id:
        return
    for filename in listdir(folder_path):
        if session_id not in filename:
            continue
        file_path = join(folder_path, filename)
        # Only regular files are removed; sub-folders are left alone
        if isfile(file_path):
            remove(file_path)
def validate_email(email_string: str) -> bool:
    """Check whether a string looks like an e-mail address.

    The expected shape is ``local@domain.tld`` where the local and domain
    parts consist of word characters, dots and hyphens and the final part
    consists of word characters.

    Args:
        email_string: The text to check.

    Returns:
        ``True`` when the whole string matches the pattern, else ``False``.
    """
    # \Z anchors at the very end of the string; '$' would also match just
    # before a trailing newline and accept values such as 'a@b.com\n'.
    pattern = r'^[\w\.-]+@[\w\.-]+\.\w+\Z'
    return match(pattern, email_string) is not None