In [5]:
from PIL import Image, ImageEnhance
import cv2
import numpy as np
from skimage import io
from skimage.transform import rotate
from skimage.color import rgb2gray
from deskew import determine_skew
from matplotlib import pyplot as plt
from pdf2image import convert_from_path
import shutil
import glob
import os

def set_dpi(image, preproc_image):
    im = Image.open(image)
    im.save(preproc_image, dpi=(600,600))
    return preproc_image

def increase_contrast(image, preproc_image):
    im = Image.open(image)
    enhancer = ImageEnhance.Contrast(im)
    factor = 2 #increase contrast
    im_output = enhancer.enhance(factor)
    im_output.save(preproc_image)
    return preproc_image

def increase_sharpness(image, preproc_image):
    im = Image.open(image)
    enhancer = ImageEnhance.Sharpness(im)
    factor = 2
    im_output = enhancer.enhance(factor)
    im_output.save(preproc_image)
    return preproc_image

def convert_to_grayscale(image, preproc_image):
    im = cv2.imread(image)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    cv2.imwrite(preproc_image, im)
    return preproc_image

def convert_to_rgb(image, preproc_image):
    im = cv2.imread(image)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    cv2.imwrite(preproc_image, im)
    return preproc_image

def otsu_threshold(image, preproc_image):
    im = cv2.imread(image)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    # im = cv2.threshold(im, 120, 255, cv2.THRESH_BINARY, cv2.THRESH_OTSU)[1]
    im = cv2.adaptiveThreshold(im,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
    cv2.imwrite(preproc_image, im)
    return preproc_image

def remove_noise(image, preproc_image):
    im = cv2.imread(image)
    im = cv2.fastNlMeansDenoising(im, None, 20, 7, 21)
    cv2.imwrite(preproc_image, im)
    return preproc_image

def deskew(image):
    im = io.imread(image)
    angle = determine_skew(im)
    rotated = rotate(im, angle, resize=True) * 255
    io.imsave(image, rotated.astype(np.uint8))
    return image

def pdf_to_images(pdf, filename):
    pdf = pdf.replace('\\', '\\\\')
    # print(pdf)
    images = convert_from_path(pdf)
    for i in range(len(images)):
        images[i].save('images2\\{}_page{}'.format(filename, i) +'.jpg', 'JPEG')

combinations = ['set_dpi-convert_to_rgb']
# combinations = ['set_dpi-increase_contrast-increase_sharpness']

for image in glob.glob('hotel_stay_images4\\*'):
    # filename = os.path.split(image)[-1].split('.jpg')[0]
    # print(filename)
    # preproc_image = 'preprocessed_ensemble\\'+filename+'.jpg'
    # shutil.copy(image, preproc_image)
    for combination in combinations:
        filename = os.path.split(image)[-1].split('.jpg')[0]
        preproc_image = 'hotel_stay_preprocessed4\\'+filename+'.jpg'
        shutil.copy(image, preproc_image)
        methods = combination.split('-')
        for method in methods:
            if 'deskew' in method:
                preproc_image = eval(method+'(preproc_image)')
            else:
                preproc_image = eval(method+'(preproc_image, preproc_image)')
        # print(combination)
        image_name = 'hotel_stay_preprocessed4\\'+filename+'_'+combination+'.jpg'
        img = cv2.imread(preproc_image)
        cv2.imwrite(image_name, img)

        psm_values = [3, 4]

        for psm in psm_values:

            parsed_output = 'hotel_stay_preprocessed4\\'+filename+'_'+combination+'_'+str(psm)
            command = 'tesseract {} {} -l eng --psm {}'.format(preproc_image, parsed_output, psm)
            os.system(command)

In [24]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
import glob
from prettyprinter import pprint
import os
import re

# file = "C:\\Users\\Lenovo\\Desktop\\Oaknorth\\working\\xebia_1_page_3.jpg"
# k = 1
# for file in glob.glob('C:\\Users\\Lenovo\\Desktop\\delimiter_with_box\\bank_statements\\*'):
#     print(file)



# filename = img_name+'_delimiters.jpg'
file = 'C:\\air_ticket\\preprocessed2\\Yatra_AAAIN233681309_05.12.2022_page0.jpg'
# file = 'C:\\Users\\Lenovo\\Desktop\\Oaknorth\\working\\amplexor_1_page_5.jpg'

img_name = os.path.split(file)[-1].split('.')[0]

table_image_contour = cv2.imread(file, 0)
table_image = cv2.imread(file)

ret, thresh_value = cv2.threshold(
    table_image_contour, 180, 255, cv2.THRESH_BINARY_INV)

kernel = np.ones((5,5),np.uint8)
dilated_value = cv2.dilate(thresh_value,kernel,iterations = 1)

contours, hierarchy = cv2.findContours(
    dilated_value, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

coords = []
new_contours = []
# max_height = -1

for cnt in contours:
    x, y, w, h = cv2.boundingRect(cnt)
    # print(w*h)
    # bounding the images
    # if h > max_height:
    #     max_height = h
    if w*h > 150:
        new_contours.append(cnt)
        coords.append([x,y,x+w,y+h])
        table_image = cv2.rectangle(table_image, (x, y), (x + w, y + h), (0, 0, 0), 1)
cv2.imwrite('samples\\{}.jpg'.format(img_name), table_image)
# k += 1
# print(k)

# Calculate maximum rectangle height
c = np.array(contours)
# co = list(c)
# pprint(co)

# max_height = np.max(c[::, 3])

max_height = 10
# Sort the contours by y-value
by_y = sorted(coords, key=lambda x: x[1])  # y values

line_y = by_y[0][1]       # first y
line = 1
by_line = []

# Assign a line number to each contour
for x, y, w, h in by_y:
    if y > line_y + max_height:
        line_y = y
        line += 1
        
    by_line.append([line, x, y, w, h])
# pprint(by_line)

img = cv2.imread(file)

# This will now sort automatically by line then by x
contours_sorted = [[line, x, y, w, h] for line, x, y, w, h in sorted(by_line)]
# pprint(contours_sorted)
# for x, y, w, h in contours_sorted:
    # print(x,y,x+w,y+h)
    # print(f"{x:4} {y:4} {w:4} {h:4}")
start = 0
# column_names = ['date description']


# pprint(contours_sorted)
# 32, 504, 734, 561

# roi = img[483:521, 31:2360]
# roi = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
# cv2.imwrite('roi.jpg', roi)
# cv2.imshow('roi', roi)
# cv2.waitKey()

grouped_contours = []
subgroup = []

subgroup.append(contours_sorted[0])

for i in range(len(contours_sorted)-1):
    if contours_sorted[i][0] == contours_sorted[i+1][0]:
        subgroup.append(contours_sorted[i])
    else:
        subgroup.append(contours_sorted[i])
        grouped_contours.append(subgroup)
        subgroup = []

# pprint(grouped_contours)

for subgroup in grouped_contours:

    ## error handling if -+5 coordinates goes out of scope of image
    try:
        x1 = subgroup[0][1]-5
        y1 = subgroup[0][2]-5
        x2 = subgroup[-1][3]+5
        y2 = subgroup[-1][4]+5

        roi = img[y1:y2, x1:x2]
        roi = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
        cv2.imwrite('roi.jpg', roi)
    
    except:
        x1 = subgroup[0][1]
        y1 = subgroup[0][2]
        x2 = subgroup[-1][3]
        y2 = subgroup[-1][4]

        roi = img[y1:y2, x1:x2]
        roi = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
        cv2.imwrite('roi.jpg', roi)

    command = 'tesseract roi.jpg output -l eng --psm 4'
    os.system(command)
    text = ''
    with open('output.txt', encoding='utf-8') as f:
        text = f.read()
    # print(text.lower())
    # print(re.search('.*date.*balance.*', text.lower()))
    if re.search('.*pax.*sector.*', text.lower()) or re.search('.*date.*description.*', text.lower()):
        line = subgroup[0][0]
        # print(text)
        break

# pprint(contours_sorted)

flag = 0
req = []

for i in range(start, len(contours_sorted)-1):
    if contours_sorted[i][0] >= line:
        if flag == 0:
            print(contours_sorted[i])
        flag = 1

        # print(contours_sorted[i+1][0]-contours_sorted[i][2])
    if contours_sorted[i+1][1] - contours_sorted[i][3] > 10 and contours_sorted[i][0] == contours_sorted[i+1][0]:
        # cv2.putText(img, "|", ((contours_sorted[i][3]+contours_sorted[i+1][1])//2, (contours_sorted[i][2]+contours_sorted[i][4])//2), cv2.FONT_HERSHEY_SIMPLEX,  
                                    # 1, (0, 0, 0), 1, cv2.LINE_AA)
        img = cv2.line(img, ((contours_sorted[i][3]+contours_sorted[i+1][1])//2, contours_sorted[i][2]+1), ((contours_sorted[i][3]+contours_sorted[i+1][1])//2, contours_sorted[i][4]-1), (0,0,0), 2)
        cv2.imwrite('sample.jpg', img)

img = cv2.imread('sample.jpg')
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
cv2.imwrite('sample.jpg', img)

command = 'tesseract sample.jpg stdout -l eng --psm 4'
print(os.popen(command).read())
# pprint(req)

  c = np.array(contours)


[36, 125, 1232, 167, 1253]
Invoice Cat H
Original for recipient invoice Ca ~~
a Recess

re

yetyo : GSTIN: 36AAACA0313P2ZV
" bi ,

for Business

PAN Card No. | AAACA0313P
Yatra for Business Private Limited

Corporate Identity No. :

301] Dev Dhanuka Prestige, Plot No -8, U72900DL1962PTC003735

Road No -12, Banjara Hills,

HSN } 998551 |
HYDERABAD-500034, TELANGANA,

INDIA

Entity : COMAKEIT SOFTWARE PVT LTD
Address : PHASE 4 III,ROAD NO. 92 JUBILEE HILLS,PLOT NO. 564/A39,, HYDERABAD-500033,Telangana
State Code :36 | Place of Supply | : Telangana

GSTIN : 36AACCC8032H1ZT | PAN Number | : AACCC8032H
BookingDetails:

Booking Ref. : CT6587937 Booking Date : : 05 Dec 2022

Invoice No. : : AAAIN233681309 Invoice Date : : 05 DEC 2022

Booking Type : Official

Trip Id : TA0512220104742 Im Number

d50ae475963b51a381607bffcdf7b
bc82f7e3edefbef5a73ec2d420274f
b357e

Pax Name Sector Travel Date | Ticket Number | PNR No.
Mr steven Ten | HYD-MAA 09 Dec 2022 YNCBKS YNCBKS.

Airline GST (collected on 

for Business

tye oe
Bete ay GSTIN: 36AAACA0313P2ZV
raat ieee

PAN Card No. | AAACA0313P
Yatra for Business Private Limited

Corporate Identity No. :

301) Dev Dhanuka Prestige, Plot No -8, U72900DL1962PTC003735

Road No -12, Banjara Hills,

HSN | 998551 |
HYDERABAD-500034, TELANGANA,

INDIA

Entity : COMAKEIT SOFTWARE PVT LTD
Address : PHASE J III,ROAD NO. 92JUBILEE HILLS,PLOT NO. 564/A39,, HYDERABAD-500033, Telangana
State Code 136 | Place of Supply | : Telangana

GSTIN : 86AACCC8032H1ZT PAN Number | : AACCC8032H
BookingDetails:

Booking Ref. : CT5733274 Booking Date : : 25 Jun 2022

Invoice No. : : AAAIN233671639 Invoice Date : : 25 JUN 2022

Booking Type : Official

Trip Id : TA2506220022316 Im Number

9c1081e235d17d7ca0b54aadf52cd
4d791bef8d9f4a181222d5aa7b416
d3b2be

Pax Name Sector Travel Date Ticket Number | PNR No.
MRRANJEETH | HYD-DXB 02 Jul 2022 5253255420 | JAS282 134139.00
DIDIGUM(ADT) | â€” (EK-527)

DXB-AMS 02 Jul 2022

(EK-149)

AMS-DXB 30 Jul 2022

(EK-148)

DXB-HYD 31

In [15]:
def extract_name(text):
    for line in text:
        if 'name' in line.lower():
            if 'pax' in line.lower():
                g

for file in glob.glob('preprocessed2\\*.txt'):
    with open(file, 'r', encoding='utf-8') as fp:
        text = fp.read()
    text = text.splitlines()
    text = [line for line in text if not line.isspace() and len(line) > 0]
    filename = os.path.split(file)[-1].split('.txt')[0]
    # print(filename)

    name = extract_name(text)

Pax Name Sector Travel Date Ticket Number PNR No.
Pax Name Sector Travel Date Ticket Number PNR No. a
Pax Name Sector Travel Date Ticket Number PNR No. Fare
Pax Name Sector Travel Date
Name - MR KIRTY RAJA KIRAN MADHUNAPANTULA (ADT
Pax Name Sector Travel Date Ticket Number PNR No. Fare
Name - MR KIRTY RAJA KIRAN MADHUNAPANTULA (ADT
Pax Name Sector Travel Date Ticket Number PNR No. Fad
Name - MR PURUSHOTHAMA GOPI KRISHNAM RAJU GOKARAJU (ADT
Pax Name Sector Travel Date Ticket Number PNR No. Fare
Name - Mr STEVEN TEN NAPEL (ADT
Pax Name Sector Travel Date Ticket Number PNR No. Fare
Name - MR _KIRTY RAJA KIRAN MADHUNAPANTULA (ADT)
Pax Name Sector Travel Date Ticket Number PNR No. Fare
Name: Mr Praveen Madala
Pax Name Sector Travel Date Ticket No. PNR No. Fare
Name: Mr Varun Reddy ReddyRedd
Pax Name Sector Travel Date Ticket No. PNR No. Fare
Name: Mr Mahendar Myakala
Pax Name Sector Travel Date Ticket No. PNR No. Fare
Pax Name Sector Travel Date Ticket No. PNR No. Fare
Pax Name Sector Trave

In [None]:
from prettyprinter import pprint
import glob
import airportsdata


def extract_entity(text):
    flag = 0
    for line in text:
        if 'entity' in line.lower():
            flag = 1
            if len(line.split()) > 1:
                return 1
            else:
                return 0
    if flag == 0:
        return 0

def extract_name(text):
    flag = 0
    for line in text:
        if 'name' in line.lower():
            if 'sector' in line.lower():
                return 0
            if len(line.split()) > 2:
                # print(line)
                return 1
            else:
                return 0
    if flag == 0:
        return 0

def extract_start_date(text):
    flag = 0
    for line in text:
        if 'start' in line.lower() or 'travel date' in line.lower():
            flag = 1
        if flag == 1:
            return 1
    if flag == 0:
        return 0

def extract_end_date(text):
    flag = 0
    for line in text:
        if 'end' in line.lower():
            flag = 1
        
        if flag == 1:
            return 1
        # if 'end' is not present in line,
        # it means the end date is not present in the document only,
        # so it should be considered as accurate
        else:
            return 1

def extract_distance(text):
    flag = 0
    for line in text:
        if 'distance' in line.lower():
            flag = 1
        
        if flag == 1:
            return 1

    if flag == 0:
        return 0

def extract_distance_unit(text):
    dis = extract_distance(text)
    if dis:
        return 1

    else:
        return 0

def extract_source_city(text):
    airports = airportsdata.load('IATA')
    flag = 0
    for i in range(len(text)-1):
        if 'sector' in text[i].lower():
            for city_code in airports.keys():
                if city_code.lower() in text[i+1].lower():
                    flag = 1
                    return 1
            if flag == 0:
                return 0
        elif 'city' in text[i].lower():
            return 1
    if flag == 0:
        return 0

def extract_destination_city(text):
    airports = airportsdata.load('IATA')
    flag = 0
    for i in range(len(text)-1):
        if 'sector' in text[i].lower():
            for city_code in airports.keys():
                if city_code.lower() in text[i+1].lower():
                    flag = 1
                    return 1
            if flag == 0:
                return 0
    if flag == 0:
        return 0

def extract_source_country(text):
    source_city = extract_source_city(text)
    if source_city:
        return 1
    else:
        return 0

def extract_destination_country(text):
    destination_city = extract_destination_city(text)
    if destination_city:
        return 1
    else:
        return 0

matrix = {}
for file in glob.glob('preprocessed3\\*.txt'):
    with open(file, 'r', encoding='utf-8') as fp:
        text = fp.read()
    text = text.splitlines()
    text = [line for line in text if not line.isspace() and len(line) > 0]
    filename = os.path.split(file)[-1].split('.txt')[0]

    # flag = 0
    # for line in text:
    #     if 'product' in line.lower():
    #         flag = 1

    # if flag == 1:
    #     continue
    
    # print(file)
    matrix[filename] = []
    matrix[filename].append(extract_entity(text))
    matrix[filename].append(extract_name(text))
    matrix[filename].append(extract_start_date(text))
    matrix[filename].append(extract_end_date(text))
    matrix[filename].append(extract_distance(text))
    matrix[filename].append(extract_distance_unit(text))
    matrix[filename].append(extract_source_city(text))
    matrix[filename].append(extract_destination_city(text))
    matrix[filename].append(extract_source_country(text))
    matrix[filename].append(extract_destination_country(text))

    # pprint(text)
pprint(matrix)

In [11]:
name = []
for value in matrix.values():
    name.append(value[0])

print(name.count(1)/len(name))

0.7048192771084337


In [3]:
from PIL import Image, ImageEnhance
import cv2
import numpy as np
from skimage import io
from skimage.transform import rotate
from skimage.color import rgb2gray
from deskew import determine_skew
from matplotlib import pyplot as plt
from pdf2image import convert_from_path
import shutil
import glob

def set_dpi(image, preproc_image):
    im = Image.open(image)
    im.save(preproc_image, dpi=(300,300))
    return preproc_image

def increase_contrast(image, preproc_image):
    im = Image.open(image)
    enhancer = ImageEnhance.Contrast(im)
    factor = 2 #increase contrast
    im_output = enhancer.enhance(factor)
    im_output.save(preproc_image)
    return preproc_image

def increase_sharpness(image, preproc_image):
    im = Image.open(image)
    enhancer = ImageEnhance.Sharpness(im)
    factor = 2
    im_output = enhancer.enhance(factor)
    im_output.save(preproc_image)
    return preproc_image

def convert_to_grayscale(image, preproc_image):
    im = cv2.imread(image)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    cv2.imwrite(preproc_image, im)
    return preproc_image

def convert_to_rgb(image, preproc_image):
    im = cv2.imread(image)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    cv2.imwrite(preproc_image, im)
    return preproc_image

def otsu_threshold(image, preproc_image):
    im = cv2.imread(image)
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    # im = cv2.threshold(im, 120, 255, cv2.THRESH_BINARY, cv2.THRESH_OTSU)[1]
    im = cv2.adaptiveThreshold(im,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
    cv2.imwrite(preproc_image, im)
    return preproc_image

def remove_noise(image, preproc_image):
    im = cv2.imread(image)
    im = cv2.fastNlMeansDenoising(im, None, 20, 7, 21)
    cv2.imwrite(preproc_image, im)
    return preproc_image

def deskew(image):
    im = io.imread(image)
    angle = determine_skew(im)
    rotated = rotate(im, angle, resize=True) * 255
    io.imsave(image, rotated.astype(np.uint8))
    return image

def pdf_to_images(pdf, filename):
    pdf = pdf.replace('\\', '\\\\')
    # print(pdf)
    images = convert_from_path(pdf)
    for i in range(len(images)):
        images[i].save('images2\\{}_page{}'.format(filename, i) +'.jpg', 'JPEG')

Writing preprocessing_methods.py


In [4]:
import os

path ="C:\\Users\\Lenovo\\Downloads\\NetZero\\NetZero\\Business Travel Emission Management"

#store all the file names in this list
filelist = []

for root, dirs, files in os.walk(path):
	for file in files:
        #append the file name to the list
		filelist.append(os.path.join(root,file))


# print all the file names
for file in filelist:
    filename = os.path.split(file)[-1].split('.pdf')[0]
    if ' ' in filename:
        filename = filename.replace(' ', '_')
    # print(filename)
    pdf_to_images(file, filename)

Writing pdf_to_images.py


In [5]:
# combinations = ['set_dpi-convert_to_grayscale', 'set_dpi-increase_contrast-convert_to_grayscale', 'set_dpi-convert_to_grayscale-remove_noise', 'set_dpi-increase_contrast-convert_to_grayscale-remove_noise', 'set_dpi-convert_to_grayscale-deskew', 'set_dpi-increase_contrast-convert_to_grayscale-deskew', 'set_dpi-convert_to_grayscale-remove_noise-deskew', 'set_dpi-increase_contrast-convert_to_grayscale-remove_noise-deskew']
combinations = ['set_dpi-increase_contrast-increase_sharpness-remove_noise']

for image in glob.glob('images2\\*'):
    filename = os.path.split(image)[-1].split('.jpg')[0]
    preproc_image = 'preprocessed2\\'+filename+'.jpg'
    shutil.copy(image, preproc_image)
    for combination in combinations:
        methods = combination.split('-')
        for method in methods:
            if 'deskew' in method:
                preproc_image = eval(method+'(preproc_image)')
            else:
                preproc_image = eval(method+'(preproc_image, preproc_image)')
        # print(combination)
        image_name = filename+'_'+combination+'.jpg'
        img = cv2.imread(preproc_image)
        cv2.imwrite(preproc_image, img)

        parsed_output = 'preprocessed2\\'+filename+'_'+combination
        command = 'tesseract {} {} -l eng --psm 4'.format(preproc_image, parsed_output)
        os.system(command)

Writing apply_preprocessing.py


In [66]:
count = 0
output = []
for file in glob.glob('preprocessed_images\\*'):
    count += 1
    output.append(file)
# print(output)

print(count)

# preproc = []
# for file in glob.glob('preprocessed_images\\*'):
#     preproc.append(file)

# for i in range(len(output)):
#     print(output[i]+'---->'+preproc[i])


154


In [26]:

from prettyprinter import pprint
import glob
import airportsdata
import pandas as pd

def extract_entity(text):
    flag = 0
    for line in text:
        if 'entity' in line.lower():
            flag = 1
            if len(line.split()) > 1:
                return 1
            else:
                return 0
    if flag == 0:
        return 0

def extract_name(text):
    flag = 0
    for line in text:
        if 'name' in line.lower():
            if 'sector' in line.lower():
                return 0
            if len(line.split()) > 2:
                print(line)
                return 1
            else:
                return 0
    if flag == 0:
        return 0

def extract_start_date(text):
    flag = 0
    for line in text:
        if 'start' in line.lower() or 'travel date' in line.lower():
            flag = 1
        if flag == 1:
            return 1
    if flag == 0:
        return 0

def extract_end_date(text):
    flag = 0
    for line in text:
        if 'end' in line.lower():
            flag = 1
        
        if flag == 1:
            return 1
        # if 'end' is not present in line,
        # it means the end date is not present in the document only,
        # so it should be considered as accurate
        else:
            return 1

def extract_distance(text):
    flag = 0
    for line in text:
        if 'distance' in line.lower():
            flag = 1
        
        if flag == 1:
            return 1

    if flag == 0:
        return 0

def extract_distance_unit(text):
    dis = extract_distance(text)
    if dis:
        return 1

    else:
        return 0

def extract_source_city(text):
    airports = airportsdata.load('IATA')
    flag = 0
    for i in range(len(text)-1):
        if 'sector' in text[i].lower():
            for city_code in airports.keys():
                if city_code.lower() in text[i+1].lower():
                    flag = 1
                    return 1
            if flag == 0:
                return 0
        elif 'city' in text[i].lower():
            return 1
    if flag == 0:
        return 0

def extract_destination_city(text):
    airports = airportsdata.load('IATA')
    flag = 0
    for i in range(len(text)-1):
        if 'sector' in text[i].lower():
            for city_code in airports.keys():
                if city_code.lower() in text[i+1].lower():
                    flag = 1
                    return 1
            if flag == 0:
                return 0
    if flag == 0:
        return 0

def extract_source_country(text):
    source_city = extract_source_city(text)
    if source_city:
        return 1
    else:
        return 0

def extract_destination_country(text):
    destination_city = extract_destination_city(text)
    if destination_city:
        return 1
    else:
        return 0

matrix = {}
for file in glob.glob('preprocessed2\\*.txt'):
    with open(file, 'r', encoding='utf-8') as fp:
        text = fp.read()
    text = text.splitlines()
    text = [line for line in text if not line.isspace() and len(line) > 0]
    filename = os.path.split(file)[-1].split('.txt')[0]

    flag = 0
    for line in text:
        if 'product' in line.lower():
            flag = 1

    if flag == 1:
        continue
    
    print(file)
    matrix[filename] = []
    matrix[filename].append(extract_entity(text))
    matrix[filename].append(extract_name(text))
    matrix[filename].append(extract_start_date(text))
    matrix[filename].append(extract_end_date(text))
    matrix[filename].append(extract_distance(text))
    matrix[filename].append(extract_distance_unit(text))
    matrix[filename].append(extract_source_city(text))
    matrix[filename].append(extract_destination_city(text))
    matrix[filename].append(extract_source_country(text))
    matrix[filename].append(extract_destination_country(text))

    # pprint(text)
pprint(matrix)

keys = ['office_name', 'traveler_name', 'start_date', 'end_date', 'distance', 'distance_unit', 'source_city', 'source_country', 'destination_city', 'destination_country']
# print(len(keys))

df = pd.DataFrame(matrix)
df.index = keys
df.to_csv('key-value.csv')
print(df)

preprocessed2\113_Yatra_AAAIN233676709_27.09.2022_page0_set_dpi-increase_contrast-increase_sharpness-remove_noise.txt
preprocessed2\114_Yatra__AAAIN233676710_27.09.2022_page0_set_dpi-increase_contrast-increase_sharpness-remove_noise.txt
preprocessed2\118_Yatra_AAAIN233676717_27.09.2022_page0_set_dpi-increase_contrast-increase_sharpness-remove_noise.txt
preprocessed2\119_Yatra_AAAIN233676781_28.09.2022_page0_set_dpi-increase_contrast-increase_sharpness-remove_noise.txt
preprocessed2\71_Yatra_AAAIN233675430_08.09.2022_page0_set_dpi-increase_contrast-increase_sharpness-remove_noise.txt
Name - MR KIRTY RAJA KIRAN MADHUNAPANTULA (ADT
preprocessed2\73_Yatra_AAAIN233675514_09.09.2022_page0_set_dpi-increase_contrast-increase_sharpness-remove_noise.txt
Name - MR KIRTY RAJA KIRAN MADHUNAPANTULA (ADT
preprocessed2\74_Yatra_AAAIN233675535_10.09.2022_page0_set_dpi-increase_contrast-increase_sharpness-remove_noise.txt
Name - MR PURUSHOTHAMA GOPI KRISHNAM RAJU GOKARAJU (ADT
preprocessed2\75_Yatra_AAA

In [None]:


keys = ['office_name', 'traveler_name', 'start_date', 'end_date', 'distance', 'distance_unit', 'source_city', 'source_country', 'destination_city', 'destination_country']
# print(len(keys))

df = pd.DataFrame(matrix)
df.index = keys
df.to_csv('key-value.csv')
print(df)

In [None]:
for values in matrix.values():
    print(values[0])

In [None]:
for file in glob.glob('preprocessed2\\*.txt'):

    with open(file, 'r', encoding='utf-8') as fp:
        text = fp.read()
    text = text.splitlines()
    text = [line for line in text if not line.isspace() and len(line) > 0]
    # pprint(text)

    for line in text:
        if 'product' in line.lower():
            print(file)

In [None]:
import pandas as pd

df = pd.DataFrame(matrix)
df.index = keys
df.to_csv('key-value.csv')
print(df)

In [81]:
f1 = [0,0,1,1,0,1,1,1,0,1]
f2 = [1,1,1,1,0,0,1,1,0,1]
f3 = [0,1,0,1,0,1,1,0,0,1]
f4 = [1,1,1,1,0,1,1,0,0,1]
f5 = [0,1,1,1,1,1,1,1,0,1]

In [88]:
import pandas as pd

dictionary = {
    'file1': f1,
    'file2': f2,
    'file3': f3,
    'file4': f4,
    'file5': f5
}

df = pd.DataFrame(dictionary)
df.index = keys
df.to_csv('key-value.csv')
print(df)

                     file1  file2  file3  file4  file5
office_name              0      1      0      1      0
traveler_name            0      1      1      1      1
start_date               1      1      0      1      1
end_date                 1      1      1      1      1
distance                 0      0      0      0      1
distance_unit            1      0      1      1      1
source_city              1      1      1      1      1
source_country           1      1      0      0      1
destination_city         0      0      0      0      0
destination_country      1      1      1      1      1


In [85]:
df.index = list("ABCDEFGHIJ")

In [86]:
df

Unnamed: 0,file1,file2,file3,file4,file5
A,0,1,0,1,0
B,0,1,1,1,1
C,1,1,0,1,1
D,1,1,1,1,1
E,0,0,0,0,1
F,1,0,1,1,1
G,1,1,1,1,1
H,1,1,0,0,1
I,0,0,0,0,0
J,1,1,1,1,1


In [7]:
import airportsdata
from prettyprinter import pprint

airports = airportsdata.load('IATA')
pprint(airports['DEL'])

{
    'icao': 'VIDP',
    'iata': 'DEL',
    'name': 'Indira Gandhi International Airport',
    'city': 'New Delhi',
    'subd': 'NCT',
    'country': 'IN',
    'elevation': 777.0,
    'lat': 28.5664997101,
    'lon': 77.1031036377,
    'tz': 'Asia/Kolkata',
    'lid': ''
}


In [19]:
from pyairports.airports import Airports

airports = Airports()
airports.airport_iata(iata)

pprint(airports)

NameError: name 'iata' is not defined

In [20]:
import cv2
import os
import re

data = []

def preprocess(image):
    img = cv2.imread(image)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    cv2.imwrite('new.jpg', img)
    return 'new.jpg'

def get_box_file(image, output_path):
    command = 'tesseract {} {} batch.nochop makebox'.format(image, output_path)
    os.system(command)

def get_parsed_output(image, output_file):
    command = 'tesseract {} {} -l eng --psm 4'.format(image, output_file)
    os.system(command)

def store_separate_lines(box_file):
    '''
    Method to separate lines along with box cooridnates
    into nested list.
    '''

    text = ''
    with open(box_file, 'r', encoding='utf-8') as fp:
        text = fp.read()

    text = text.splitlines()

    ch_coords = []

    for ch in text:
        ch_split = ch.split()
        ch_coords.append(ch_split)

    lines = []
    line = []
    max_diff = [1]

    for i in range(len(ch_coords)-1):
        c = ch_coords[i][0]

        if int(ch_coords[i][1]) <= int(ch_coords[i+1][1]) or int(ch_coords[i][1]) - int(ch_coords[i+1][1]) in max_diff:
            line.append(ch_coords[i])
        else:
            line.append(ch_coords[i])
            lines.append(line)
            line = []

    return lines

def add_delimiters(image, lines):
    img = cv2.imread(image)
    ih, iw, _ = img.shape
    flag = 0
    columns = ['paxnamesector']

    reference_pts = []

    for line in lines:
        newline = ''.join([line[i][0] for i in range(len(line))])
        # print(newline)
        for column in columns:
            # print(column)
            if column in newline.lower():
                # print(newline)
                flag = 1
            if flag == 1:
                for i in range(len(line)-1):
                    if int(line[i+1][1])-int(line[i][1]) > 55:
                        # cv2.putText(img, "|", ((int(line[i][3])+int(line[i+1][1]))//2, ih-(int(line[i][2]))), cv2.FONT_HERSHEY_SIMPLEX,  
                                                        # 1, (0, 0, 0), 3, cv2.LINE_AA)
                        img = cv2.line(img, ((int(line[i][3])+int(line[i+1][1]))//2, ih-(int(line[i][2]))+3), ((int(line[i][3])+int(line[i+1][1]))//2, ih-(int(line[i][4]))-3), (0,0,0), 2)
                        cv2.imwrite('new.jpg', img)
                        reference_pts.append(line[i][1])
                        
    return 'new.jpg'

def convert_text_to_separate_lines(output_file):
    '''
    Method to store text into lists of lines where each list has
    strings that are segregated by the delimiters.
    '''

    text = ''
    with open(output_file+'.txt', 'r', encoding='utf-8') as fp:
        text = fp.read()

    text = text.splitlines()

    spaces = ['', ' ']
    text = [st for st in text if st not in spaces]

    # pprint(text)

    i = 0
    for line in text:
        if re.search('.*pax.*sector.*', line.lower()):
            text = text[i:]
            break
        i += 1

    for j in range(len(text)):
        if '_' in text[j]:
            text[j] = text[j].replace('_', ' ')
        text[j] = text[j].split(' | ')
    
    return text

def get_table(lines):
    '''
    Method to get table coordinates where the rows are divided into separate lists as per the strings' respective columns
    '''
    i = 0
    for line in lines:
        i += 1
        newline = ''.join([line[i][0] for i in range(len(line))])
        # print(newline)
        if re.search('.*pax.*sector.*', newline.lower()):
            column_names = newline.lower().split('|')
            break
            # print(i)

    table = []
    table = lines[i-1:]
    extras = ['~', '_']

    # pprint(table)
    # k = 0

    for i in range(len(table)):
        table[i] = [ch for ch in table[i] if ch[0] not in extras]

    # pprint(table)

    cols = []
    col = []

    for t in table:
        row = []
        for ch in t:
            col.append(ch)
            if ch[0] == '|' or ch == t[-1]:
                if ch[0] == '|':
                    col.remove(ch)
                row.append(col)
                col = []
        cols.append(row)
    
    pprint(cols)
    
    return cols

def merge_rows(column_coords, columns, line_items, line_item_coords, index):
    row = {}
    
    for col in column_coords:
        flag = 0
        for word in line_item_coords:
            if abs(int(col[0][1])-int(word[0][1])) <= 5:
                # print(columns[column_coords.index(col)]+'---->'+line_items[index][line_item_coords.index(word)])
                # print(line_items[index][line_item_coords.index(word)])
                # data.append({columns[column_coords.index(col)]: line_items[index][line_item_coords.index(word)]})
                flag = 1
                row[columns[column_coords.index(col)]] = line_items[index][line_item_coords.index(word)]
                # break
            elif abs(int(col[-1][1])-int(word[-1][1])) <= 20:
                # print(line_items[index][line_item_coords.index(word)])
                # print(columns[column_coords.index(col)]+'---->'+line_items[index][line_item_coords.index(word)])
                # data.append({columns[column_coords.index(col)]: line_items[index][line_item_coords.index(word)]})
                flag = 1
                row[columns[column_coords.index(col)]] = line_items[index][line_item_coords.index(word)]
            elif flag == 0:
                row[columns[column_coords.index(col)]] = None
                
    data.append(row)
    return data

def split_filename(image):
    filename = os.path.split(image)[-1].split('.')[0]
    return filename

import cv2
import os
import re

data = []

def preprocess(image):
    img = cv2.imread(image)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    cv2.imwrite('new.jpg', img)
    return 'new.jpg'

def get_box_file(image, output_path):
    command = 'tesseract {} {} batch.nochop makebox'.format(image, output_path)
    os.system(command)

def get_parsed_output(image, output_file):
    command = 'tesseract {} {} -l eng --psm 3'.format(image, output_file)
    os.system(command)

def store_separate_lines(box_file):
    '''
    Method to separate lines along with box cooridnates
    into nested list.
    '''

    text = ''
    with open(box_file, 'r', encoding='utf-8') as fp:
        text = fp.read()

    text = text.splitlines()

    ch_coords = []

    for ch in text:
        ch_split = ch.split()
        ch_coords.append(ch_split)

    lines = []
    line = []
    max_diff = [1]

    for i in range(len(ch_coords)-1):
        c = ch_coords[i][0]

        if int(ch_coords[i][1]) <= int(ch_coords[i+1][1]) or int(ch_coords[i][1]) - int(ch_coords[i+1][1]) in max_diff:
            line.append(ch_coords[i])
        else:
            line.append(ch_coords[i])
            lines.append(line)
            line = []

    return lines

def add_delimiters(image, lines):
    img = cv2.imread(image)
    ih, iw, _ = img.shape
    flag = 0
    columns = ['paxnamesector']

    reference_pts = []

    for line in lines:
        newline = ''.join([line[i][0] for i in range(len(line))])
        # print(newline)
        for column in columns:
            # print(column)
            if column in newline.lower():
                # print(newline)
                flag = 1
            if flag == 1:
                for i in range(len(line)-1):
                    if int(line[i+1][1])-int(line[i][1]) > 50:
                        # cv2.putText(img, "|", ((int(line[i][3])+int(line[i+1][1]))//2, ih-(int(line[i][2]))), cv2.FONT_HERSHEY_SIMPLEX,  
                                                        # 1, (0, 0, 0), 3, cv2.LINE_AA)
                        img = cv2.line(img, ((int(line[i][3])+int(line[i+1][1]))//2, ih-(int(line[i][2]))+3), ((int(line[i][3])+int(line[i+1][1]))//2, ih-(int(line[i][4]))-3), (0,0,0), 2)
                        cv2.imwrite('new.jpg', img)
                        reference_pts.append(line[i][1])
                        
    return 'new.jpg'

def convert_text_to_separate_lines(output_file):
    '''
    Method to store text into lists of lines where each list has
    strings that are segregated by the delimiters.
    '''

    text = ''
    with open(output_file+'.txt', 'r', encoding='utf-8') as fp:
        text = fp.read()

    text = text.splitlines()

    spaces = ['', ' ']
    text = [st for st in text if st not in spaces]

    # pprint(text)

    i = 0
    for line in text:
        if re.search('.*pax.*sector.*', line.lower()):
            text = text[i:]
            break
        i += 1

    for j in range(len(text)):
        if '_' in text[j]:
            text[j] = text[j].replace('_', ' ')
        text[j] = text[j].split(' | ')
    
    return text

def get_table(lines):
    '''
    Method to get table coordinates where the rows are divided into separate lists as per the strings' respective columns
    '''
    i = 0
    for line in lines:
        i += 1
        newline = ''.join([line[i][0] for i in range(len(line))])
        # print(newline)
        if re.search('.*pax.*sector.*', newline.lower()):
            column_names = newline.lower().split('|')
            break
            # print(i)

    table = []
    table = lines[i-1:]
    extras = ['~', '_']

    # pprint(table)
    # k = 0

    for i in range(len(table)):
        table[i] = [ch for ch in table[i] if ch[0] not in extras]

    # pprint(table)

    cols = []
    col = []

    for t in table:
        row = []
        for ch in t:
            col.append(ch)
            if ch[0] == '|' or ch == t[-1]:
                if ch[0] == '|':
                    col.remove(ch)
                row.append(col)
                col = []
        cols.append(row)
    
    # pprint(cols)
    
    return cols

def merge_rows(column_coords, columns, line_items, line_item_coords, index):
    row = {}
    
    for col in column_coords:
        flag = 0
        for word in line_item_coords:
            if abs(int(col[0][1])-int(word[0][1])) <= 5:
                print(columns[column_coords.index(col)]+'---->'+line_items[index][line_item_coords.index(word)])
                # print(line_items[index][line_item_coords.index(word)])
                # data.append({columns[column_coords.index(col)]: line_items[index][line_item_coords.index(word)]})
                flag = 1
                row[columns[column_coords.index(col)]] = line_items[index][line_item_coords.index(word)]
                # break
            elif abs(int(col[-1][1])-int(word[-1][1])) <= 20:
                # print(line_items[index][line_item_coords.index(word)])
                print(columns[column_coords.index(col)]+'---->'+line_items[index][line_item_coords.index(word)])
                # data.append({columns[column_coords.index(col)]: line_items[index][line_item_coords.index(word)]})
                flag = 1
                row[columns[column_coords.index(col)]] = line_items[index][line_item_coords.index(word)]
            elif flag == 0:
                row[columns[column_coords.index(col)]] = None
                
    data.append(row)
    return data

def split_filename(image):
    filename = os.path.split(image)[-1].split('.jpg')[0]
    return filename

In [22]:
from prettyprinter import pprint
import glob

if __name__ == '__main__':
    
    # image = 'C:\\air_ticket\\images2\\74_Yatra_AAAIN233675535_10.09.2022_page0.jpg'
    for image in glob.glob('C:\\air_ticket\\preprocessed2\\113_Yatra_AAAIN233676709_27.09.2022_page0.jpg'):
        filename = split_filename(image)

        # image = preprocess(image)

        output_path = 'box_files\\{}'.format(filename)
        get_box_file(image, output_path)

        box_file = 'box_files\\{}.box'.format(filename)
        lines = store_separate_lines(box_file)

        image = add_delimiters(image, lines)

        image = cv2.imread(image)
        cv2.imwrite('delimiters\\'+filename+'.jpg', image)

        delimiter_image = 'delimiters\\'+filename+'.jpg'

        output_file = 'delimiters\\{}_psm3'.format(filename)
        get_parsed_output(delimiter_image, output_file)

    # image = preprocess(image)

    # filename = split_filename(image)
    # output_path = 'box_files\\{}'.format(filename)
    # get_box_file(image, output_path)
    # box_file = 'box_files\\{}.box'.format(filename)
        lines = store_separate_lines(box_file)

    # output_file = 'parsed_output\\{}'.format(filename)
    # get_parsed_output(image, output_file)

        text = convert_text_to_separate_lines(output_file)

        cols = get_table(lines)
    ## pprint(cols)

        column_coords = cols[0]
        line_items_coords = cols[1:]

        columns = text[0]
        line_items = text[1:]

        for i in range(len(line_items_coords)):
            if len(columns) != len(line_items[i]):
                merge_rows(column_coords, columns, line_items, line_items_coords[i], i)
                # print('-------------------------')

    # pprint(data)  


Pax Name Sector Travel Date Ticket Number---->HYD-GOI 03Nov2022
Pax Name Sector Travel Date Ticket Number---->Kukkanti(ADT)


In [29]:
columns = ['PaxName', 'Sector', 'TravelDate', 'TicketNumber', ]

command = 'tesseract C:\\air_ticket\\images2\\71_Yatra_AAAIN233675430_08.09.2022_page0.jpg -l eng tsv'
os.system(command)

0

In [None]:
command = 'tesseract new.jpg stdout -l eng --psm 4'
print(os.popen(command).read())

In [26]:
## cut the image row-wise


import sys
import math
import cv2 as cv
import numpy as np

coords = []

def is_vertical(line):
    return line[0]==line[2]

def is_horizontal(line):
    return line[1]==line[3]
    
def overlapping_filter(lines, sorting_index):
    filtered_lines = []
    
    lines = sorted(lines, key=lambda lines: lines[sorting_index])
    
    for i in range(len(lines)):
            l_curr = lines[i]
            if(i>0):
                l_prev = lines[i-1]
                if ( (l_curr[sorting_index] - l_prev[sorting_index]) > 5):
                    filtered_lines.append(l_curr)
            else:
                filtered_lines.append(l_curr)
                
    return filtered_lines
               
def detect_lines(image, title='default', rho = 1, theta = np.pi/180, threshold = 50, minLinLength = 290, maxLineGap = 6, display = False, write = False):
    # Check if image is loaded fine
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    
    if gray is None:
        print ('Error opening image!')
        return -1
    
    dst = cv.Canny(gray, 50, 150, None, 3)
    
    # Copy edges to the images that will display the results in BGR
    cImage = np.copy(image)
    
    #linesP = cv.HoughLinesP(dst, 1 , np.pi / 180, 50, None, 290, 6)
    linesP = cv.HoughLinesP(dst, rho , theta, threshold, None, minLinLength, maxLineGap)
    
    horizontal_lines = []
    vertical_lines = []
    
    if linesP is not None:
        #for i in range(40, nb_lines):
        for i in range(0, len(linesP)):
            l = linesP[i][0]

            if (is_vertical(l)):
                vertical_lines.append(l)
                
            elif (is_horizontal(l)):
                horizontal_lines.append(l)
        
        horizontal_lines = overlapping_filter(horizontal_lines, 1)
        vertical_lines = overlapping_filter(vertical_lines, 0)
            
    if (display):
        for i, line in enumerate(horizontal_lines):
            print("Horizontal --> ", line)
            cv.line(cImage, (line[0], line[1]), (line[2], line[3]), (0,255,255), 3, cv.LINE_AA)
            
            # cv.putText(cImage, str(i) + "h", (line[0] + 5, line[1]), cv.FONT_HERSHEY_SIMPLEX,  
                       # 0.5, (0, 0, 0), 1, cv.LINE_AA) 
            
        for i, line in enumerate(vertical_lines):
            print("Vertical --> ", line)
            cv.line(cImage, (line[0], line[1]), (line[2], line[3]), (0,255,255), 3, cv.LINE_AA)
            # cv.putText(cImage, str(i) + "v", (line[0], line[1] + 5), cv.FONT_HERSHEY_SIMPLEX,  
                       # 0.5, (0, 0, 0), 1, cv.LINE_AA) 
        # cv.imshow("Source", cImage)
        #cv.imshow("Canny", cdstP)
        # cv.waitKey(0)
        # cv.destroyAllWindows()
        
    if (write):
        print("DONE")
        cv.imwrite(title + ".png", cImage)
        
    return (horizontal_lines, vertical_lines)

def get_cropped_image(image, x, y, w, h):
    cropped_image = image[ y:y+h , x:x+w ]
    return cropped_image
    
def get_ROI(image, horizontal, vertical, left_line_index, right_line_index, top_line_index, bottom_line_index, offset=4):
    x1 = vertical[left_line_index][2] + offset
    y1 = horizontal[top_line_index][3] + offset
    x2 = vertical[right_line_index][2] - offset
    y2 = horizontal[bottom_line_index][3] - offset
    
    w = x2 - x1
    h = y2 - y1
    
    cropped_image = get_cropped_image(image, x1, y1, w, h)
    
    return cropped_image, (x1, y1, w, h)

def main(argv=[]):
    
    default_file = 'C:\\air_ticket\\images2\\71_Yatra_AAAIN233675430_08.09.2022_page0.jpg'
    filename = argv[0] if len(argv) > 0 else default_file
    print(filename)
    src = cv.imread(cv.samples.findFile(filename))

    # img = cv2.imread("C:\\Users\\Lenovo\\Downloads\\lines.PNG")
    gray = cv.cvtColor(src,cv.COLOR_BGR2GRAY)

    lsd = cv.createLineSegmentDetector(0)
    dlines = lsd.detect(gray)

    for dline in dlines[0]:
        x0 = int(round(dline[0][0]))
        y0 = int(round(dline[0][1]))
        x1 = int(round(dline[0][2]))
        y1 = int(round(dline[0][3]))
        cv.line(src, (x0, y0), (x1,y1), 0, 1, cv.LINE_AA)

        # print line segment length
        a = (x0-x1) * (x0-x1)
        b = (y0-y1) * (y0-y1)
        c = a + b
        #print(math.sqrt(c))

    cv.imwrite('lines.png', src)
    
    # Loads an image
    horizontal, vertical = detect_lines(src, display=True, write=True)
    pprint(horizontal)
    
    return 0
    
if __name__ == "__main__":
    main()


C:\air_ticket\images2\71_Yatra_AAAIN233675430_08.09.2022_page0.jpg
Horizontal -->  [ 848 1425 1186 1425]
Horizontal -->  [1036 1470 1440 1470]
DONE
[
    array([ 848, 1425, 1186, 1425], dtype=int32),
    array([1036, 1470, 1440, 1470], dtype=int32)
]


In [28]:
import sys
import math
import cv2 as cv
import numpy as np

coords = []

def is_vertical(line):
    return line[0]==line[2]

def is_horizontal(line):
    return line[1]==line[3]
    
def overlapping_filter(lines, sorting_index):
    filtered_lines = []
    
    lines = sorted(lines, key=lambda lines: lines[sorting_index])
    
    for i in range(len(lines)):
            l_curr = lines[i]
            if(i>0):
                l_prev = lines[i-1]
                if ( (l_curr[sorting_index] - l_prev[sorting_index]) > 5):
                    filtered_lines.append(l_curr)
            else:
                filtered_lines.append(l_curr)
                
    return filtered_lines
               
def detect_lines(image, title='default', rho = 1, theta = np.pi/180, threshold = 50, minLinLength = 290, maxLineGap = 6, display = False, write = False):
    # Check if image is loaded fine
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    
    if gray is None:
        print ('Error opening image!')
        return -1
    
    dst = cv.Canny(gray, 50, 150, None, 3)
    
    # Copy edges to the images that will display the results in BGR
    cImage = np.copy(image)
    
    #linesP = cv.HoughLinesP(dst, 1 , np.pi / 180, 50, None, 290, 6)
    linesP = cv.HoughLinesP(dst, rho , theta, threshold, None, minLinLength, maxLineGap)
    
    horizontal_lines = []
    vertical_lines = []
    
    if linesP is not None:
        #for i in range(40, nb_lines):
        for i in range(0, len(linesP)):
            l = linesP[i][0]

            if (is_vertical(l)):
                vertical_lines.append(l)
                
            elif (is_horizontal(l)):
                horizontal_lines.append(l)
        
        horizontal_lines = overlapping_filter(horizontal_lines, 1)
        vertical_lines = overlapping_filter(vertical_lines, 0)
    
    if (display):
        for i, line in enumerate(horizontal_lines):
            coords.append(line)
            # print("Horizontal --> ", line)
            cv.line(cImage, (line[0], line[1]), (line[2], line[3]), (0,255,0), 3, cv.LINE_AA)
            
            # cv.putText(cImage, ":", (line[0] + 5, line[1]), cv.FONT_HERSHEY_SIMPLEX,  
            #            0.5, (0, 0, 0), 1, cv.LINE_AA) 
            
        for i, line in enumerate(vertical_lines):
            # print("Vertical --> ", line)
            cv.line(cImage, (line[0], line[1]), (line[2], line[3]), (0,255,0), 3, cv.LINE_AA)
            # cv.putText(cImage, ":", (line[0], line[1] + 5), cv.FONT_HERSHEY_SIMPLEX,  
            #            0.5, (0, 0, 0), 1, cv.LINE_AA) 
        cv.imshow("Source", cImage)
        #cv.imshow("Canny", cdstP)
        cv.waitKey(0)
        cv.destroyAllWindows()
        
    if (write):
        print("DONE")
        cv.imwrite(title + ".jpg", cImage)
        
    return (horizontal_lines, vertical_lines, coords)

def get_cropped_image(image, x, y, w, h):
    cropped_image = image[ y:y+h , x:x+w ]
    return cropped_image
    
def get_ROI(image, horizontal, vertical, left_line_index, right_line_index, top_line_index, bottom_line_index, offset=4):
    x1 = vertical[left_line_index][2] + offset
    y1 = horizontal[top_line_index][3] + offset
    x2 = vertical[right_line_index][2] - offset
    y2 = horizontal[bottom_line_index][3] - offset
    
    w = x2 - x1
    h = y2 - y1
    
    cropped_image = get_cropped_image(image, x1, y1, w, h)
    
    return cropped_image, (x1, y1, w, h)

def main(argv=[]):
    
    # default_file = 'C:\\Users\\Lenovo\\Desktop\\DrawBoxTest\\newimages\\test4_page0.jpg'
    # default_file = 'C:\\Users\\Lenovo\\Desktop\\DrawBoxTest\\thin_rec\\automatic_para\\test4_page0.jpg'
    # default_file = 'defaultnew.jpg'
    default_file = 'C:\\air_ticket\\images2\\71_Yatra_AAAIN233675430_08.09.2022_page0.jpg'
    filename = argv[0] if len(argv) > 0 else default_file
    print(filename)
    src = cv.imread(cv.samples.findFile(filename))
    
    # Loads an image
    horizontal, vertical, coords = detect_lines(src, display=True, write=True)
    pprint(coords)
    
    return 0
    
if __name__ == "__main__":
    main()

C:\air_ticket\images2\71_Yatra_AAAIN233675430_08.09.2022_page0.jpg
DONE
[
    array([1002, 1471, 1316, 1471], dtype=int32),
    array([1071, 1712, 1401, 1712], dtype=int32),
    array([ 990, 1746, 1299, 1746], dtype=int32)
]


In [5]:
import cv2

img = 'C:\\air_ticket\\images2\\113_Yatra_AAAIN233676709_27.09.2022_page0.jpg'
img = cv2.imread(img)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
cv2.imwrite('img.jpg', img)

command = 'tesseract img.jpg stdout -l eng --psm 4'
print(os.popen(command).read())

 

Invoice Category

brisn for recipient B2B)

 

yo

* for Business

 

GSTIN: 36AAACA0313P2ZV

PAN Card No. : AAACA0313P
Yatra for Business Private Limited
Corporate Identity No. :

301, Dev Dhanuka Prestige, Plot No -8, U72900DL1962PTC003735

Road No -12, Banjara Hills,

    

    

 

 

 

 

HSN - 998551 |
HYDERABAD-500034, TELANGANA, Bia
INDIA is a
Fy it Tega
Tax Invoice - Air (Domestic
Entity COMAKEIT SOFTWARE PVT LTD
Address PHASE - III, ROAD NO. 92JUBILEE HILLS,PLOT NO. 564/A39, HYDERABAD-500033, Telangana
State Code 36 Place of Supply Telangana
GSTIN 36AACCC8032H1ZT PAN Number AACCC8032H
BookingDetails: .
Booking Ref. CT6142674 Booking Date 12 Sep 2022
Invoice No. AAAIN233676709 Invoice Date 27 SEP 2022
Booking Type Official
Trip Id TA1209220120464 Irn Number
f7d8cdd553c3f66fb0c9b9bf98e560
7c10c033e36075d9eca2ab8a10d8
5f3951
Name - MR Nagaraju Kukkanti (ADT
Pax Name Sector Travel Date Ticket Number PNR No. Fare
MR Nagaraju HYD-GOI 03 Nov 2022 QJ7N8Y QJ7N8Y 5653.01
Kukkanti(A

In [7]:
import glob
print(len(glob.glob('C:\\air_ticket\\images2\\*')))

151
