In [1]:
import io
import re
import os
import json
import numpy as np
import pandas as pd
from glob import glob
import time

In [2]:
from PIL import Image
import pytesseract
from pytesseract import image_to_string, image_to_osd

In [3]:
# Collect every .jpg sitting directly under data/ and keep the paths in a
# one-column frame ('path'); the last line previews the first rows.
all_images = glob(os.path.join('data', '*.jpg'))
df_images = pd.DataFrame({'path': all_images})
df_images.head()

Unnamed: 0,path
0,data\1-1.jpg
1,data\2-1.jpg
2,data\3-1.jpg
3,data\4-1.jpg
4,data\5-1.jpg


In [4]:
# Derive the bare file name and the extension (without the leading dot) from each path.
# os.path.basename is the standard-library way to take the last path component and,
# on Windows, also copes with '/' separators, which splitting on os.sep alone does not.
df_images['file_name'] = df_images['path'].map(os.path.basename)
df_images['file_type'] = df_images['path'].map(lambda in_path: os.path.splitext(in_path)[1][1:])
df_images.head()

Unnamed: 0,path,file_name,file_type
0,data\1-1.jpg,1-1.jpg,jpg
1,data\2-1.jpg,2-1.jpg,jpg
2,data\3-1.jpg,3-1.jpg,jpg
3,data\4-1.jpg,4-1.jpg,jpg
4,data\5-1.jpg,5-1.jpg,jpg


In [5]:
def data_extraction(row):
    """Run Tesseract OCR (English) on the image at ``row.path`` and return the text.

    The image is opened inside a ``with`` block so its file handle is released
    as soon as OCR has finished instead of staying open for every row processed.
    """
    with Image.open(row.path) as im:
        return pytesseract.image_to_string(im, lang='eng')

In [6]:
# OCR every image row by row; the text returned by data_extraction is stored in 'corpus'.
df_images['corpus'] = df_images.apply(data_extraction,axis=1)

In [7]:
# Preview the first rows of the frame, which now carries the 'corpus' column.
df_images.head()

Unnamed: 0,path,file_name,file_type,corpus
0,data\1-1.jpg,1-1.jpg,jpg,Emme 40 2432964\n\nz SBI Life Insurance Compan...
1,data\2-1.jpg,2-1.jpg,jpg,Your Renewal Premium Receipt (Provisional) E w...
2,data\3-1.jpg,3-1.jpg,jpg,|\n|\n|\n\n¢FICICI PRLDENTIALT=\n\nPREMIUM PAI...
3,data\4-1.jpg,4-1.jpg,jpg,Eronlgyoe FD- 223GC\n\nMAX Name ~ Rajendra Pro...
4,data\5-1.jpg,5-1.jpg,jpg,SEES ogee\nwR HDFC BANK re 7/3 Opp Rai att\nDF...


In [8]:
# Take an independent copy: a plain assignment would only alias df_images, so the
# columns added to df_result in later cells would silently appear on df_images too.
df_result = df_images.copy()

In [9]:
def rendered(row):
    """Break ``row.corpus`` into a list of its lines.

    The text is cut at every newline character. Blank lines in the middle are
    kept as empty strings; the empty segment left after a final newline (or
    produced by an empty corpus) is dropped.
    """
    lines = row.corpus.split('\n')
    if lines[-1] == '':
        # split() always yields a trailing '' when the text ends on a newline
        # (or is empty); the line list should not carry it.
        lines.pop()
    return lines

In [10]:
# Store, per row, the list of lines that rendered() produces from the OCR corpus.
df_result['rendered_text'] = df_result.apply(rendered,axis=1)

In [11]:
# Regex-replace every newline with a single space across the frame, so the
# corpus-level extractors below see one continuous string per document.
words = ['\n'] 
for i in words:
    df_result = df_result.replace(to_replace=i,value=" ",regex=True)

## Extraction Functions

In [12]:
def find_provider_reg(corpus):
    """Return the first insurance-provider name found in ``corpus``, or ``None``.

    The alternatives are tried left to right at each position, and the
    left-most hit in the text wins. The short acronym ``LIC`` is wrapped in
    word boundaries: without them it also matched inside ordinary upper-case
    words such as "POLICY" or "APPLICABLE", which appear on almost every
    receipt and produced a bogus "LIC" provider.
    """
    regex = r"(Apollo Munich|Apollo|Apollo Munich|SBI Life|SBI|sbi|EXIDE Life|Exide Life|ICICI PRUDENTIAL|ICICI Lombard|ICICI LOMBARD|Max Life|max life|HDFC Life|\bLIC of India|Life Insurance Corporation|LIFE INSURANCE CORPORATION|\bLIC\b|kotak life|Kotak Life|Kotak Mahindra Life|STAR COMPREHENSIVE INSURANCE|Star Health|NEW INDIA ASSURANCE|NEW INDIA|HDFC ERGO|HDFC ERGO General Insurance|Aegon Life|Bajaj Allianz|Birla Sun Life|Birla Sun Life Insurance)"
    hit = re.search(regex, corpus)
    return hit.group() if hit else None

In [13]:
def find_policy(corpus):
    """Return the first "<policy label> <No/Number> <separator> <number>" snippet in ``corpus``.

    The returned text is the whole match, including the leading space and the
    label words; ``None`` is returned when nothing matches.

    The number alternatives are ordered ``\\d+\\w+\\d+`` before ``\\d\\w\\d+``:
    with the shorter form first, an alphanumeric number such as "123AB45" was
    cut off after its leading digits ("123"), because the first alternative
    that succeeds ends the match.
    """
    regex = r" (Policy|policy|P.|p.|P|p|Pol|pol|Pol.|pol.|Details|Proposal/Policy|Policy|)+\s(No|Num|No.|no.|num|Number|number)(\s|\s+|.|)(:|,|;|-|=|\s|) (\d+\w+\d+|\d\w\d+)"
    hit = re.search(regex, corpus, re.MULTILINE)
    return hit.group() if hit else None

def find_policy_slash(corpus):
    """Return the first "Policy No" snippet whose number is five word groups
    joined by optional single-character separators (e.g. slashes), or ``None``.
    """
    regex = r" (Policy.No....(\w+|\d+)(.|)(\w+|\d+)(.|)(\w+|\d+)(.|)(\w+|\d+)(.|)(\w+|\d+))"
    hit = re.search(regex, corpus, re.MULTILINE)
    if hit is None:
        return None
    return hit.group()

In [14]:
def find_premium(rendered_text):
    """Return the first line of ``rendered_text`` that matches the premium
    pattern at its start, or ``None`` when no line does.

    The whole line is returned, not just the matched portion.
    """
    regex = r"(Premium.A.|Premium Amount|Premium amount|Premium|Sum Assured|Total Premium Amount|Amount paid|Total Premium Paid|Gross Premium .Rs.|Gross)..(?: \d\d+)+ (?=\()|\d+,\d+.\d\d"
    matching_lines = (line for line in rendered_text if re.match(regex, line))
    return next(matching_lines, None)

In [15]:
def find_name(rendered_text):
    """Return the first line of ``rendered_text`` that starts with a known
    salutation or name label followed by name-like words, or ``None``.

    The whole line is returned, not just the matched portion.
    """
    regex = r"(Mr./Mrs.|Mr|MR|Mr.|Ms|Ms.|Miss|Mrs|Mrs.|Prof|Mr./Mrs.|Name:|Dear MR.|Prof.|Dr|Dr.|Smt.|Ms.|Shri|Sri|Sri.|MR |Policy Holder Name|Premium Payor Name|Policyholder's Name|Life Insured..|Owner Name...|Dear Mr.|following life insurance policy held by|The following premium has been received for life insurance policies from the userid of)(.|..|...|)((\w+ \w+ \w+ \w+)|(\w\w+)+|(\w \w+ \w+))"
    matching_lines = (line for line in rendered_text if re.match(regex, line))
    return next(matching_lines, None)

In [16]:
def find_date(rendered_text):
    """Return the first line of ``rendered_text`` that starts with a date label,
    a colon and a three-part date, or ``None`` when no line does.

    The whole line is returned, not just the matched portion.

    The separator groups accept a literal dot (``\\.``). The original
    alternative ``\\|.`` is an escaped pipe followed by any character, so dates
    written as "12.05.2019" were never recognised; it is kept so anything that
    matched before still matches.
    """
    regex = r"((Date|Date |DATE|date|Date of issue|DATE OF ISSUE)(\s|):+.(\d+|\w+|\w)(-|/|\|.|\.|,|\s)(\d+|\w+)(-|/|\|.|\.|,|\s)(\d+|\w+))"
    for text in rendered_text:
        if re.match(regex, text):
            return text
    return None

In [17]:

def valid_premium_word(premium_word):
    """Return the matched text when ``premium_word`` matches the premium-label
    pattern from start to end, otherwise ``False``.
    """
    pattern = "^((Premium.A.|Premium Amount|Premium amount|Installment premium|Installment Premium|Amt. Collected|Total Premium|Gross Premium .Rs.|Gross))$"
    hit = re.match(pattern, premium_word)
    if hit is None:
        return False
    return hit.group()

def valid_premium_no(premium_no):
    """Return the amount-like text matched at the start of ``premium_no``, or ``False``.

    Callers compare the result with the input, so a token counts as a valid
    amount only when the returned text equals the whole token.

    The decimal form (``1,234.00``) is tried before the bare-digits form. In
    the original order the bare ``\\d+`` alternative always won, so "1,234.00"
    came back as "1" and comma-formatted amounts could never validate. The
    pattern is also a raw string now, which avoids invalid-escape warnings for
    ``\\d``.
    """
    pattern = r"^(?: \d\d+)+ (?=\()|\d+,\d+.\d\d$|\d+"
    hit = re.match(pattern, premium_no)
    return hit.group() if hit else False

def find_premium_ex(corpus):
    """Scan the space-separated tokens of ``corpus`` for a premium label and
    return the first valid amount token at or after it, or ``None``.
    """
    tokens = corpus.split(' ')
    for start, token in enumerate(tokens):
        if valid_premium_word(token) != token:
            continue
        # Label found: look forward (label position included) for an amount.
        for candidate in tokens[start:]:
            if valid_premium_no(candidate) == candidate:
                return candidate
    return None
                

def find_premium_back(rendered_text):
    """Find a premium label line and return the nearest valid amount line above it.

    For each line that ``valid_premium_word`` accepts, the lines are walked
    backwards from that position to the very first line; the first one that
    ``valid_premium_no`` accepts in full is returned. ``None`` is returned
    when no label has an amount above it.

    The backward walk includes index 0. The original ``range(k, 0, -1)``
    stopped at index 1, so an amount on the first line was never seen.
    """
    for i in range(len(rendered_text)):
        if rendered_text[i] == valid_premium_word(rendered_text[i]):
            for j in range(i, -1, -1):
                if rendered_text[j] == valid_premium_no(rendered_text[j]):
                    return rendered_text[j]
    return None

def prem_amt(corpus):
    """Return the first "<amount label> <two characters><decimal amount>"
    snippet found in ``corpus``, or ``None`` when there is none.
    """
    regex = r"(Premium.A.|Premium Amount|Premium amount|Sum Assured|Total Premium Amount|Amount Paid|Total Premium Paid|Total|Total Amount Rs. |amount of|Amount paid|premium amount of Rs.|Total Premium :Rs|Amt. Collected) ..((\d+.\d\d)|(: \d+.\d\d)|(\d+,\d+.\d\d))"
    hit = re.search(regex, corpus, re.MULTILINE)
    if hit is None:
        return None
    return hit.group()

def prem_amt1(corpus):
    """Return the first "<amount label> <amount>" snippet found in ``corpus``
    (amount directly after one space), or ``None`` when there is none.
    """
    regex = r"(Premium.A.|Premium Amount|Premium amount|Sum Assured|Total Premium Amount|Amount Paid|Total Premium Paid|Total|Total Amount Rs. |amount of|Amount paid|premium amount of Rs.|Amt. Collected|Total Premium :Rs|paid...) ((\d+,\d+.\d\d)|(\d+.\d\d)|(\d+))"
    hit = re.search(regex, corpus, re.MULTILINE)
    if hit is None:
        return None
    return hit.group()

def prem_recpt(corpus):
    """Return the first snippet running from the word "receipt" through the
    next run of digits in ``corpus``, or ``None`` when there is none.
    """
    regex = r"(receipt[^\d]+\d+)"
    hit = re.search(regex, corpus, re.MULTILINE)
    if hit is None:
        return None
    return hit.group()

In [18]:
def valid_policy_word(policy_word):
    """Return the matched text when ``policy_word`` is a policy label
    ("Policy No", "Pol. Number", ...) from start to end, otherwise ``False``.

    The pattern is a raw string: in a plain string literal ``\\s`` is an
    invalid escape sequence, which current Python versions warn about.
    """
    pattern = r"^((Policy|policy|P.|p.|P|p|Pol|pol|Pol.|pol.|Master|1 plicy|licy|olicy|icy)+\s(No|Num|No.|no.|num|Number|number|No.Plan|Policy :|Number:))$"
    hit = re.match(pattern, policy_word)
    return hit.group() if hit else False

def valid_policy_no(policy_no):
    """Return the policy-number-like text matched at the start of ``policy_no``, or ``False``.

    Callers compare the result with the input, so a token counts as a valid
    policy number only when the returned text equals the whole token.

    ``\\d+\\w+\\d+`` is tried before ``\\d\\w\\d+``. In the original order the
    shorter form won and an alphanumeric number such as "123AB45" came back as
    "123", so it could never validate. The pattern is also a raw string now,
    which avoids invalid-escape warnings.
    """
    hit = re.match(r"(\d+\w+\d+|\d\w\d+|^\d+$)", policy_no)
    return hit.group() if hit else False

def find_policy_ex(rendered_text):
    """Find a policy label line and return the first valid policy-number line
    at or after it, or ``None`` when there is none.
    """
    for start, line in enumerate(rendered_text):
        if valid_policy_word(line) != line:
            continue
        # Label found: look forward (label position included) for a number.
        for candidate in rendered_text[start:]:
            if valid_policy_no(candidate) == candidate:
                return candidate
    return None


def find_policy_back(rendered_text):
    """Find a policy label line and return the nearest valid policy-number line above it.

    For each line that ``valid_policy_word`` accepts, the lines are walked
    backwards from that position to the very first line; the first one that
    ``valid_policy_no`` accepts in full is returned. ``None`` is returned when
    no label has a number above it.

    The backward walk includes index 0. The original ``range(k, 0, -1)``
    stopped at index 1, so a number on the first line was never seen.
    """
    for i in range(len(rendered_text)):
        if rendered_text[i] == valid_policy_word(rendered_text[i]):
            for j in range(i, -1, -1):
                if rendered_text[j] == valid_policy_no(rendered_text[j]):
                    return rendered_text[j]
    return None

def find_policy_sep_number(rendered_text):
    """Handle labels split over two lines ("Policy" then "Number").

    When two consecutive lines form one of the accepted label pairs, return
    the first line from that position onward that ``valid_policy_no`` accepts
    in full. ``None`` is returned when no pair or no number is found.

    The scan stops one short of the end, because a pair needs a following
    line. The original indexed ``rendered_text[i+1]`` unconditionally and
    raised IndexError when the last line was "Policy" or "policy".
    """
    label_pairs = (('Policy', 'Number'), ('policy', 'number'), ('Policy', 'number'))
    for i in range(len(rendered_text) - 1):
        if (rendered_text[i], rendered_text[i + 1]) in label_pairs:
            for candidate in rendered_text[i:]:
                if candidate == valid_policy_no(candidate):
                    return candidate
    return None

def find_single_policy(corpus):
    """Return the first "Policy" + four characters + digits snippet in ``corpus``, or ``None``.

    The search runs once over the whole text. The original repeated the same
    whole-text search for every character of ``corpus``, which gave the same
    answer at quadratic cost whenever nothing matched. The pattern is a raw
    string so ``\\d`` is not an invalid string escape.
    """
    hit = re.search(r"((Policy....)(\d+))", corpus)
    return hit.group() if hit else None

In [19]:
def valid_dateword(date_word):
    """Return the matched text when ``date_word`` matches one of the date
    labels from start to end, otherwise ``False``.
    """
    pattern = "^((Date|Date:|DATE|date|Date of Receipt|Date of issue|DATE OF ISSUE|Date and Time :|Receipt Date))$"
    hit = re.match(pattern, date_word)
    if hit is None:
        return False
    return hit.group()

def valid_date(date_string):
    """Return ``date_string`` when it looks like a three-part date, otherwise ``False``.

    A date here is three digit/word groups joined by two separators.

    The separator groups accept a literal dot (``\\.``). The original
    alternative ``\\|.`` is an escaped pipe followed by any character, so
    "12.05.2019" was rejected; it is kept so anything that validated before
    still validates. The pattern is a raw string, which avoids invalid-escape
    warnings.
    """
    pattern = r"^(\d+|\w+|\w)(-|/|\|.|\.|,|\s)(\d+|\w+)(-|/|\|.|\.|,|\s)(\d+|\w+)$"
    hit = re.match(pattern, date_string)
    return hit.group() if hit else False

def find_date_ex(rendered_text):
    """Find a date label line and return the first valid date line at or
    after it, or ``None`` when there is none.
    """
    for start, line in enumerate(rendered_text):
        if valid_dateword(line) != line:
            continue
        # Label found: look forward (label position included) for a date.
        for candidate in rendered_text[start:]:
            if valid_date(candidate) == candidate:
                return candidate
    return None

def find_date_single(rendered_text):
    """Return the first stand-alone date found in the lines of ``rendered_text``, or ``None``.

    Each line is tried against two shapes, in this order:

    1. the whole line is ``day sep month sep year`` with a numeric or named
       month (e.g. "12/05/2019", "12-Jun-2019");
    2. anywhere in the line, ``word number, number`` (e.g. "May 12, 2019").

    Every month name is optional after its three-letter abbreviation. The
    original pattern required the full "March", "April" and "July", and
    spelled June as "Junne", so Mar/Apr/Jun/June/Jul dates were missed. The
    lower-case list also contained a stray capitalised "Oct".
    """
    month_names = (
        r"Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|"
        r"Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?"
    )
    # Accept the capitalised and the all-lower-case spelling of every month.
    month_alternatives = month_names + "|" + month_names.lower()
    day_month_year = (
        r"^(0?[1-9]|1[0-9]|2[0-9]|3[0-1])([. \/-])(0?[1-9]|1[0-2]|"
        + month_alternatives
        + r")[(. \-/](19[0-9][0-9]|20[0-9][0-9])$"
    )
    month_day_year = r"(\w+)(\s)(\d+)(,)(\s|)(\d+)"

    for line in rendered_text:
        hit = re.search(day_month_year, line) or re.search(month_day_year, line)
        if hit:
            return hit.group()
    return None

def find_if_date_and_time(corpus):
    """Return the first "<date label> <three-part date>" snippet in ``corpus``, or ``None``.

    The search runs once over the whole text. The original repeated the same
    whole-text search for every character of ``corpus``, which gave the same
    answer at quadratic cost whenever nothing matched. The pattern text is
    unchanged, only written as a raw string to avoid invalid-escape warnings.
    """
    pattern = r"((Date of Receipt|DATE OF ISSUE:|Date...Time.|Date..Time...|Date...Time....|Receipt Date|INSURANCE Date:|Payment Date and Time :|Date :|Date:|Date)(\s|)(|:|)+.(\d+|\w+|\w)(-|/|\|.|,|\s)(\d+|\w+)(-|/|\|.|,|\s)(\d+|\w+))"
    hit = re.search(pattern, corpus)
    return hit.group() if hit else None

def find_if_date_time(corpus):
    """Return the first "Date and Time : <three-part date>" snippet in ``corpus``, or ``None``.

    The search runs once over the whole text. The original repeated the same
    whole-text search for every character of ``corpus``, which gave the same
    answer at quadratic cost whenever nothing matched. The pattern text is
    unchanged, only written as a raw string to avoid invalid-escape warnings.
    """
    pattern = r"((Date and Time)(\s|)(:)(\s|)(\d+|\w+|\w)(-|/|\|.|,|\s)(\d+|\w+)(-|/|\|.|,|\s)(\d+|\w+))"
    hit = re.search(pattern, corpus)
    return hit.group() if hit else None

## Main function

In [20]:
def information_extract(row):
    """Extract provider, policy number, premium amount, insured name and
    premium date for one document row.

    ``row.corpus`` (the OCR text) and ``row.rendered_text`` (its list of
    lines) feed the individual extractors. Policy number, premium amount and
    premium date each have an ordered list of strategies; the first strategy
    that returns something other than ``None`` wins and the remaining ones are
    not run.

    Returns a ``pd.Series`` with the keys ``provider``, ``policy_number``,
    ``premium_amt``, ``insured_name`` and ``premium_date``.
    """
    lines = row.rendered_text
    text = row.corpus

    def first_found(*attempts):
        # Run (extractor, argument) pairs in order; stop at the first non-None result.
        for extractor, argument in attempts:
            found = extractor(argument)
            if found is not None:
                return found
        return None

    provider = find_provider_reg(text)

    policy_number = first_found(
        (find_policy, text),
        (find_policy_ex, lines),
        (find_policy_back, lines),
        (find_policy_sep_number, lines),
        (find_single_policy, text),
        (find_policy_slash, text),
    )

    premium_amt = first_found(
        (find_premium, lines),
        (find_premium_ex, text),
        (prem_amt, text),
        (prem_amt1, text),
        (find_premium_back, lines),
        (prem_recpt, text),
    )

    insured_name = find_name(lines)

    premium_date = first_found(
        (find_if_date_and_time, text),
        (find_date, lines),
        (find_date_ex, lines),
        (find_date_single, lines),
        (find_if_date_time, text),
    )

    return pd.Series({'provider':provider, 'policy_number':policy_number, 'premium_amt' : premium_amt, 'insured_name': insured_name, 'premium_date': premium_date})

In [21]:
# Run information_extract on every row and place its five result columns
# (provider, policy_number, premium_amt, insured_name, premium_date) beside the inputs.
df_final_result = pd.concat([df_result, df_result.apply(information_extract, axis=1)], axis=1)

In [22]:
# Label words and boilerplate to strip from the extracted fields. Each entry is
# applied as a regular expression, one after another, in this order.
words = ['Payment Date and Time :|','INSURANCE ','Master','\n','Smt./Ms.JShri','./Shri','Smt./Ms','lat','http','Mrs.','Policy','Premium Payor','Number','Details','No.','No','Mr.','MR ','MR.','.co.in.','Owner Name','Receipt','Rs.','Dear',':','/Premium Paying','holder','Name','Life Insured','Premium Payor Name','Holder Name','Proposal/','Date','DATE','OF','ISSUE','Date','and','Time','DATE OF ISSUE','Mobile','Smt./Ms./Shri','Holder','The following premium has been received for life insurance policies from the userid of','following life insurance policy held by'] 

# Only the extracted fields are cleaned. Running the replacement over the whole
# frame would also rewrite 'file_name', which is the key written to the CSV.
extracted_columns = ['provider', 'policy_number', 'premium_amt', 'insured_name', 'premium_date']
for i in words:
    df_final_result[extracted_columns] = df_final_result[extracted_columns].replace(to_replace=i,value="",regex=True)


# regex=True is spelled out: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would turn these character
# classes into no-ops.
df_final_result['provider']=df_final_result['provider'].str.replace(r"[\(\)\{\}<>/]","",regex=True)
df_final_result['policy_number']=df_final_result['policy_number'].str.replace(r"[\(\)\{\}<>|]","",regex=True)
df_final_result['insured_name']=df_final_result['insured_name'].str.replace(r"[\(\)\{\}<>,']","",regex=True)
df_final_result['premium_date']=df_final_result['premium_date'].str.replace(r"[\(\)\{\}<>|]","",regex=True)

In [23]:
# Keep only digits, commas and dots in each extracted premium amount.
# Cells that are not strings (None / NaN when nothing was found) pass through
# untouched, and mapping over the column does not depend on the index being 0..n-1.
df_final_result['premium_amt'] = df_final_result['premium_amt'].map(
    lambda amount: re.sub('[^0-9,.]', "", amount) if isinstance(amount, str) else amount
)

In [24]:
# Keep only letters, commas and spaces in each extracted name.
# Missing names reach this cell as NaN (the earlier .str.replace turns None into
# NaN), which the old `!= None` guard let through to re.sub and raised TypeError.
# Only real strings are cleaned here; everything else passes through unchanged.
df_final_result['insured_name'] = df_final_result['insured_name'].map(
    lambda name: re.sub('[^A-Za-z, ]', "", name) if isinstance(name, str) else name
)

In [25]:
def trimAllColumns(df_final_result):
    """Return a new frame with leading/trailing whitespace stripped from every
    string cell; cells of any other type are left as they are.
    """
    def trim(cell):
        return cell.strip() if isinstance(cell, str) else cell

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated; use the new one when it exists, else fall back.
    elementwise = getattr(df_final_result, 'map', None) or df_final_result.applymap
    return elementwise(trim)

# Strip surrounding whitespace from every string cell of the result frame.
df_final_result = trimAllColumns(df_final_result)

In [26]:
# Parse the extracted date text one cell at a time (day first). A cell whose
# parse raises ValueError is blanked to None instead of aborting the loop.
# NOTE(review): .loc[i, ...] with i from range(len(...)) presumes the default
# 0..n-1 index on df_final_result — confirm.
for i in range(0,len(df_final_result)):
    try:
        
        df_final_result.loc[i,'premium_date'] = pd.to_datetime(df_final_result.loc[i,'premium_date'],dayfirst=True)
    except ValueError:
        df_final_result.loc[i,'premium_date'] = None

# Convert the whole column to datetimes and keep only the calendar-date part.
df_final_result['premium_date'] = pd.to_datetime(df_final_result['premium_date'],dayfirst=True).dt.date

In [27]:
# Write the extracted fields to output_<dd-mm-YYYY>.csv in the current working directory.
outpath = os.getcwd()
filename = "output_"+(time.strftime("%d-%m-%Y")+".csv")
# os.path.join picks the right separator for the platform; a hard-coded "\\"
# yields a file literally named "<cwd>\output_....csv" on non-Windows systems.
df_final_result[['file_name', 'provider', 'policy_number' , 'premium_amt', 'insured_name','premium_date']].to_csv(os.path.join(outpath, filename))