In [1]:
import io
import re
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
%matplotlib inline

import six

from glob import glob
from google.cloud import vision

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

In [2]:
# Set Google API authentication: point GOOGLE_APPLICATION_CREDENTIALS at the
# service-account key file. The path is relative, so the JSON key is looked up
# in the notebook's working directory.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'Banking-326c0d0e12c1.json'

# Vision API client shared by the OCR helper defined further down.
client = vision.ImageAnnotatorClient()

In [3]:
# List every entry in the local 'data' folder. glob() makes no promise about
# ordering, so sort the paths to keep row order reproducible between runs.
all_images = sorted(glob(os.path.join('data', '*')))
df_images = pd.DataFrame({'path': all_images})
df_images.head()

Unnamed: 0,path
0,data\1-1.jpg
1,data\2-1.jpg
2,data\3-1.jpg
3,data\4-1.jpg
4,data\5-1.jpg


In [4]:
# Derive the bare file name and the extension (without the leading dot) from
# each path. os.path.basename understands every separator the platform
# accepts, unlike a manual split on os.sep.
df_images['file_name'] = df_images['path'].map(os.path.basename)
df_images['file_type'] = df_images['path'].map(lambda in_path: os.path.splitext(in_path)[1][1:])
df_images.head()

Unnamed: 0,path,file_name,file_type
0,data\1-1.jpg,1-1.jpg,jpg
1,data\2-1.jpg,2-1.jpg,jpg
2,data\3-1.jpg,3-1.jpg,jpg
3,data\4-1.jpg,4-1.jpg,jpg
4,data\5-1.jpg,5-1.jpg,jpg


In [5]:
def google_vision_it(row):
    """Run document text detection on one image and return its text lines.

    Args:
        row: Object exposing a ``path`` attribute that names the image file
            (here: one row of ``df_images``).

    Returns:
        list[str]: The description of the longest text annotation, split on
        newlines. An empty list when the response carries no annotations.
    """
    with open(row.path, 'rb') as image_file:
        content = image_file.read()
    response = client.document_text_detection({'content': content})
    texts = response.text_annotations
    if len(texts) == 0:
        # Nothing was recognised; picking the longest entry of an empty
        # sequence would raise ValueError.
        return []
    # max() keeps the first of several equally long annotations.
    longest = max(texts, key=lambda annotation: len(annotation.description))
    return longest.description.split('\n')

In [6]:
 # OCR every image: google_vision_it is called once per row and the returned
 # list of text lines is stored in the 'rendered_text' column.
 df_images['rendered_text'] = df_images.apply(google_vision_it, axis=1)

In [7]:
# df_result is a second name for df_images (no copy is made), so the new
# 'corpus' column - the OCR lines joined with single spaces - shows up on both.
df_result = df_images
df_result['corpus'] = df_images['rendered_text'].map(' '.join)
df_result

Unnamed: 0,path,file_name,file_type,rendered_text,corpus
0,data\1-1.jpg,1-1.jpg,jpg,"[Emplo:92964, SBI Life, SBI Life, SBI Life Ins...",Emplo:92964 SBI Life SBI Life SBI Life Insuran...
1,data\2-1.jpg,2-1.jpg,jpg,"[Your Renewal Premium Receipt (Provisional), E...",Your Renewal Premium Receipt (Provisional) EXI...
2,data\3-1.jpg,3-1.jpg,jpg,"[MICICI PRUDENTIAL, PREMIUM PAID CERTIFICATE, ...",MICICI PRUDENTIAL PREMIUM PAID CERTIFICATE LIF...
3,data\4-1.jpg,4-1.jpg,jpg,"[Eroployee PD- 22886, Norme. Rajendhra Prasad ...",Eroployee PD- 22886 Norme. Rajendhra Prasad Ya...
4,data\5-1.jpg,5-1.jpg,jpg,"[HDFC BANK, Wernet send your world, HDFC Bank ...",HDFC BANK Wernet send your world HDFC Bank Lim...
5,data\6-1.jpg,6-1.jpg,jpg,"[SBI Life, INSURANCE, With Us, You're Sure, Re...","SBI Life INSURANCE With Us, You're Sure Renewa..."
6,data\7-1.jpg,7-1.jpg,jpg,"[LIC POLICY PROOF, Life Insurance Corporation ...",LIC POLICY PROOF Life Insurance Corporation of...
7,data\8-1.jpg,8-1.jpg,jpg,"[RAGHAVENDRAB, EC 13744, kotak life, 28-02-201...",RAGHAVENDRAB EC 13744 kotak life 28-02-2018 Pr...
8,data\Aegon-Life-.jpg,Aegon-Life-.jpg,jpg,"[AEGONLife, Mr Ashish Shrivastava, Veersawarka...",AEGONLife Mr Ashish Shrivastava Veersawarkar W...
9,data\bajaj.jpg,bajaj.jpg,jpg,"[Bajaj Allianz Life Insurance Company Limited,...",Bajaj Allianz Life Insurance Company Limited B...


# Helper Functions

In [8]:
 
def find_provider(corpus):
    """Return the first known insurer keyword found in ``corpus``.

    The corpus is lower-cased and each keyword must appear as a whole run of
    space-delimited tokens, so ``'lic'`` does not match inside ``'policy'``.
    Keywords are tried in list order and the first hit wins.

    Args:
        corpus: OCR text of one document as a single string.

    Returns:
        str | None: The matching (lower-case) keyword, or ``None``.
    """
    insurance_providers = ['hdfc', 'lic', 'newindia', 'bajaj', 'birla', 'aegon', 'glic', 'sbi', 'pnb', 'exide', 'icici', 'max', 'kotak', 'bupa', 'apollo', 'bharti', 'magma', 'liberty', 'generali', 'oriental', 'star', 'royal', 'aig', 'reliance', 'universal', 'united', 'shriram', 'cigna', 'raheja', 'cholamandalam', 'ecgc', 'iffco', 'aviva', 'idbi', 'indiafirst', 'peerless', 'sahara', 'new india', 'religare', 'hsbc', 'dhfl', 'edelweiss', 'aia', 'dai-ichi', 'sundaram', 'l&t', 'http://newindia.co.in.']
    # Padding with spaces turns "equals a space-delimited token" into a plain
    # substring test. Unlike membership in corpus.split(' '), this also works
    # for multi-word keywords such as 'new india'.
    padded_corpus = ' ' + corpus.lower() + ' '
    for provider in insurance_providers:
        if ' ' + provider + ' ' in padded_corpus:
            return provider

    return None
 
def find_provider_reg(corpus):
    """Return the first insurer name in ``corpus`` matched by the name pattern.

    Matching is case-sensitive; alternatives are tried left to right, so a
    shorter name listed before a longer one wins (e.g. 'HDFC ERGO').

    Returns:
        str | None: The matched text, or ``None`` when no name is found.
    """
    regex = r"(Apollo Munich|Apollo Munich|SBI Life|SBI|sbi|EXIDE Life|ICICI PRUDENTIAL|ICICI Lombard|ICICI LOMBARD|Max Life|max life|HDFC Life|LIC of India|Life Insurance Corporation|LIFE INSURANCE CORPORATION|LIC|kotak life|Kotak Life|Kotak Mahindra Life|STAR COMPREHENSIVE INSURANCE|Star Health|NEW INDIA ASSURANCE|NEW INDIA|HDFC ERGO|HDFC ERGO General Insurance|Aegon Life|Bajaj Allianz|Birla Sun Life|Birla Sun Life Insurance)"
    first_hit = re.search(regex, corpus, re.MULTILINE)
    if first_hit is None:
        return None
    return first_hit.group()



In [9]:
def find_policy(corpus):
    """Find a 'Policy No'/'Policy Number' style label followed by its number.

    The pattern starts with a literal space, so the returned text keeps that
    leading space.

    Returns:
        str | None: The first matched span (label plus number), or ``None``.
    """
    regex = r" (Policy|policy|P.|p.|P|p|Pol|pol|Pol.|pol.|Details|Proposal/Policy|Policy|)+\s(No|Num|No.|no.|num|Number|number)(\s|\s+|.|)(:|,|;|-|=|\s|) (\d\w\d+|\d+\w+\d+)"
    first_hit = re.search(regex, corpus, re.MULTILINE)
    return first_hit.group() if first_hit else None

def find_policy_slash(corpus):
    """Find a 'Policy No' label followed by a number written in five parts.

    The parts may be separated by any single character (for example '/').
    The pattern starts with a literal space, which is part of the result.

    Returns:
        str | None: The first matched span, or ``None``.
    """
    regex = r" (Policy.No....(\w+|\d+)(.|)(\w+|\d+)(.|)(\w+|\d+)(.|)(\w+|\d+)(.|)(\w+|\d+))"
    first_hit = re.search(regex, corpus, re.MULTILINE)
    return first_hit.group() if first_hit else None

In [10]:
def find_premium(rendered_text):
    """Return the first OCR line that starts like a premium amount.

    A line qualifies when it begins with a premium label followed by digit
    groups and an opening parenthesis, or with a comma-grouped amount.

    Args:
        rendered_text: Iterable of OCR lines.

    Returns:
        str | None: The whole qualifying line (not just the match), or ``None``.
    """
    regex = r"(Premium.A.|Premium Amount|Premium amount|Premium|Sum Assured|Total Premium Amount|Amount paid|Total Premium Paid|Gross Premium .Rs.|Gross)..(?: \d\d+)+ (?=\()|\d+,\d+.\d\d"
    return next((line for line in rendered_text if re.match(regex, line)), None)

In [11]:
def find_name(rendered_text):
    """Return the first OCR line that starts with a title or name label.

    Args:
        rendered_text: Iterable of OCR lines.

    Returns:
        str | None: The whole qualifying line (label included), or ``None``.
    """
    regex = r"(Mr./Mrs.|Mr|MR|Mr.|Ms|Ms.|Miss|Mrs|Mrs.|Prof|Mr./Mrs.|Name:|Dear MR.|Prof.|Dr|Dr.|Smt.|Ms.|Shri|Sri|Sri.|MR |Policy Holder Name|Premium Payor Name|Policyholder's Name|Life Insured..|Owner Name...|Dear Mr.|following life insurance policy held by|The following premium has been received for life insurance policies from the userid of)(.|..|...|)((\w+ \w+ \w+ \w+)|(\w\w+)+|(\w \w+ \w+))"
    return next((line for line in rendered_text if re.match(regex, line)), None)

In [12]:
def find_date(rendered_text):
    """Return the first OCR line that starts with a date label, colon and date.

    Args:
        rendered_text: Iterable of OCR lines.

    Returns:
        str | None: The whole qualifying line (label included), or ``None``.
    """
    regex = r"((Date|Date |DATE|date|Date of issue|DATE OF ISSUE)(\s|):+.(\d+|\w+|\w)(-|/|\|.|,|\s)(\d+|\w+)(-|/|\|.|,|\s)(\d+|\w+))"
    return next((line for line in rendered_text if re.match(regex, line)), None)

In [13]:

def valid_premium_word(premium_word):
    """Check whether ``premium_word`` is exactly one of the premium labels.

    Returns:
        str | bool: The matched text when the whole string is a known label,
        otherwise ``False``.
    """
    pattern = r"^((Premium.A.|Premium Amount|Premium amount|Installment premium|Installment Premium|Amt. Collected|Total Premium|Gross Premium .Rs.|Gross))$"
    hit = re.match(pattern, premium_word)
    if hit is None:
        return False
    return hit.group()

def valid_premium_no(premium_no):
    """Return the amount-like prefix of ``premium_no``, or ``False``.

    Three shapes are recognised, all anchored at the start of the string:

    * space-separated digit groups followed by `` (`` (the parenthesis itself
      is only looked at, not returned);
    * a comma-grouped amount with two trailing digits that spans the whole
      string, e.g. ``12,345.00``;
    * a leading run of digits.

    The comma-grouped form is tried before the plain digit run. In the
    opposite order the digit run always wins and ``12,345.00`` is reported
    as ``12``.

    Returns:
        str | bool: The matched text, or ``False`` when nothing matches.
    """
    mat = re.match(r"^(?: \d\d+)+ (?=\()|\d+,\d+.\d\d$|\d+", premium_no)
    if mat is None:
        return False
    return mat.group()

def find_premium_ex(corpus):
    """Find the first amount token at or after a premium-label token.

    The corpus is split on single spaces. For each token accepted in full by
    ``valid_premium_word`` the tokens from there onwards are scanned, and the
    first one accepted in full by ``valid_premium_no`` is returned.

    Returns:
        str | None: The amount token, or ``None``.
    """
    tokens = corpus.split(' ')
    for start, token in enumerate(tokens):
        if token != valid_premium_word(token):
            continue
        for candidate in tokens[start:]:
            if candidate == valid_premium_no(candidate):
                return candidate
    return None
                

def find_premium_back(rendered_text):
    """Find an amount line at or before a premium-label line.

    For each line accepted in full by ``valid_premium_word``, walk backwards
    from that line to the very first line and return the first line accepted
    in full by ``valid_premium_no``.

    Args:
        rendered_text: Sequence of OCR lines.

    Returns:
        str | None: The amount line, or ``None``.
    """
    for i in range(len(rendered_text)):
        if rendered_text[i] == valid_premium_word(rendered_text[i]):
            # The stop value is -1 so that index 0 is inspected as well.
            for j in range(i, -1, -1):
                if rendered_text[j] == valid_premium_no(rendered_text[j]):
                    return rendered_text[j]
    return None

def prem_amt(corpus):
    """Find an amount label, two filler characters and a decimal amount.

    Returns:
        str | None: The first matched span (label included), or ``None``.
    """
    regex = r"(Premium.A.|Premium Amount|Premium amount|Sum Assured|Total Premium Amount|Amount Paid|Total Premium Paid|Total|Total Amount Rs. |amount of|Amount paid|premium amount of Rs.|Total Premium :Rs|Amt. Collected) ..((\d+.\d\d)|(: \d+.\d\d)|(\d+,\d+.\d\d))"
    first_hit = re.search(regex, corpus, re.MULTILINE)
    return None if first_hit is None else first_hit.group()

def prem_amt1(corpus):
    """Find an amount label followed by one space and an amount.

    Returns:
        str | None: The first matched span (label included), or ``None``.
    """
    regex = r"(Premium.A.|Premium Amount|Premium amount|Sum Assured|Total Premium Amount|Amount Paid|Total Premium Paid|Total|Total Amount Rs. |amount of|Amount paid|premium amount of Rs.|Amt. Collected|Total Premium :Rs|paid...) ((\d+,\d+.\d\d)|(\d+.\d\d)|(\d+))"
    first_hit = re.search(regex, corpus, re.MULTILINE)
    return None if first_hit is None else first_hit.group()

def prem_recpt(corpus):
    """Find lower-case 'receipt', any non-digit text, then a run of digits.

    Returns:
        str | None: The first matched span, or ``None``.
    """
    regex = r"(receipt[^\d]+\d+)"
    first_hit = re.search(regex, corpus, re.MULTILINE)
    return None if first_hit is None else first_hit.group()

In [14]:
def valid_policy_word(policy_word):
    """Check whether ``policy_word`` is exactly a policy-number label.

    Accepts forms such as 'Policy No.' or 'Policy Number:'.

    Returns:
        str | bool: The matched text when the whole string is a label,
        otherwise ``False``.
    """
    pattern = r"^((Policy|policy|P.|p.|P|p|Pol|pol|Pol.|pol.|Master|1 plicy|licy|olicy|icy)+\s(No|Num|No.|no.|num|Number|number|No.Plan|Policy :|Number:))$"
    hit = re.match(pattern, policy_word)
    if hit is None:
        return False
    return hit.group()

def valid_policy_no(policy_no):
    """Return the policy-number-like prefix of ``policy_no``, or ``False``.

    A prefix qualifies when it starts with a digit, ends with a digit and is
    at least three characters long, or when the whole string is digits.

    Returns:
        str | bool: The matched text, or ``False`` when nothing matches.
    """
    hit = re.match(r"(\d\w\d+|\d+\w+\d+|^\d+$)", policy_no)
    if hit is None:
        return False
    return hit.group()

def find_policy_ex(rendered_text):
    """Find the first policy-number line at or after a policy-label line.

    For each line accepted in full by ``valid_policy_word`` the lines from
    there onwards are scanned, and the first one accepted in full by
    ``valid_policy_no`` is returned.

    Returns:
        str | None: The policy-number line, or ``None``.
    """
    for position, line in enumerate(rendered_text):
        if line != valid_policy_word(line):
            continue
        for candidate in rendered_text[position:]:
            if candidate == valid_policy_no(candidate):
                return candidate
    return None


def find_policy_back(rendered_text):
    """Find a policy-number line at or before a policy-label line.

    For each line accepted in full by ``valid_policy_word``, walk backwards
    from that line to the very first line and return the first line accepted
    in full by ``valid_policy_no``.

    Args:
        rendered_text: Sequence of OCR lines.

    Returns:
        str | None: The policy-number line, or ``None``.
    """
    for i in range(len(rendered_text)):
        if rendered_text[i] == valid_policy_word(rendered_text[i]):
            # The stop value is -1 so that index 0 is inspected as well.
            for j in range(i, -1, -1):
                if rendered_text[j] == valid_policy_no(rendered_text[j]):
                    return rendered_text[j]
    return None

def find_policy_sep_number(rendered_text):
    """Find a policy number after a label split across two consecutive lines.

    Looks for 'Policy' directly followed by 'Number' (also 'policy'/'number'
    and 'Policy'/'number'). From the first line of such a pair onwards it
    returns the first line accepted in full by ``valid_policy_no``.

    Args:
        rendered_text: Sequence of OCR lines.

    Returns:
        str | None: The policy-number line, or ``None``.
    """
    label_pairs = (('Policy', 'Number'), ('policy', 'number'), ('Policy', 'number'))
    # Stop one short of the end: the last line has no successor to pair with,
    # and looking one past it would raise IndexError.
    for i in range(len(rendered_text) - 1):
        if (rendered_text[i], rendered_text[i + 1]) in label_pairs:
            for j in range(i, len(rendered_text)):
                if rendered_text[j] == valid_policy_no(rendered_text[j]):
                    return rendered_text[j]
    return None

def find_single_policy(corpus):
    """Find 'Policy', any four characters, then a run of digits.

    One search over the corpus is enough: repeating the identical search
    once per character cannot change the outcome.

    Returns:
        str | None: The first matched span, or ``None``.
    """
    x = re.search(r"((Policy....)(\d+))", corpus)
    if x:
        return x.group()
    return None

In [15]:
def valid_dateword(date_word):
    """Check whether ``date_word`` is exactly one of the date labels.

    Returns:
        str | bool: The matched text when the whole string is a known label,
        otherwise ``False``.
    """
    pattern = r"^((Date|Date:|DATE|date|Date of issue|DATE OF ISSUE|Date and Time :|Receipt Date))$"
    hit = re.match(pattern, date_word)
    if hit is None:
        return False
    return hit.group()

def valid_date(date_string):
    """Check whether ``date_string`` consists of three separated parts.

    The parts are word-character runs; accepted separators are '-', '/',
    ',', whitespace, or '|' followed by any one character.

    Returns:
        str | bool: The matched text when the whole string has that shape,
        otherwise ``False``.
    """
    pattern = r"^(\d+|\w+|\w)(-|/|\|.|,|\s)(\d+|\w+)(-|/|\|.|,|\s)(\d+|\w+)$"
    hit = re.match(pattern, date_string)
    if hit is None:
        return False
    return hit.group()

def find_date_ex(rendered_text):
    """Find the first date-shaped line at or after a date-label line.

    For each line accepted in full by ``valid_dateword`` the lines from there
    onwards are scanned, and the first one accepted in full by ``valid_date``
    is returned.

    Returns:
        str | None: The date line, or ``None``.
    """
    for position, line in enumerate(rendered_text):
        if line != valid_dateword(line):
            continue
        for candidate in rendered_text[position:]:
            if candidate == valid_date(candidate):
                return candidate
    return None

def find_date_single(rendered_text):
    """Return the first stand-alone date found in the OCR lines.

    Each line is tested in turn against two shapes:

    * the whole line is ``day sep month sep year`` with a 19xx/20xx year,
      where the month is a number or an English month name (three-letter
      abbreviation or full name, any letter case);
    * anywhere in the line: ``word number, number`` (e.g. 'February 7, 2018').

    Args:
        rendered_text: Iterable of OCR lines.

    Returns:
        str | None: The matched text, or ``None``.
    """
    # The long form of every month is optional, so 'Jun', 'June', 'Mar' and
    # 'August' are all accepted.
    months = (
        r"Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?"
        r"|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?"
    )
    # IGNORECASE only affects the month names; the rest is digits/punctuation.
    day_first = re.compile(
        r"^(0?[1-9]|1[0-9]|2[0-9]|3[0-1])([. \/-])"
        r"(0?[1-9]|1[0-2]|" + months + r")"
        r"[(. \-/](19[0-9][0-9]|20[0-9][0-9])$",
        re.IGNORECASE,
    )
    month_first = re.compile(r"(\w+)(\s)(\d+)(,)(\s|)(\d+)")
    for line in rendered_text:
        x = day_first.search(line) or month_first.search(line)
        if x:
            return x.group()
    return None

def find_if_date_and_time(corpus):
    """Find a 'Date...Time.' or 'Receipt Date' label followed by a date.

    One search over the corpus is enough: repeating the identical search
    once per character cannot change the outcome.

    Returns:
        str | None: The first matched span (label included), or ``None``.
    """
    x = re.search(r"((Date...Time.|Receipt Date)(\s|)(|:|)+.(\d+|\w+|\w)(-|/|\|.|,|\s)(\d+|\w+)(-|/|\|.|,|\s)(\d+|\w+))", corpus)
    if x:
        return x.group()
    return None

def find_if_date_time(corpus):
    """Find a 'Date and Time' label, a colon and the date that follows.

    One search over the corpus is enough: repeating the identical search
    once per character cannot change the outcome.

    Returns:
        str | None: The first matched span (label included), or ``None``.
    """
    x = re.search(r"((Date and Time)(\s|)(:)(\s|)(\d+|\w+|\w)(-|/|\|.|,|\s)(\d+|\w+)(-|/|\|.|,|\s)(\d+|\w+))", corpus)
    if x:
        return x.group()
    return None

# Main function

In [16]:
def information_extract(row):
    """Extract the five report fields from one OCR'd document.

    Every field has an ordered list of extractors. They are tried one after
    another and the first result that is not ``None`` is kept; later
    extractors are not called once a value has been found.

    Args:
        row: Object exposing ``rendered_text`` (list of OCR lines) and
            ``corpus`` (the same text as one string).

    Returns:
        pandas.Series: Keys ``provider``, ``policy_number``, ``premium_amt``,
        ``insured_name`` and ``premium_date``; a value is ``None`` when no
        extractor succeeded.
    """
    rendered = row.rendered_text
    corpus = row.corpus

    # The keyword fallback is only useful if its result is actually stored.
    provider = _first_found([
        (find_provider_reg, corpus),
        (find_provider, corpus),
    ])

    policy_number = _first_found([
        (find_policy, corpus),
        (find_policy_ex, rendered),
        (find_policy_back, rendered),
        (find_policy_sep_number, rendered),
        (find_single_policy, corpus),
        (find_policy_slash, corpus),
    ])

    premium_amt = _first_found([
        (find_premium, rendered),
        (find_premium_ex, corpus),
        (prem_amt, corpus),
        (prem_amt1, corpus),
        (find_premium_back, rendered),
        (prem_recpt, corpus),
    ])

    insured_name = find_name(rendered)

    premium_date = _first_found([
        (find_if_date_and_time, corpus),
        (find_date, rendered),
        (find_date_ex, rendered),
        (find_date_single, rendered),
        (find_if_date_time, corpus),
    ])

    return pd.Series({'provider':provider, 'policy_number':policy_number, 'premium_amt' : premium_amt, 'insured_name': insured_name, 'premium_date': premium_date})


def _first_found(attempts):
    """Call each (extractor, argument) pair in order; return the first non-None result."""
    for extractor, argument in attempts:
        value = extractor(argument)
        if value is not None:
            return value
    return None

In [17]:
# Run information_extract on every row and append the columns it returns
# (provider, policy_number, premium_amt, insured_name, premium_date) next to
# the existing columns of df_result.
df_final_result = pd.concat([df_result, df_result.apply(information_extract, axis=1)], axis=1)

In [18]:
# Show only the file name and the five extracted fields; the values still
# carry their labels at this point.
df_final_result[['file_name', 'provider', 'policy_number' , 'premium_amt', 'insured_name', 'premium_date']]

Unnamed: 0,file_name,provider,policy_number,premium_amt,insured_name,premium_date
0,1-1.jpg,SBI Life,Policy : 70000018310,Total Premium Amount *14124,Mr. Venkateswariu Garlapati,Date : 13-JUL-17
1,2-1.jpg,EXIDE Life,Policy Number 01272332,Amount paid 74182.20,Mr. K Jaganath,Date: 19-09-2017
2,3-1.jpg,ICICI PRUDENTIAL,04844225,Total Premium Paid 0.00,Mr. Vaibhav Kumar Jain,Date : 14-Mar-2018
3,4-1.jpg,Max Life,Policy Number: 265269621,1211924.00,Mr. Rajendra Prasad Yadav Mobile No.,Date: 01-Jan-2018
4,5-1.jpg,HDFC Life,Details Number: 1200045384096,amount of '25000,Premium Payor Name: K Savita Prasad Subudhi,"February 7, 2018"
5,6-1.jpg,SBI Life,Policy Number : 1K045717606,Amount Paid : 30000.00,Policy Holder Name : MOHAPATRA SATHISH KUMAR,Date and Time : 27/02/2018
6,7-1.jpg,LIC,222238185,2586.00,The following premium has been received for li...,DATE OF ISSUE :06/03/2018
7,8-1.jpg,kotak life,Policy No: 01740250,16500,,28-02-2018
8,Aegon-Life-.jpg,Aegon Life,,8931.00,Mr Ashish Shrivastava,24/07/2016
9,bajaj.jpg,Bajaj Allianz,Policy Number 0098567825,5000,MR PONMANID,Date: 14 May 2008


In [19]:
# Label fragments that the extractors return together with the value. Each
# entry is applied as a regular expression, in list order.
# NOTE(review): the entries are neither escaped nor anchored to word
# boundaries, so e.g. 'No' also removes the first two letters of 'Nov' and
# '.' matches any character - confirm this is acceptable for the input data.
words = ['http','Mrs.','Policy','Premium Payor','Number','Details','of issue','No.','No','Mr.','MR ','MR.','.co.in.','Owner Name','Receipt','Rs.','Dear',':','/Premium Paying','holder','Name','Life Insured','Premium Payor Name','Holder Name','Proposal/','Date','DATE','OF','ISSUE','Date','and','Time','DATE OF ISSUE','Mobile','Smt./Ms./Shri','Holder','The following premium has been received for life insurance policies from the userid of','following life insurance policy held by'] 

# Only the extracted fields are cleaned. Running the replacement over the
# whole frame would also rewrite path, file_name and corpus.
extracted_columns = ['provider', 'policy_number', 'premium_amt', 'insured_name', 'premium_date']
for i in words:
    df_final_result[extracted_columns] = df_final_result[extracted_columns].replace(to_replace=i, value="", regex=True)


# Drop stray brackets and similar punctuation. regex=True is passed
# explicitly: without it newer pandas treats the character class as literal
# text and removes nothing.
df_final_result['provider']=df_final_result['provider'].str.replace(r"[\(\)\{\}<>/]", "", regex=True)
df_final_result['policy_number']=df_final_result['policy_number'].str.replace(r"[\(\)\{\}<>|]", "", regex=True)
df_final_result['insured_name']=df_final_result['insured_name'].str.replace(r"[\(\)\{\}<>,']", "", regex=True)
df_final_result['premium_date']=df_final_result['premium_date'].str.replace(r"[\(\)\{\}<>]", "", regex=True)


# Keep only digits, commas and dots in the premium amount. Non-string values
# (missing amounts) pass through untouched.
df_final_result['premium_amt'] = df_final_result['premium_amt'].map(
    lambda amount: re.sub('[^0-9,.]', "", amount) if isinstance(amount, str) else amount
)


In [20]:
def trimAllColumns(df_final_result):
    """Return a copy of the frame with whitespace stripped from string cells.

    Args:
        df_final_result: DataFrame to clean; it is not modified.

    Returns:
        pandas.DataFrame: Same shape; every ``str`` cell is stripped of
        leading/trailing whitespace, all other cells are unchanged.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use the
    # new name when the installed pandas has it and fall back otherwise.
    if hasattr(type(df_final_result), 'map'):
        return df_final_result.map(_strip_if_str)
    return df_final_result.applymap(_strip_if_str)

# Strip leading/trailing whitespace from every string cell, then show the
# file name and the extracted fields.
df_final_result = trimAllColumns(df_final_result)
df_final_result[['file_name', 'provider', 'policy_number' , 'premium_amt', 'insured_name', 'premium_date']]

Unnamed: 0,file_name,provider,policy_number,premium_amt,insured_name,premium_date
0,1-1.jpg,SBI Life,70000018310,14124.0,Venkateswariu Garlapati,13-JUL-17
1,2-1.jpg,EXIDE Life,01272332,74182.2,K Jaganath,19-09-2017
2,3-1.jpg,ICICI PRUDENTIAL,04844225,0.0,Vaibhav Kumar Jain,14-Mar-2018
3,4-1.jpg,Max Life,265269621,1211924.0,Rajendra Prasad Yadav,01-Jan-2018
4,5-1.jpg,HDFC Life,1200045384096,25000.0,K Savita Prasad Subudhi,"February 7, 2018"
5,6-1.jpg,SBI Life,1K045717606,30000.0,MOHAPATRA SATHISH KUMAR,27/02/2018
6,7-1.jpg,LIC,222238185,2586.0,Deepansh Arora,06/03/2018
7,8-1.jpg,kotak life,01740250,16500.0,,28-02-2018
8,Aegon-Life-.jpg,Aegon Life,,8931.0,Ashish Shrivastava,24/07/2016
9,bajaj.jpg,Bajaj Allianz,0098567825,5000.0,PONMANID,14 May 2008


In [21]:
def _parse_premium_date(value):
    """Parse one extracted date (day first); return None when it is not parseable."""
    try:
        return pd.to_datetime(value, dayfirst=True)
    except ValueError:
        return None


# Each value is parsed on its own because the documents use different date
# layouts. Mapping over the column (instead of writing through .loc with a
# running counter) does not depend on the frame having a 0..n-1 index.
parsed_dates = df_final_result['premium_date'].map(_parse_premium_date)

df_final_result['premium_date'] = pd.to_datetime(parsed_dates, dayfirst=True).dt.date
df_final_result[['file_name', 'provider', 'policy_number' , 'premium_amt', 'insured_name', 'premium_date']]

Unnamed: 0,file_name,provider,policy_number,premium_amt,insured_name,premium_date
0,1-1.jpg,SBI Life,70000018310,14124.0,Venkateswariu Garlapati,2017-07-13
1,2-1.jpg,EXIDE Life,01272332,74182.2,K Jaganath,2017-09-19
2,3-1.jpg,ICICI PRUDENTIAL,04844225,0.0,Vaibhav Kumar Jain,2018-03-14
3,4-1.jpg,Max Life,265269621,1211924.0,Rajendra Prasad Yadav,2018-01-01
4,5-1.jpg,HDFC Life,1200045384096,25000.0,K Savita Prasad Subudhi,2018-02-07
5,6-1.jpg,SBI Life,1K045717606,30000.0,MOHAPATRA SATHISH KUMAR,2018-02-27
6,7-1.jpg,LIC,222238185,2586.0,Deepansh Arora,2018-03-06
7,8-1.jpg,kotak life,01740250,16500.0,,2018-02-28
8,Aegon-Life-.jpg,Aegon Life,,8931.0,Ashish Shrivastava,2016-07-24
9,bajaj.jpg,Bajaj Allianz,0098567825,5000.0,PONMANID,2008-05-14


In [22]:
# Write the file name and the extracted fields to output.csv (relative path),
# leaving out the DataFrame index.
df_final_result[['file_name', 'provider', 'policy_number' , 'premium_amt', 'insured_name', 'premium_date']].to_csv('output.csv', index=False)