In [1]:
import pandas as pd
import re
import numpy as np

def find_address(text_df, state_abbrevs=None, lookback=10):
    '''Finds the first street address in a tokenised document.

    An address is recognised from its tail: a two-letter, upper-case token that
    is a known state abbreviation, immediately followed by a token containing
    five consecutive digits (the zipcode). The address start is the all-digit
    token closest to that tail within the `lookback` tokens before the zipcode.

    Args:
        text_df: dataframe whose column 0 holds the text split by space.
            Assumes the default 0..n-1 integer index, since neighbouring
            tokens are reached by adding 1 to an index label.
        state_abbrevs: optional collection of valid state abbreviations. When
            omitted, the notebook-level `initials['Abbreviation']` is used.
        lookback: how many tokens before the zipcode are searched for the
            numeric token that starts the address.

    Returns:
        address: the first address in document order, from the numeric start
            token through the zipcode token joined by single spaces, or None
            when no complete address is found.'''

    tokens = text_df[0]
    if state_abbrevs is None:
        # Fall back to the lookup table loaded at notebook level.
        state_abbrevs = initials['Abbreviation']

    # Tokens that are exactly a known 2 letter abbreviation
    is_state = tokens.str.contains(r'\A[A-Z]{2}\b', na=False) & tokens.isin(state_abbrevs)

    # The token after each abbreviation is the zipcode candidate; drop
    # candidates that would fall past the end of the document.
    next_inds = tokens[is_state].index + 1
    next_inds = next_inds[next_inds.isin(tokens.index)]
    if len(next_inds) == 0:
        return None

    has_zip = tokens.loc[next_inds].str.contains(r'\d{5}', na=False)
    # Sorted so that "first address" means first in document order.
    end_inds = sorted(has_zip[has_zip].index)

    for end_ind in end_inds:
        start_ind = None
        # Clamp the window at the start of the document.
        for ind in range(max(end_ind - lookback, 0), end_ind):
            if ind in tokens.index and str(tokens.loc[ind]).isdigit():
                # Keep overwriting: the numeric token nearest the zipcode wins.
                start_ind = ind
        if start_ind is not None:
            return " ".join(tokens.loc[start_ind:end_ind].values)

    # State + zipcode pairs were found, but none had a numeric start token.
    return None

def find_order_no(text_df):
    ''' Finds the bigram `Title No.:` or `GF No.`, and extracts the token after it.

    The input dataframe is left untouched; the bigrams are built in a local
    series rather than being written back as an extra column.

    Args:
        text_df: dataframe whose column 0 holds the text split by space

    Returns:
        order_no: string of the title policy order number, or None when no
            header bigram is present or nothing follows the header'''

    headers = ['Title No.:', 'GF No.']
    tokens = text_df[0]

    # Bigram ending at each position: previous token + " " + current token.
    # The first position has no previous token, so its bigram is missing.
    bigrams = tokens.shift() + " " + tokens
    is_header = bigrams.isin(headers)
    if not is_header.any():
        return None

    # Work positionally so "the next token" does not depend on index labels.
    next_pos = np.flatnonzero(is_header.to_numpy())[0] + 1
    if next_pos >= len(tokens):
        # The header is the very last bigram; there is no value to return.
        return None
    return tokens.iloc[next_pos]

def find_cur_owner(line_text):
    '''Finds the current owner of the interest in the land.

    Locates the first line that contains every one of the words "date",
    "effective", "land", "title" and "vested" (case-insensitive) and returns
    the line that physically follows it.

    Args:
        line_text: dataframe of the text split by line, in column '0'

    Returns:
        cur_owner: the content of the line after the matching line, or None
            when no line matches or the matching line is the last one'''

    lines = line_text['0']
    vested_strs = ['date', 'effective', 'land', 'title', 'vested']

    # Plain boolean array: avoids index alignment and treats missing lines as
    # non-matching.
    matches = np.ones(len(lines), dtype=bool)
    for word in vested_strs:
        matches &= lines.str.contains(word, case=False, na=False).to_numpy(dtype=bool)

    positions = np.flatnonzero(matches)
    if positions.size == 0:
        return None

    # Step to the next row by position, not by label: the frame may carry
    # duplicate or non-contiguous index labels (e.g. after pd.concat).
    next_pos = positions[0] + 1
    if next_pos >= len(lines):
        return None
    return lines.iloc[next_pos]

def find_eff_date(line_text):
    '''Finds the effective date of the policy. Takes the first line containing
    both "effective" and "date" (case-insensitive), and returns everything
    after the first colon on that line.

    Args:
        line_text: dataframe of the text split by line, in column '0'

    Returns:
        eff_date: date string of the current effective date with surrounding
            whitespace stripped, or None when no line matches or the matching
            line has no colon'''

    lines = line_text['0']
    eff_strs = ['effective', 'date']

    # Missing lines count as non-matching.
    matches = np.ones(len(lines), dtype=bool)
    for word in eff_strs:
        matches &= lines.str.contains(word, case=False, na=False).to_numpy(dtype=bool)

    positions = np.flatnonzero(matches)
    if positions.size == 0:
        return None

    # Split on the first colon only, so a time such as "8:00 AM" inside the
    # date text is kept intact.
    _, colon, remainder = lines.iloc[positions[0]].partition(":")
    if not colon:
        return None
    return remainder.strip()

def find_comp_name(line_text):
    '''Finds the title company name.

    Takes the first line containing "title company" or "insurance company"
    (case-insensitive), locates that bigram among the line's words, and
    extends the name backwards over the capitalised words directly before it,
    up to four words before the bigram.

    Args:
        line_text: dataframe of the text split by line, in column '0'

    Returns:
        comp_name: name of the title company, or None when no line mentions
            one of the company terms'''

    company_terms = ['title company', 'insurance company']

    # regex=True is required: on recent pandas the default is a literal
    # replace, which would leave runs of whitespace untouched.
    clean_lines = line_text['0'].str.replace(r'\s+', ' ', regex=True).str.strip()

    # Get string containing the company name
    has_term = clean_lines.str.contains("|".join(company_terms), case=False, na=False)
    if not has_term.any():
        return None
    words = clean_lines[has_term].iloc[0].split(" ")

    # Inside string, look for the first bigram containing one of the terms;
    # `end` is the position of the bigram's second word.
    end = None
    for pos in range(1, len(words)):
        bigram = (words[pos - 1] + " " + words[pos]).lower()
        if any(term in bigram for term in company_terms):
            end = pos
            break
    if end is None:
        return None

    # Walk backwards from the bigram while the preceding word starts with a
    # capital letter. Stop at the start of the line or after four extra words.
    start = end - 1
    lowest = max(end - 5, 0)
    while start > lowest and words[start - 1][:1].isupper():
        start -= 1
    return " ".join(words[start:end + 1])
        
def find_title_info(text):
    '''Extracts the key fields of a title policy from its text lines.

    The lines are used twice: joined and split on whitespace into a word-level
    dataframe for the token-based finders (address, order number), and passed
    through unchanged to the line-based finders (owner, date, company).

    Args:
        text: dataframe of the text split by line, in column '0'

    Returns:
        title_info: dict with keys 'address', 'order_no', 'cur_owner',
            'eff_date' and 'comp_name'; each value is whatever the
            corresponding find_* function returns (None when not found)'''

    # Skip missing lines so the join cannot fail on a non-string value.
    lines = text['0'].dropna().astype(str)

    # split() with no argument collapses every whitespace run and never yields
    # empty tokens at the edges.
    tokens = " ".join(lines.values).split()
    text_df = pd.DataFrame({0: tokens})

    title_info = {}
    title_info['address'] = find_address(text_df)
    title_info['order_no'] = find_order_no(text_df)
    title_info['cur_owner'] = find_cur_owner(text)
    title_info['eff_date'] = find_eff_date(text)
    title_info['comp_name'] = find_comp_name(text)

    return title_info

In [2]:
# Lookup table read by find_address through the module-level name `initials`;
# it must provide an 'Abbreviation' column.
initials = pd.read_csv('StateInitials.csv')

# Line-level text of the two document sections, one dataframe each.
scha = pd.read_csv('df_SCHEDULE A.csv', index_col = 0)
le = pd.read_csv('df_LEGAL DESCRIPTION.csv', index_col = 0)

# ignore_index gives the combined frame unique 0..n-1 row labels; without it
# both sections keep their own labels, and label-based lookups such as
# "row after the match" become ambiguous.
text = pd.concat([scha, le], ignore_index=True)

find_title_info(text)

{'address': '13209 N 129th Drive, El Mirage, AZ 85335',
 'comp_name': 'Chicago Title Insurance Company',
 'cur_owner': '2013-1 IH BORROWER L.P., A DELAWARE LIMITED PARTNERSHIP',
 'eff_date': '9th  day  of  March,  2017',
 'order_no': 'AZP'}