In [None]:
import pandas as pd

# Add the parent directory of the 'utils' directory to the Python path
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


import utils.utils as u
import utils.utils_people as up
import utils.utils_entities as ue

-------------------------------------------------------------------------------------------------------------------

# Assignation of Owners from model output

-------------------------------------------------------------------------------------------------------------------

Delete the `model_output` file left over from a previous run (if it exists)

In [None]:
# Path (relative to the notebook's working directory) of the model output file for this step.
model_output = 'model_output_step6.json'
# Remove a stale copy, if any; the existence check avoids an error when the file is absent.
if os.path.exists(model_output):
    os.remove(model_output)
    

#### Load dataset of unresolved entries

In [None]:
# Full dataset as produced by the previous pipeline step.
dataset_DF = pd.read_json('../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step5.json')

# Entries whose owner code is still the 'TODO' placeholder are the ones left to resolve.
# Work on a copy so that normalising the text does not touch dataset_DF itself.
is_unresolved = dataset_DF['owner_code'] == 'TODO'
dataset_unresolved_DF = dataset_DF[is_unresolved].copy()

# Normalise the owner text with the project helper (applied element-wise).
dataset_unresolved_DF.loc[:, 'owner_name'] = dataset_unresolved_DF['owner_name'].map(u.text_to_minimal)

unresolved_total = len(dataset_unresolved_DF)
print("There are", unresolved_total, "unresolved parcel entries.")

In [None]:
# Preview the first unresolved entries (owner_name already normalised).
dataset_unresolved_DF.head()

#### Load patterns from model output

In [None]:
# Patterns extracted for the unresolved entries; the assignation loop below reads
# the 'uidx', 'owner_text' and 'patterns' fields of each row.
unresolved_patterns_DF = pd.read_json('../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step6_patterns.json')
# Display the frame for a visual check.
unresolved_patterns_DF

#### Load necessary dictionaries

In [None]:
def _load_terms(dictionary_dir, name):
    """Load `../dictionaries/<dictionary_dir>/<name>.json` as a list sorted longest-first.

    The file is read with `pd.read_json` and column 0 is taken as the list of terms.
    NOTE(review): longest-first ordering presumably lets matching helpers try the
    longest term before any shorter term it contains - confirm against the utils.
    """
    terms = pd.read_json(f'../dictionaries/{dictionary_dir}/{name}.json')[0].to_list()
    return sorted(terms, key=len, reverse=True)


def _longest_unlinked(unlinked_df, entities_type):
    """Return the longest 'entities_unlinked' value among rows of the given 'entities_type'.

    Raises IndexError when no row has that type (same as the inline expression it replaces).
    """
    candidates = unlinked_df[unlinked_df['entities_type'] == entities_type]['entities_unlinked']
    return sorted(candidates, key=len, reverse=True)[0]


# ---- People dictionaries ----
family_names = _load_terms('PPL_dictionary', 'family_names')
family_names_comp = _load_terms('PPL_dictionary', 'family_names_comp')
first_names = _load_terms('PPL_dictionary', 'first_names')
mestieri = _load_terms('PPL_dictionary', 'mestieri')
titles = _load_terms('PPL_dictionary', 'titles')
unknown_relatives = _load_terms('PPL_dictionary', 'unknown_relatives')
unknown_relatives_sing = _load_terms('PPL_dictionary', 'unknown_relatives_sing')
unknown_relatives_plur = _load_terms('PPL_dictionary', 'unknown_relatives_plur')

# ---- Entity dictionaries ----
venezia_entities = _load_terms('ENT_dictionary', 'venezia_entities')
venezia_titles_entities = _load_terms('ENT_dictionary', 'venezia_titles_entities')
guild_entities = _load_terms('ENT_dictionary', 'guild_entities')
jew_entities = _load_terms('ENT_dictionary', 'jew_entities')
religious_entities = _load_terms('ENT_dictionary', 'religious_entities')
religious_titles_entities = _load_terms('ENT_dictionary', 'religious_titles_entities')
scuole_grandi_entities = _load_terms('ENT_dictionary', 'scuole_grandi_entities')
scuole_mestieri_entities = _load_terms('ENT_dictionary', 'scuole_mestieri_entities')
scuole_religious_entities = _load_terms('ENT_dictionary', 'scuole_religious_entities')
social_care_entities = _load_terms('ENT_dictionary', 'social_care_entities')
entities = _load_terms('ENT_dictionary', 'entities')

# ---- Unlinked entities (one row per entity type) ----
unlinked = pd.read_json('../dictionaries/ENT_dictionary/unlinked_entities.json')
venezia_entities_unlinked = _longest_unlinked(unlinked, 'venezia_entities')
religious_entities_unlinked = _longest_unlinked(unlinked, 'religious_entities')
religious_titles_entities_unlinked = _longest_unlinked(unlinked, 'religious_titles_entities')
scuole_entities_unlinked = _longest_unlinked(unlinked, 'scuole_entities')
social_care_entities_unlinked = _longest_unlinked(unlinked, 'social_care_entities')
other = _longest_unlinked(unlinked, 'other')

# Concatenation of all unlinked groups, in the same order as before.
unlinked_entities = (venezia_entities_unlinked + religious_entities_unlinked +
                     religious_titles_entities_unlinked + scuole_entities_unlinked +
                     social_care_entities_unlinked + other)

# ---- Miscellaneous dictionaries ----
ignore_next = _load_terms('MSC_dictionary', 'ignore_next')
cities = _load_terms('MSC_dictionary', 'cities')
places = _load_terms('MSC_dictionary', 'places')

# Family names that are also a title, a mestiere or an entity. A set gives O(1)
# membership checks; the order of `collisions` still follows `family_names`.
_collision_terms = set(titles) | set(mestieri) | set(entities)
collisions = [c for c in family_names if c in _collision_terms]

## Start parcel assignation

In [None]:
# Accumulator for the resolved entries; the assignation loop appends one dict per resolved parcel.
modifications_to_commit = []

#### Define a non-strict is_consecutive function

This function checks that two words are consecutive.

It is non-strict: it tolerates a gap of up to 10 characters between the two words, which leaves room for roughly one intervening word (e.g. a title or a mestiere).

In [None]:
def is_consecutive(owner_end_idx, pattern_start_idx, max_gap=10):
    """Tell whether a pattern starts close enough to the end of the current owner text.

    Args:
        owner_end_idx: end index of the owner assembled so far, or -1 when no
            owner has been assigned yet.
        pattern_start_idx: start index of the candidate pattern.
        max_gap: largest accepted absolute distance (in characters) between the
            two indices. The default of 10 leaves room for about one word in between.

    Returns:
        True when no owner has been assigned yet (owner_end_idx == -1) or when the
        absolute distance between the two indices is at most `max_gap`.
    """
    # First assignation: there is nothing to be consecutive to yet.
    if owner_end_idx == -1:
        return True

    return abs(pattern_start_idx - owner_end_idx) <= max_gap

#### Update owner function

This function assigns the patterns found for each owner in the entry. It is called in the main loop for each owner found by the model.

In [None]:
def update_owner(owner, pattern, pattern_label, pattern_start_idx, pattern_end_idx):
    """Fold one labelled pattern into the owner dict being assembled.

    The dict is modified in place and also returned.

    Args:
        owner: dict with at least the keys 'owner', 'owner_label', 'owner_end_idx',
            'title' and 'mestiere'; '-' marks a value that is not set yet.
        pattern: text of the pattern.
        pattern_label: label of the pattern ('ENT', 'FIRST_NAME', 'LAST_NAME',
            'FIRST_AND_LAST_NAME', 'TITLE', 'MESTIERE', or anything else).
        pattern_start_idx: start index of the pattern.
        pattern_end_idx: end index of the pattern.

    Returns:
        The (mutated) owner dict, or None when the pattern could not be merged
        into an already-named owner (incompatible labels, or not consecutive
        according to `is_consecutive`).
    """
    current_label = owner['owner_label']
    current_text = owner['owner']

    # Labels that decorate an owner rather than name it, and the field they fill.
    attribute_field = {'TITLE': 'title', 'MESTIERE': 'mestiere'}

    def _accumulate(field):
        # The first value replaces the '-' placeholder; later ones are joined with ' & '.
        owner[field] = pattern if owner[field] == '-' else f"{owner[field]} & {pattern}"

    # ======== INIT: no owner assigned yet ========
    if current_label == '-':
        if pattern_label in ('ENT', 'LAST_NAME', 'FIRST_NAME', 'FIRST_AND_LAST_NAME'):
            owner['owner'] = pattern
            owner['owner_label'] = pattern_label
            owner['owner_start_idx'] = pattern_start_idx
            owner['owner_end_idx'] = pattern_end_idx
        elif pattern_label in attribute_field:
            _accumulate(attribute_field[pattern_label])
        # Any other label leaves the owner untouched.
        return owner

    # ======== ENTITY: an entity owner absorbs nothing further ========
    if current_label == 'ENT':
        return owner

    # ======== TITLE / MESTIERE on an already-named person ========
    if pattern_label in attribute_field:
        _accumulate(attribute_field[pattern_label])
        return owner

    # ======== NAME MERGING ========
    # (current owner label, incoming pattern label) -> label after the merge.
    # Combinations missing from the table cannot be merged.
    label_after_merge = {
        ('FIRST_NAME', 'FIRST_NAME'): 'FIRST_NAME',
        ('FIRST_NAME', 'LAST_NAME'): 'FIRST_AND_LAST_NAME',
        ('FIRST_NAME', 'FIRST_AND_LAST_NAME'): 'FIRST_AND_LAST_NAME',
        ('LAST_NAME', 'FIRST_NAME'): 'FIRST_AND_LAST_NAME',
        ('LAST_NAME', 'LAST_NAME'): 'LAST_NAME',
        ('LAST_NAME', 'FIRST_AND_LAST_NAME'): 'FIRST_AND_LAST_NAME',
        ('FIRST_AND_LAST_NAME', 'LAST_NAME'): 'FIRST_AND_LAST_NAME',
        ('FIRST_AND_LAST_NAME', 'FIRST_NAME'): 'FIRST_AND_LAST_NAME',
    }.get((current_label, pattern_label))

    # The proximity check only runs for label pairs that can be merged at all.
    if label_after_merge is None or not is_consecutive(owner['owner_end_idx'], pattern_start_idx):
        return None

    owner['owner'] = f"{current_text} {pattern}"
    owner['owner_end_idx'] = pattern_end_idx
    owner['owner_label'] = label_after_merge
    return owner
    

In [None]:
# Running count of resolved parcels; incremented by the assignation loop.
count_resolved = 0

#### Pattern separator function

This function splits the list of patterns at every "separator". A separator indicates that more than one person or entity owns the parcel.

In [None]:
def pattern_separator(patterns):
    """Split a list of pattern dicts into one group per owner.

    Every pattern whose 'label' is "SEPARATOR" closes the current group and opens
    a new one; the separators themselves are dropped.

    Args:
        patterns: list of dicts, each with a 'label' key.

    Returns:
        A list of lists of patterns. There is always one more group than there are
        separators, so groups may be empty (an empty input yields `[[]]`).
    """
    groups = [[]]
    for candidate in patterns:
        if candidate['label'] == "SEPARATOR":
            # Boundary between two owners: start collecting a fresh group.
            groups.append([])
        else:
            groups[-1].append(candidate)
    return groups

#### First name correction for related owners

Some names appear in both the first-name and the family-name dictionaries, so the model may label a first name as a family name (and vice versa). This function decides whether such an overlapping name should be treated as a first name or as a family name.

In [None]:
def adjust_first_name_for_related_owners(owners):
    """Relabel ambiguous LAST_NAME owners as FIRST_NAME when a later owner carries a surname.

    An owner labelled 'LAST_NAME' whose text is also in the global `first_names`
    list is switched to 'FIRST_NAME' if any owner after it is labelled
    'LAST_NAME' or 'FIRST_AND_LAST_NAME'.

    Args:
        owners: list of owner dicts with 'owner' and 'owner_label' keys; the
            dicts are modified in place.

    Returns:
        The same `owners` list.
    """
    # A single owner has nobody to be related to.
    if len(owners) == 1:
        return owners

    surname_bearing = ('LAST_NAME', 'FIRST_AND_LAST_NAME')
    for position, candidate in enumerate(owners):
        if candidate['owner_label'] != 'LAST_NAME' or candidate['owner'] not in first_names:
            continue
        followers = owners[position + 1:]
        if any(follower['owner_label'] in surname_bearing for follower in followers):
            candidate['owner_label'] = 'FIRST_NAME'

    return owners


#### Assignation

This function loops through all the unresolved entries and calls `update_owner()` to create owner objects. 

Only parcels with 4 or fewer owners are processed: parcels with more than 4 owners are very rare and their entries are frequently irregular, which causes problems. To be safe, all entries with more than 4 owners are left as TODO (to be resolved by hand).

In [None]:
# Reset the accumulators so that re-running this cell does not append duplicate
# modifications or double-count resolved parcels.
modifications_to_commit = []
count_resolved = 0

# Index the pattern rows by uidx once instead of scanning the whole frame for
# every entry. `setdefault` keeps the first row per uidx, like `.iloc[0]` did.
patterns_by_uidx = {}
for _, candidate_row in unresolved_patterns_DF.iterrows():
    patterns_by_uidx.setdefault(candidate_row['uidx'], candidate_row)

for index, row in dataset_unresolved_DF.iterrows():

    uidx = row['uidx']
    pattern_row = patterns_by_uidx.get(uidx)
    # Entries without model patterns are skipped silently.
    if pattern_row is None:
        continue

    print('===================================================')
    print("UIDX", row['uidx'])
    print("TEXT", row['owner_name'])

    # Each separator introduces one additional owner.
    num_of_owners = sum(1 for pattern in pattern_row['patterns'] if pattern['label'] == 'SEPARATOR') + 1
    # only manage when number of owners is less than or equal to 4
    if num_of_owners > 4:
        continue

    owners = []
    idx_in_parcel = 0

    # Unknown-relative patterns are never passed to update_owner. Only one that
    # directly follows a separator is recorded (the last such one wins); any
    # other unknown-relative pattern is dropped.
    unknown_relative_pattern = ''
    unknown_relative_owner_count = ''
    raw_patterns = []
    for i, p in enumerate(pattern_row['patterns']):
        if p['label'] == 'UNKNOWN_RELATIVES_P' or p['label'] == 'UNKNOWN_RELATIVES_S':
            if i > 0 and pattern_row['patterns'][i-1]['label'] == 'SEPARATOR':
                unknown_relative_pattern = f"_{p['pattern']}"
                unknown_relative_owner_count = '2+' if p['label'] == 'UNKNOWN_RELATIVES_P' else '1'
        else:
            raw_patterns.append(p)

    patterns_per_owner = pattern_separator(raw_patterns)
    print("--------------- owners ---------------")

    for ppo in patterns_per_owner:
        print('------- owner patterns -------')

        # Fresh owner record; '-' / -1 mark values that are not set yet.
        owner = { "idx_in_parcel": '-', "owner_label": '-', "owner": '-', "owner_start_idx": -1, "owner_end_idx": -1, "title": '-', "mestiere": '-' }

        for p in ppo:
            print("•", p)
            res = update_owner(owner, p['pattern'], p['label'], p['start_idx'], p['end_idx'])
            # None means the pattern could not be merged; keep the owner as it is.
            if res is not None:
                owner = res

        owner['idx_in_parcel'] = idx_in_parcel

        # Groups that never produced a name or an entity are discarded.
        if owner['owner_label'] != '-':
            owners.append(owner)
            idx_in_parcel += 1

    if len(owners) > 0:
        owners = adjust_first_name_for_related_owners(owners)
        modification = {
            "uidx": uidx,
            "owner_text": pattern_row['owner_text'],
            "owners": owners,
            "unknown_relative_owner": { "pattern": unknown_relative_pattern, "owner_count_remark": unknown_relative_owner_count  }
        }

        count_resolved += 1
        modifications_to_commit.append(modification)

print()
print()
print("------------------------------------------------------------")
print("Total number of resolved parcels in this assignation: ", count_resolved)

#### Utils for standardisation

In [None]:
def remove_extra_separators(s_):
    """Drop one trailing ' | ' separator (if present) and normalise spaces.

    Args:
        s_: pipe-separated string, possibly ending with ' | '.

    Returns:
        The string without the trailing separator, passed through
        `u.remove_extra_spaces`.
    """
    trailing = ' | '
    trimmed = s_[:-len(trailing)] if s_.endswith(trailing) else s_
    return u.remove_extra_spaces(trimmed)

def remove_extra_name_separators(s_):
    """Drop a trailing '| - ' placeholder, then a trailing ' | ', and normalise spaces.

    Args:
        s_: pipe-separated string of names.

    Returns:
        The cleaned string, passed through `remove_extra_separators` and
        `u.remove_extra_spaces`.
    """
    dangling_placeholder = '| - '
    if s_.endswith(dangling_placeholder):
        trimmed = s_[:-len(dangling_placeholder)]
    else:
        trimmed = s_
    return u.remove_extra_spaces(remove_extra_separators(trimmed))

def clean_codes(s_):
    """Normalise a pipe-separated string of owner codes.

    Args:
        s_: owner codes separated by '|', possibly with a trailing ' | '.

    Returns:
        The single code when every entry is the same code; otherwise all entries
        (duplicates included) joined with ' | '.
    """
    parts = [u.remove_extra_spaces(code) for code in remove_extra_separators(s_).split('|')]
    # Collapse to one code only when all owners share it.
    if len(set(parts)) == 1:
        return parts[0]
    return ' | '.join(parts)
    
def clean_first_names(s_):
    """Standardise a pipe-separated string of first names.

    Entries whose first word starts with '_' have their words joined with '_';
    every other entry gets each word capitalised.

    Args:
        s_: first names separated by '|', '-' marking a missing name.

    Returns:
        '' when the string holds nothing but '-', '|' and spaces; otherwise the
        formatted entries joined with ' | '.
    """
    s = remove_extra_name_separators(s_)
    without_markers = s.replace('-', '').replace('|', '').replace(' ', '')
    if len(u.remove_extra_spaces(without_markers)) == 0:
        return ''

    formatted = []
    for chunk in s.split('|'):
        words = u.remove_extra_spaces(chunk).split(' ')
        if words[0].startswith('_'):
            formatted.append('_'.join(words))
        else:
            formatted.append(' '.join(word.capitalize() for word in words))

    return u.remove_extra_spaces(' | '.join(formatted))
    
def clean_family_names(s_):
    """Standardise a pipe-separated string of family names to upper case.

    Args:
        s_: family names separated by '|', '-' marking a missing name.

    Returns:
        '' when the string holds nothing but '-', '|' and spaces; otherwise the
        upper-cased entries joined with ' | '.
    """
    s = remove_extra_name_separators(s_)
    without_markers = s.replace('-', '').replace('|', '').replace(' ', '')
    if len(u.remove_extra_spaces(without_markers)) == 0:
        return ''

    upper_cased = ' | '.join(u.remove_extra_spaces(chunk).upper() for chunk in s.split('|'))
    return u.remove_extra_spaces(upper_cased)

def clean_entities(s_):
    """Standardise a pipe-separated string of entity names to upper case.

    Args:
        s_: entity names separated by '|'.

    Returns:
        '' when the string holds nothing but '-', '|' and spaces; otherwise the
        upper-cased entries joined with ' | '.
    """
    s = remove_extra_name_separators(s_)
    without_markers = s.replace('-', '').replace('|', '').replace(' ', '')
    if len(u.remove_extra_spaces(without_markers)) == 0:
        return ''

    upper_cased = ' | '.join(u.remove_extra_spaces(chunk).upper() for chunk in s.split('|'))
    return u.remove_extra_spaces(upper_cased)

def clean_titles_or_mestieri(s_):
    """Standardise a pipe-separated string of titles or mestieri.

    Args:
        s_: titles or mestieri separated by '|', '-' marking a missing value.

    Returns:
        '' when the string holds nothing but '-', '|' and spaces; otherwise the
        string with trailing separators removed and spaces normalised.
    """
    s = remove_extra_name_separators(s_)
    only_markers = len(s.replace('-', '').replace('|', '').replace(' ', '')) == 0
    return '' if only_markers else u.remove_extra_spaces(s)

#### Utils for name and count fixing

These utils fill in information that is "omitted" from the owner entry. For instance, they adjust the owner count when plural relatives are mentioned, and they propagate a family name to family members for whom it was not repeated.

In [None]:
def find_corresponding_family_name(idx, lns, accept_previous=False):
    """Return the first family name in `lns` that is not the '-' placeholder.

    Args:
        idx: position of the owner whose family name is missing.
        lns: list of family names, '-' marking a missing one.
        accept_previous: when False, only positions after `idx` are searched;
            when True, the search starts from the beginning of the list.

    Returns:
        The first non-'-' family name found, or None when there is none.
    """
    first_position = 0 if accept_previous else idx + 1
    return next(
        (lns[position] for position in range(first_position, len(lns)) if lns[position] != '-'),
        None,
    )
    
def assign_common_last_name_to_family_members(first_names, last_names):
    """Give owners that have a first name but no family name a family name from the list.

    A family name found after the owner is preferred; if there is none, the
    first family name in the whole list is used. Filled-in names are visible to
    the owners processed after them.

    Args:
        first_names: pipe-separated first names ('-' marks a missing one).
        last_names: pipe-separated family names ('-' marks a missing one).

    Returns:
        The family names joined with ' | ', or None when the two cleaned lists
        do not have the same length.
    """
    fns = [u.remove_extra_spaces(chunk) for chunk in clean_first_names(first_names).split('|')]
    lns = [u.remove_extra_spaces(chunk) for chunk in clean_family_names(last_names).split('|')]

    if len(fns) != len(lns):
        return None

    for position, first in enumerate(fns):
        # Only owners with a first name and a missing family name need filling in.
        if first == '-' or lns[position] != '-':
            continue
        inherited = find_corresponding_family_name(position, lns, accept_previous=False)
        if inherited is None:
            inherited = find_corresponding_family_name(position, lns, accept_previous=True)
        if inherited is not None:
            lns[position] = inherited

    return ' | '.join(lns)

def adjust_owner_count_per_unknown_relative(owner_count, add_count, o_first_names, o_last_names, o_titles, pattern):
    """Account for an unnamed relative (e.g. fratello/i) in the count and the name strings.

    Args:
        owner_count: number of owners found so far.
        add_count: '2+' for a plural relative, '1' for a singular one.
        o_first_names: ' | '-terminated string of first names.
        o_last_names: ' | '-terminated string of family names.
        o_titles: ' | '-terminated string of titles.
        pattern: text appended to the first names to stand for the relative.

    Returns:
        dict with keys 'owner_count', 'owner_count_remark', 'o_first_names',
        'o_last_names' and 'o_titles' holding the adjusted values.
    """
    relative_is_plural = add_count == '2+'
    relative_is_single = add_count == '1'
    empty_slot = '- | '

    if o_first_names.endswith(empty_slot):
        # The relative sits between a first name and a family name: it takes over
        # the empty first-name slot of an owner that is already counted.
        new_first_names = o_first_names[:-len(empty_slot)]
        new_last_names = o_last_names
        new_titles = o_titles
        new_count = owner_count + 1 if relative_is_plural else owner_count
        remark = f"{new_count}+" if relative_is_plural else ''
    else:
        # The relative comes at the end of the owner text: open new slots for it.
        new_first_names = o_first_names
        new_last_names = o_last_names + empty_slot
        if relative_is_plural:
            new_count = owner_count + 2
            new_titles = o_titles + empty_slot
        elif relative_is_single:
            new_count = owner_count + 1
            new_titles = o_titles + empty_slot
        else:
            new_count = owner_count
            new_titles = o_titles
        # NOTE(review): the remark gets a '+' here even for a singular relative,
        # unlike the branch above - confirm this is intended.
        remark = f"{new_count}+"

    new_first_names += f"{pattern} | "
    return {
        "owner_count": new_count,
        "owner_count_remark": remark,
        "o_first_names": new_first_names,
        "o_last_names": new_last_names,
        "o_titles": new_titles,
    }

In [None]:
# Turn every resolved entry into the pipe-separated owner columns of dataset_DF.
# One ' | '-terminated slot is appended per owner to each column string; the
# clean_* helpers strip the trailing separator before the values are written.
for m in modifications_to_commit:
    m_uidx = m['uidx']

    owner_count = len(m['owners'])
    owner_count_remark = ''
    owner_last_names = ''
    owner_first_names = ''
    owner_codes = ''
    owner_titles = ''
    owner_mestieri = ''
    owner_entities = ''

    for o in m['owners']:

        if o['owner_label'] == 'ENT':
            e_code = ue.get_entity_code_from_mention(o['owner'].lower(), guild_entities, jew_entities, other, religious_entities_unlinked, religious_entities, religious_titles_entities_unlinked, religious_titles_entities, scuole_entities_unlinked, scuole_grandi_entities, scuole_mestieri_entities, scuole_religious_entities, social_care_entities_unlinked, social_care_entities, venezia_entities, venezia_titles_entities, venezia_entities_unlinked)
            owner_codes += f"{e_code}_m | "

            if ue.is_title_code(e_code):
                owner_titles += o['owner'] + ' | '
        else:
            # Default person code; replaced by 'PPL_VRF' below when the name needs verification.
            owner_codes += 'PPL_m | '

        # Owners recognised as entities only fill the entity column.
        if ue.is_entity_code(o['owner_label']):
            owner_entities += o['owner'] + ' | '
            continue

        if o['owner_label'] == 'LAST_NAME':
            owner_text_minimal = o['owner']
            name_extraction = up.extract_names(
                owner_text_minimal,
                family_names,
                first_names,
                collisions,
                ignore_next,
                cities,
                places
            )

            if (
                name_extraction is None
                or 'VRF' in name_extraction['label']
                or name_extraction['name_labeled_map'].get('LAST_NAME') is None
                or len(name_extraction['name_labeled_map']['LAST_NAME']) == 0):
                # No usable family name: flag for verification and keep the raw text.
                owner_codes = owner_codes[:-len('PPL_m | ')]
                owner_codes += 'PPL_VRF | '
                owner_last_names += f"{o['owner']} | "
            else:
                owner_last_names += f"{name_extraction['name_labeled_map']['LAST_NAME']} | "

            owner_first_names += '- | '

        elif o['owner_label'] == 'FIRST_NAME':
            owner_first_names += f"{o['owner']} | "
            owner_last_names += '- | '

        elif o['owner_label'] == 'FIRST_AND_LAST_NAME':
            owner_text_minimal = o['owner']
            name_extraction = up.extract_names(
                owner_text_minimal,
                family_names,
                first_names,
                collisions,
                ignore_next,
                cities,
                places
            )

            if name_extraction is not None:
                fn_is_empty = name_extraction['name_labeled_map'].get('FIRST_NAME') is None or len(name_extraction['name_labeled_map']['FIRST_NAME']) == 0
                ln_is_empty = name_extraction['name_labeled_map'].get('LAST_NAME') is None or len(name_extraction['name_labeled_map']['LAST_NAME']) == 0
                fn = '-' if fn_is_empty else name_extraction['name_labeled_map']['FIRST_NAME']
                ln = '-' if ln_is_empty else name_extraction['name_labeled_map']['LAST_NAME']
                if name_extraction['label'] == 'PPL_VRF':
                    owner_codes = owner_codes[:-len('PPL_m | ')]
                    owner_codes += 'PPL_VRF | '

                owner_first_names += f"{fn} | "
                owner_last_names += f"{ln} | "
            else:
                # Extraction failed. Still emit one slot per column so first names,
                # family names, codes, titles and mestieri stay aligned. As in the
                # LAST_NAME branch, flag the owner for verification and keep the raw text.
                owner_codes = owner_codes[:-len('PPL_m | ')]
                owner_codes += 'PPL_VRF | '
                owner_first_names += '- | '
                owner_last_names += f"{o['owner']} | "

        if o['title'] != '-':
            owner_titles += o['title'].upper() + ' | '
        else:
            owner_titles += '- | '

        if o['mestiere'] != '-':
            owner_mestieri += o['mestiere'].upper() + ' | '
        else:
            owner_mestieri += '- | '

    # Relatives and shared family names only apply to parcels owned by people alone.
    if 'PPL' in owner_codes and not ue.entity_in_owner_codes(owner_codes):
        add_count = m['unknown_relative_owner']['owner_count_remark']
        if len(add_count) > 0:
            adjusted_text_per_unknown_relative = adjust_owner_count_per_unknown_relative(
                owner_count,
                add_count,
                owner_first_names,
                owner_last_names,
                owner_titles,
                m['unknown_relative_owner']['pattern'])
            owner_count = adjusted_text_per_unknown_relative['owner_count']
            owner_count_remark = adjusted_text_per_unknown_relative['owner_count_remark']
            owner_first_names = adjusted_text_per_unknown_relative['o_first_names']
            owner_last_names = adjusted_text_per_unknown_relative['o_last_names']
            owner_titles = adjusted_text_per_unknown_relative['o_titles']

        # manage brothers and sisters common family names
        adjusted_last_names = assign_common_last_name_to_family_members(owner_first_names, owner_last_names)
        if adjusted_last_names is not None:
            owner_last_names = adjusted_last_names

    # NOTE(review): `.loc[m_uidx, ...]` uses the uidx as an index label, so this
    # assumes dataset_DF's index values coincide with 'uidx' - confirm.
    dataset_DF.loc[m_uidx, 'owner_count'] = owner_count
    dataset_DF.loc[m_uidx, 'owner_count_remark'] = owner_count_remark
    dataset_DF.loc[m_uidx, 'owner_code'] = clean_codes(owner_codes)
    dataset_DF.loc[m_uidx, 'owner_family_name'] = clean_family_names(owner_last_names)
    dataset_DF.loc[m_uidx, 'owner_first_name'] = clean_first_names(owner_first_names)
    dataset_DF.loc[m_uidx, 'owner_title'] = clean_titles_or_mestieri(owner_titles)
    dataset_DF.loc[m_uidx, 'owner_mestiere'] = clean_titles_or_mestieri(owner_mestieri)
    dataset_DF.loc[m_uidx, 'owner_entity'] = clean_entities(owner_entities)
    

### ⚠️⚠️⚠️ Write the updated dataset to file

In [None]:
# Write the updated dataset as a JSON array of row records for the next pipeline step.
# `index` is a boolean option of `to_json` and the index is not serialised with
# orient='records', so the former `index=['uidx']` argument was invalid and is dropped;
# 'uidx' is still written as a regular column.
dataset_DF.to_json('../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step7.json', orient='records')

----------------------------------------------------------

In [None]:
# Print the local wall-clock time (HH:MM:SS) at which this final cell ran.
import time
print("Current time:", time.strftime("%H:%M:%S", time.localtime()))