In [None]:
import pandas as pd

# Add the parent directory of the 'utils' directory to the Python path
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


import utils.utils as u
import utils.utils_people as up

-------------------------------------------------------------------------------------------------------------------

# Assignation of parcels owned by People

-------------------------------------------------------------------------------------------------------------------

#### Load dataset

In [None]:
# Load the step-4 pipeline file (catastici_1741_step4.json) into a DataFrame.
step4_path = '../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step4.json'
dataset_DF = pd.read_json(step4_path)

#### Select TODO entries and convert owner names to minimal text

In [None]:
# Keep an independent copy of the rows whose owner code is still 'TODO',
# then replace each owner name by its u.text_to_minimal() form.
todo_mask = dataset_DF['owner_code'] == 'TODO'
dataset_unresolved_DF = dataset_DF[todo_mask].copy()
dataset_unresolved_DF.loc[:, 'owner_name'] = dataset_unresolved_DF['owner_name'].map(u.text_to_minimal)
print("There are", len(dataset_unresolved_DF), "unresolved parcel entries.")

In [None]:
# Preview the first rows of the unresolved entries (owner names already in minimal form).
dataset_unresolved_DF.head()

#### Ignore entries with multiple owners

Entries with multiple owners are skipped here; a later stage of the pipeline will handle them.

In [None]:
# Separator terms, longest first.
separator_terms = pd.read_json('../dictionaries/MSC_dictionary/separators.json')[0].to_list()
separators = sorted(separator_terms, key=len, reverse=True)


def _has_single_owner(owner_name):
    """Return True when u.has_multiple_owners() does not flag the owner text."""
    return not u.has_multiple_owners(owner_name, separators, use_minimal=False)


# Drop every entry whose owner text is flagged as having multiple owners.
single_owner_mask = dataset_unresolved_DF['owner_name'].apply(_has_single_owner)
dataset_unresolved_DF = dataset_unresolved_DF[single_owner_mask]
print("There are", len(dataset_unresolved_DF), "relevant parcel entries.")

#### Load necessary dictionaries

In [None]:
def _load_longest_first(json_path):
    """Read column 0 of a JSON dictionary file and return it as a list sorted by decreasing length."""
    terms = pd.read_json(json_path)[0].to_list()
    terms.sort(key=len, reverse=True)
    return terms


# People (PPL) dictionaries.
family_names = _load_longest_first('../dictionaries/PPL_dictionary/family_names.json')
family_names_comp = _load_longest_first('../dictionaries/PPL_dictionary/family_names_comp.json')
first_names = _load_longest_first('../dictionaries/PPL_dictionary/first_names.json')
mestieri = _load_longest_first('../dictionaries/PPL_dictionary/mestieri.json')
titles = _load_longest_first('../dictionaries/PPL_dictionary/titles.json')
titles_plur = _load_longest_first('../dictionaries/PPL_dictionary/titles_plur.json')
unknown_relatives = _load_longest_first('../dictionaries/PPL_dictionary/unknown_relatives.json')
unknown_relatives_sing = _load_longest_first('../dictionaries/PPL_dictionary/unknown_relatives_sing.json')
unknown_relatives_plur = _load_longest_first('../dictionaries/PPL_dictionary/unknown_relatives_plur.json')

# Miscellaneous (MSC) and entity (ENT) dictionaries.
ignore_next = _load_longest_first('../dictionaries/MSC_dictionary/ignore_next.json')
entities = _load_longest_first('../dictionaries/ENT_dictionary/entities.json')
cities = _load_longest_first('../dictionaries/MSC_dictionary/cities.json')

# Family names that also appear as a title, a mestiere or an entity.
collisions = [
    name
    for name in family_names
    if name in titles or name in mestieri or name in entities
]

places = _load_longest_first('../dictionaries/MSC_dictionary/places.json')

## Start parcel assignation

In [None]:
def print_commit(uidx, owner):
    """Print a dashed separator, the entry's uidx and the upper-cased owner text.

    Args:
        uidx: identifier printed after the "UIDX" label.
        owner: owner text; must provide ``.upper()``.
    """
    separator_line = '----------------------------------------------------------'
    print(separator_line)
    print("UIDX", uidx)
    owner_upper = owner.upper()
    print("DEMANIO TEXT  -", owner_upper)

In [None]:
# Accumulator: one dict per resolved entry (keyed by 'uidx'), filled by the
# extraction loop and later applied to dataset_DF.
modifications_to_commit = []

In [None]:
# Try to extract a person's name from every remaining unresolved entry and
# record the resulting owner fields for a later commit into dataset_DF.
for _, entry in dataset_unresolved_DF.iterrows():
    uidx = entry['uidx']
    owner_text_minimal = entry['owner_name']

    name_extraction = up.extract_names(
        owner_text_minimal,
        family_names,
        first_names,
        collisions,
        ignore_next,
        cities,
        places,
    )
    if name_extraction is None:
        # Nothing was extracted: no modification is recorded for this entry.
        continue

    labeled_names = name_extraction['name_labeled_map']
    first_name = labeled_names.get('FIRST_NAME')
    last_name = labeled_names.get('LAST_NAME')
    fn_is_empty = first_name is None or len(first_name) == 0
    ln_is_empty = last_name is None or len(last_name) == 0
    fn = '' if fn_is_empty else first_name
    ln = '' if ln_is_empty else last_name

    owner = {
        'uidx': uidx,
        'owner_code': name_extraction['label'],
        'owner_count': 1,
        'owner_count_remark': '',
        'owner_first_name': fn,
        'owner_family_name': ln,
    }

    # When an unnamed relative is detected, its count/remark replace the
    # defaults and a "_<relative>" placeholder is added to the first name.
    unknown_relative_extraction = up.extract_unknown_relative_owners(
        owner_text_minimal, unknown_relatives_sing, unknown_relatives_plur
    )
    if unknown_relative_extraction is not None:
        owner['owner_count'] = unknown_relative_extraction['owner_count']
        owner['owner_count_remark'] = unknown_relative_extraction['owner_count_remark']

        relative_tag = f"_{unknown_relative_extraction['unknown_relative']}"
        if fn_is_empty:
            owner['owner_first_name'] = relative_tag
            owner['owner_family_name'] = ln
        else:
            # Named owner plus relative: both share the same family name.
            owner['owner_first_name'] = f"{fn} | {relative_tag}"
            owner['owner_family_name'] = f"{ln} | {ln}"

    modifications_to_commit.append(owner)
    print("UIDX", uidx, '----------------------------------------------------------')
    print("OWNER TEXT  -", owner_text_minimal)
    print("FIRST NAME  -", owner['owner_first_name'])
    print("LAST NAME   -", owner['owner_family_name'])
    print()

print()
print()
print("There are", len(modifications_to_commit), "total modifications.")

### Commit modifications in dataset

In [None]:
# Write every recorded owner field back into dataset_DF, one cell at a time.
# NOTE(review): .loc is addressed with the uidx value, so this assumes the
# index labels of dataset_DF equal the 'uidx' column - confirm.
committed_columns = (
    'owner_code',
    'owner_count',
    'owner_count_remark',
    'owner_first_name',
    'owner_family_name',
)
for modification in modifications_to_commit:
    row_label = modification['uidx']
    for column in committed_columns:
        dataset_DF.loc[row_label, column] = modification[column]

## Title Assignation

In [None]:
# Rows whose owner code marks a person ('PPL' or 'PPL_VRF').
people_codes = ['PPL', 'PPL_VRF']
dataset_people = dataset_DF[dataset_DF['owner_code'].isin(people_codes)]

In [None]:
# Collect, for every person entry, the titles found in its owner text.
# Each matched title is removed from the working text before the next title
# is tried. A title listed in titles_plur sets the owner count to 2 with
# remark '2+'.
titles_to_commit = []
for _, row in dataset_people.iterrows():
    uidx = row['uidx']

    owner_text = u.remove_extra_spaces(row['owner_name'])
    owner_text_minimal = u.text_to_minimal(owner_text)
    owner_count = 1
    owner_count_remark = ''

    ttls = []
    for t in titles:
        if not u.string_contains_substring(owner_text_minimal, t, use_minimal=False):
            continue
        owner_text_minimal = owner_text_minimal.replace(t, '')
        ttls.append(t)
        if t in titles_plur:
            owner_count = 2
            owner_count_remark = '2+'

    if len(ttls) > 0:
        title = ' & '.join(ttls)
        titles_to_commit.append({
            'uidx': uidx,
            'owner_title': title.upper(),
            'owner_count': owner_count,
            'owner_count_remark': owner_count_remark
        })

        print('----------------------------------------------------------')
        print("UIDX", uidx)
        print("TITLE -", title.upper())

print("There are", len(titles_to_commit), "total titles added.")

### Commit modifications in dataset

In [None]:
# Apply the collected titles; the owner count is only overwritten when a
# count remark was recorded.
# NOTE(review): assumes the index labels of dataset_DF equal 'uidx' - confirm.
for title_update in titles_to_commit:
    row_label = title_update['uidx']
    dataset_DF.loc[row_label, 'owner_title'] = title_update['owner_title']
    if len(title_update['owner_count_remark']) == 0:
        continue
    dataset_DF.loc[row_label, 'owner_count'] = title_update['owner_count']
    dataset_DF.loc[row_label, 'owner_count_remark'] = title_update['owner_count_remark']

## Mestieri Assignation

In [None]:
# Collect, for every person entry, the mestieri found in its owner text.
# Each matched term is removed from the working text before the next term
# is tried.
mestieri_to_commit = []
for _, person in dataset_people.iterrows():
    uidx = person['uidx']

    remaining_text = u.text_to_minimal(u.remove_extra_spaces(person['owner_name']))

    matched_mestieri = []
    for mestiere_term in mestieri:
        if u.string_contains_substring(remaining_text, mestiere_term, use_minimal=False):
            remaining_text = remaining_text.replace(mestiere_term, '')
            matched_mestieri.append(mestiere_term)

    if not matched_mestieri:
        continue

    mestiere_label = ' & '.join(matched_mestieri)
    mestieri_to_commit.append({
        'uidx': uidx,
        'owner_mestiere': mestiere_label.upper()
    })

    print('----------------------------------------------------------')
    print("UIDX", uidx)
    print("MESTIERE -", mestiere_label.upper())

print("There are", len(mestieri_to_commit), "total mestieri added.")

### Commit modifications in dataset

In [None]:
# Apply the collected mestieri to dataset_DF.
# NOTE(review): assumes the index labels of dataset_DF equal 'uidx' - confirm.
for mestiere_update in mestieri_to_commit:
    row_label = mestiere_update['uidx']
    dataset_DF.loc[row_label, 'owner_mestiere'] = mestiere_update['owner_mestiere']

### ⚠️⚠️⚠️ Write the updated dataset to file

In [None]:
# Write the updated dataset to the step-5 pipeline file, one JSON object per row.
# With orient='records' the DataFrame index is not written; 'uidx' is stored as a
# regular column. `index` is a boolean flag in DataFrame.to_json, so it is not
# given a column list here.
dataset_DF.to_json(
    '../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step5.json',
    orient='records',
)

----------------------------------------------------------

In [None]:
import time

# Print the local wall-clock time at which this cell was run.
current_time = time.strftime("%H:%M:%S", time.localtime())
print("Current time:", current_time)