In [None]:
import pandas as pd

# Add the parent directory of the 'utils' directory to the Python path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


import utils.utils as u
import utils.utils_people as up
import utils.utils_entities as ue

-------------------------------------------------------------------------------------------------------------------

# Assignment of parcels owned by the Republic of Venice

-------------------------------------------------------------------------------------------------------------------

### Load dataset

In [None]:
# Input of this notebook: the step-2 file of the post-processing pipeline.
step2_path = '../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step2.json'
dataset_DF = pd.read_json(step2_path)

#### Select TODO entries and format

In [None]:
# Keep only the rows whose owner is still unresolved (owner_code == 'TODO').
# A copy is taken so that the rewrite of 'owner_name' below leaves dataset_DF untouched.
todo_mask = dataset_DF['owner_code'] == 'TODO'
dataset_unresolved_DF = dataset_DF[todo_mask].copy()

# Pass every owner name through u.text_to_minimal; the matching further down
# compares against this rewritten text.
dataset_unresolved_DF.loc[:, 'owner_name'] = dataset_unresolved_DF['owner_name'].map(u.text_to_minimal)
print("There are", len(dataset_unresolved_DF), "unresolved parcel entries.")

In [None]:
# Preview the first five unresolved entries.
dataset_unresolved_DF.head(5)

#### Ignore entries with multiple owners

We ignore these entries because they will be handled by a later stage of the pipeline.

In [None]:
# Separator tokens, ordered from the longest to the shortest.
separators = pd.read_json('../dictionaries/MSC_dictionary/separators.json')[0].to_list()
separators.sort(key=len, reverse=True)


def _has_single_owner(owner_name):
    """Return True when u.has_multiple_owners does not flag the given owner text."""
    return not u.has_multiple_owners(owner_name, separators, use_minimal=False)


# Keep only the entries that are not flagged as having several owners.
single_owner_mask = dataset_unresolved_DF['owner_name'].apply(_has_single_owner)
dataset_unresolved_DF = dataset_unresolved_DF[single_owner_mask]
print("There are", len(dataset_unresolved_DF), "relevant parcel entries.")

#### Load necessary dictionaries

In [None]:
def _load_longest_first(path):
    """Read column 0 of a JSON dictionary file and return it as a list, longest entries first."""
    return sorted(pd.read_json(path)[0].to_list(), key=len, reverse=True)


# Entities related to the Republic of Venice.
venezia_entities = _load_longest_first('../dictionaries/ENT_dictionary/venezia_entities.json')
venezia_titles_entities = _load_longest_first('../dictionaries/ENT_dictionary/venezia_titles_entities.json')
unlinked = pd.read_json('../dictionaries/ENT_dictionary/unlinked_entities.json')
# Among the rows typed 'venezia_entities', take the longest 'entities_unlinked' value.
# NOTE(review): presumably a single row holding a list of mentions — confirm against the file.
is_venezia_row = unlinked['entities_type'] == 'venezia_entities'
venezia_entities_unlinked = sorted(unlinked.loc[is_venezia_row, 'entities_unlinked'], key=len, reverse=True)[0]

# General entity, people and miscellaneous dictionaries.
entities = _load_longest_first('../dictionaries/ENT_dictionary/entities.json')
family_names = _load_longest_first('../dictionaries/PPL_dictionary/family_names.json')
family_names_comp = _load_longest_first('../dictionaries/PPL_dictionary/family_names_comp.json')
first_names = _load_longest_first('../dictionaries/PPL_dictionary/first_names.json')
mestieri = _load_longest_first('../dictionaries/PPL_dictionary/mestieri.json')
titles = _load_longest_first('../dictionaries/PPL_dictionary/titles.json')
ignore_next = _load_longest_first('../dictionaries/MSC_dictionary/ignore_next.json')
cities = _load_longest_first('../dictionaries/MSC_dictionary/cities.json')
# Family names that also appear as a title, a 'mestieri' entry or an entity.
collisions = [
    name for name in family_names
    if any(name in pool for pool in (titles, mestieri, entities))
]
places = _load_longest_first('../dictionaries/MSC_dictionary/places.json')

## Start parcel assignment

In [None]:
def print_commit(uidx, owner):
    """Print a three-line log entry for a parcel that is about to be assigned.

    Args:
        uidx: identifier of the parcel entry; printed as-is.
        owner: owner text of the entry; printed in upper case.
    """
    separator = '----------------------------------------------------------'
    print(separator)
    print("UIDX", uidx)
    print("DEMANIO TEXT  -", owner.upper())

In [None]:
# Accumulator for the {'uidx', 'owner_text_std'} records produced by the matching loop.
modifications_to_commit = []

# All Venice-related mentions in a single list, longest first, so that the most
# specific mention is tried before any shorter one.
VNZ = venezia_entities + venezia_titles_entities + venezia_entities_unlinked
VNZ.sort(key=len, reverse=True)

In [None]:
def _is_person(name_extraction):
    """Return True when the extraction holds both a non-empty first name and a non-empty last name.

    Args:
        name_extraction: result of up.extract_names, or None. When not None it
            must provide a 'name_labeled_map' mapping.
    """
    if name_extraction is None:
        return False
    labeled = name_extraction['name_labeled_map']
    first_name = labeled.get('FIRST_NAME')
    last_name = labeled.get('LAST_NAME')
    if first_name is None or last_name is None:
        return False
    return len(first_name) > 0 and len(last_name) > 0


# Start from an empty list so that re-running this cell does not append the same
# modifications a second time (which would also inflate the total printed below).
modifications_to_commit = []

for _, row in dataset_unresolved_DF.iterrows():

    uidx = row['uidx']
    owner_text_minimal = row['owner_name']

    # VNZ is sorted longest-first, so the first mention that matches is the longest one.
    for v in VNZ:
        if owner_text_minimal == v:
            # The whole owner text is a known mention: assign it as-is.
            modifications_to_commit.append({ 'uidx': uidx, 'owner_text_std': owner_text_minimal })
            print_commit(uidx, owner_text_minimal)
            break

        elif u.string_contains_substring(owner_text_minimal, v, use_minimal=False):
            # The mention is only part of the owner text: assign it unless the text
            # also yields both a first name and a last name.
            name_extraction = up.extract_names(
                owner_text_minimal,
                family_names,
                first_names,
                collisions,
                ignore_next,
                cities,
                places)

            if not _is_person(name_extraction):
                modifications_to_commit.append({ 'uidx': uidx, 'owner_text_std': v })
                print_commit(uidx, owner_text_minimal)
            # Only the first (longest) contained mention is considered for an entry.
            break

print()
print()
print("There are", len(modifications_to_commit), "total modifications.")

### Commit modifications in dataset

In [None]:
# Write every collected modification back into dataset_DF.
for m in modifications_to_commit:
    m_uidx = m['uidx']
    code = ue.get_entity_code_from_mention(
        mention=m['owner_text_std'],
        venezia_entities=venezia_entities,
        venezia_titles_entities=venezia_titles_entities,
        venezia_entities_unlinked=venezia_entities_unlinked
    )

    # Select the target by the value of the 'uidx' column, not by index label:
    # `.loc[m_uidx, ...]` only hits the right row when the index happens to equal
    # 'uidx', and silently appends a new row when the label does not exist.
    target_rows = dataset_DF['uidx'] == m_uidx
    if not target_rows.any():
        raise KeyError(f"No row with uidx {m_uidx!r} found in dataset_DF")

    dataset_DF.loc[target_rows, 'owner_code'] = code
    dataset_DF.loc[target_rows, 'owner_count'] = 1
    dataset_DF.loc[target_rows, 'owner_entity'] = m['owner_text_std'].upper()

    # 'owner_title' is only filled in when ue.is_title_code accepts the code.
    if ue.is_title_code(code):
        dataset_DF.loc[target_rows, 'owner_title'] = m['owner_text_std']


### ⚠️⚠️⚠️ Write the updated dataset to file

In [None]:
# Persist the updated dataset as the step-3 file, one JSON object per row.
# `index` in DataFrame.to_json is a boolean flag, not a column selector, and the
# index is never written for orient='records'; the former index=['uidx'] argument
# had no effect and is dropped. 'uidx' is still written as a regular column.
dataset_DF.to_json('../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step3.json', orient='records')

----------------------------------------------------------

In [None]:
import time

# Print the local wall-clock time at which this cell ran.
now = time.localtime()
print("Current time:", time.strftime("%H:%M:%S", now))