In [None]:
import pandas as pd

# Add the parent directory of the 'utils' directory to the Python path
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import utils.utils as u
import utils.utils_people as up

-------------------------------------------------------------------------------------------------------------------

# Assignment of standardised mappings for family names, entities, titles, and mestieri

-------------------------------------------------------------------------------------------------------------------

#### Load dataset

In [None]:
# Read the step-7 pipeline output into the working frame used by every section below.
dataset_DF = pd.read_json(
    '../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step7.json'
)
# Last expression of the cell: shows the first rows as a quick sanity check.
dataset_DF.head()

#### Load necessary dictionaries

In [None]:
def _load_sorted_terms(name):
    """Return column 0 of `../dictionaries/PPL_dictionary/<name>.json` as a list,
    ordered from the longest string to the shortest.

    NOTE(review): presumably longest-first so that longer entries are matched
    before their shorter prefixes - confirm against the helpers in utils_people.
    """
    terms = pd.read_json(f'../dictionaries/PPL_dictionary/{name}.json')[0].to_list()
    terms.sort(key=len, reverse=True)
    return terms


family_names = _load_sorted_terms('family_names')
first_names = _load_sorted_terms('first_names')
titles = _load_sorted_terms('titles')
titles_plur = _load_sorted_terms('titles_plur')
titles_sing = _load_sorted_terms('titles_sing')
unknown_relatives_plur = _load_sorted_terms('unknown_relatives_plur')
unknown_relatives_sing = _load_sorted_terms('unknown_relatives_sing')

mestieri = _load_sorted_terms('mestieri')

In [None]:
import json

def _read_json_file(path):
    """Parse the JSON document stored at `path`.

    The file is decoded explicitly as UTF-8 so the result does not depend on the
    platform's locale encoding (the default for `open()`); this also matches how
    `pd.read_json` decodes the PPL dictionaries loaded earlier in the notebook.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# FAMILY GROUPS
family_name_groups = _read_json_file('../dictionaries/std_mappings/people_to_mentions/family_name_groups.json')

# TITLES
# The trailing underscore marks the raw records; they are turned into plain
# lookup dicts in the title standardisation section.
title_to_std_ = _read_json_file('../dictionaries/std_mappings/people_to_mentions/title_to_std.json')

title_plur_to_sing_ = _read_json_file('../dictionaries/std_mappings/people_to_mentions/title_plur_to_sing.json')

# MESTIERI
mestieri_to_std_ = _read_json_file('../dictionaries/std_mappings/people_to_mentions/mestieri_to_std.json')


## Family group standardisation

In [None]:
def assign_family_group(family_name):
    """Return the standardised family group for `family_name`.

    '-' is passed through unchanged. Otherwise the whole name is looked up in the
    `name_variations` of every entry of `family_name_groups`; on a hit the entry's
    `std_name` is returned. If the whole name is unknown, it is split on spaces,
    regrouped by `up.populate_owner_name_array(..., family_names)`, and each
    resulting part is standardised on its own. Parts with no matching group are
    reported on stdout and kept as they are. The parts are re-joined with spaces.
    """
    if family_name == '-':
        return '-'

    # First try the complete name as a single known variation.
    whole_match = next(
        (grp for grp in family_name_groups if family_name in grp['name_variations']),
        None,
    )
    if whole_match is not None:
        return whole_match['std_name']

    resolved = []
    for token in up.populate_owner_name_array(family_name.split(' '), family_names):
        for grp in family_name_groups:
            if token in grp['name_variations']:
                resolved.append(grp['std_name'])
                break
        else:
            # No group lists this part: report it and keep the raw value.
            print("FAMILY NAME NOT FOUND:", token)
            resolved.append(token)

    return ' '.join(resolved)

In [None]:
# One {'uidx', 'family_group'} record per row whose owner_family_name is non-empty.
family_groups_to_commit = []

dataset_family_names = dataset_DF[dataset_DF['owner_family_name'].str.len() > 0]
for _, record in dataset_family_names.iterrows():
    # Each ' | '-separated name is standardised from its lower-cased form and
    # stored upper-cased, keeping the original order.
    std_names = [
        assign_family_group(raw_name.lower()).upper()
        for raw_name in record['owner_family_name'].split(' | ')
    ]
    family_groups_to_commit.append({
        'uidx': record['uidx'],
        'family_group': ' | '.join(std_names),
    })

print("There are", len(family_groups_to_commit), "total modifications.")


In [None]:
# Write every collected family group into the 'owner_family_group' column.
# NOTE(review): .loc uses the uidx value as an index label, so this assumes the
# frame's index labels coincide with 'uidx' - confirm for the step-7 file.
for modification in family_groups_to_commit:
    row_label, group_value = modification['uidx'], modification['family_group']
    dataset_DF.loc[row_label, 'owner_family_group'] = group_value

## Entity standardisation

In [None]:
# Owner code -> (mapping file name without extension, key read from each mapping item).
# assign_entity_group() opens '<file name>.json' and searches the list stored under
# that key; 'ent_OTH' carries an empty pair because no mapping file is consulted for it.
ent_code_file_mapping = dict(
    ent_GLD=('mestieri_to_mentions', 'guild_mentions'),
    ent_REL=('religious_to_mentions', 'entity_mentions'),
    ent_REL_TTL=('religious_to_mentions', 'title_mentions'),
    ent_SCL_GRD=('scuole_grandi_to_mentions', 'entity_mentions'),
    ent_SCL_MST=('scuole_mestieri_to_mentions', 'entity_mentions'),
    ent_SCL_REL=('scuole_religious_to_mentions', 'entity_mentions'),
    ent_SCR=('social_care_to_mentions', 'entity_mentions'),
    ent_VNZ=('venezia_to_mentions', 'entity_mentions'),
    ent_VNZ_TTL=('venezia_to_mentions', 'title_mentions'),
    ent_OTH=('', ''),
)

In [None]:
def assign_entity_group(owner_entity, owner_code):
    code = owner_code
    if code.startswith('ent_OTH') or code.startswith('ent_JEW') or 'UNL' in code:
        return owner_entity.upper()

    if code.endswith('_m'):
        code = code[:-2]
        
    filename, key = ent_code_file_mapping.get(code, ('', ''))
    if filename and key:
        with open(f'../dictionaries/std_mappings/entities_to_mentions/{filename}.json') as f: 
            data = f.read()
        mapping_ent_to_mentions = json.loads(data)
        
        for item in mapping_ent_to_mentions:
            if owner_entity in item.get(key, []):
                return item['entity']

        print("ENTITY NOT FOUND:", owner_entity)
    elif code != 'PPL':
        print("ENTITY CODE NOT FOUND:", code)
        
    return owner_entity.upper()

In [None]:
# One {'uidx', 'entity_group'} record per row whose owner_code mentions an entity code.
entity_groups_to_commit = []
dataset_entity_names = dataset_DF[dataset_DF['owner_code'].str.contains('ent_')]

for _, record in dataset_entity_names.iterrows():
    names = record['owner_entity'].split(' | ')
    codes = record['owner_code'].split(' | ')

    # Names and codes are paired by position; zip stops at the shorter list.
    std_entities = [
        assign_entity_group(name.lower(), code)
        for name, code in zip(names, codes)
    ]

    entity_groups_to_commit.append({
        'uidx': record['uidx'],
        'entity_group': ' | '.join(std_entities),
    })

print("There are", len(entity_groups_to_commit), "total modifications.")


In [None]:
# Write every collected entity group into the 'owner_entity_group' column.
# NOTE(review): .loc uses the uidx value as an index label, so this assumes the
# frame's index labels coincide with 'uidx' - confirm for the step-7 file.
for modification in entity_groups_to_commit:
    row_label, group_value = modification['uidx'], modification['entity_group']
    dataset_DF.loc[row_label, 'owner_entity_group'] = group_value

## Title standardisation

In [None]:
# Turn the loaded [{'key': ..., 'value': ...}, ...] records into plain lookup dicts.
# If a key occurs more than once, the last record wins.
title_to_std = {entry['key']: entry['value'] for entry in title_to_std_}

title_plur_to_sing = {entry['key']: entry['value'] for entry in title_plur_to_sing_}
    

In [None]:
def standardise_titles(title_text):
    """Map every title in `title_text` to its standard form via `title_to_std`.

    `title_text` is split on '|' into groups and each group on '&' into single
    titles; every piece is cleaned with `u.remove_extra_spaces`. Within a group,
    a mapped title is kept only the first time its standard form appears, '-' is
    kept as '-', and any other unmapped title is reported on stdout and left out.
    Groups are re-joined with ' & ' and the result with ' | ', so the number of
    '|'-separated groups is preserved (a group may end up empty).
    """
    groups_out = []
    for group in (u.remove_extra_spaces(chunk) for chunk in title_text.split('|')):
        kept = []
        for part in map(u.remove_extra_spaces, group.split('&')):
            if part in title_to_std:
                std_title = title_to_std[part]
                if std_title not in kept:
                    kept.append(std_title)
                continue
            if part == '-':
                kept.append('-')
                continue
            print('Title not found in std: ', part)

        groups_out.append(' & '.join(kept))

    return ' | '.join(groups_out)

In [None]:
def propagate_titles_plur(title_text, first_name_minimal):
    """Carry plural titles forward across the '|'-separated owners of one row.

    Parameters
    ----------
    title_text : str
        '|'-separated title groups; titles inside a group are separated by '&'.
    first_name_minimal : str
        '|'-separated first names, expected to have one entry per title group.

    Returns
    -------
    str
        The rewritten title groups joined with ' | ' (one trailing ' | ' is
        removed if present), or "???" when the two inputs do not have the same
        number of '|'-separated entries.

    The first names decide how a plural title is written. For an entry found in
    `first_names` or `unknown_relatives_sing` the singular form from
    `title_plur_to_sing` is used; for an entry found in `unknown_relatives_plur`
    the plural title is kept. Once a plural title (member of `titles_plur`) has
    been seen, it is also applied to every following owner. Unrecognised first
    names and titles are reported on stdout.
    """
    titles_list = [u.remove_extra_spaces(t) for t in title_text.split('|')]
    fn_minimal_list = [u.remove_extra_spaces(f) for f in first_name_minimal.split('|')]

    if len(titles_list) != len(fn_minimal_list):
        return "???"

    processed_titles = []
    # Most recent plural title seen so far; None until the first one appears.
    title_to_propagate = None

    for owner_titles, owner_fn in zip(titles_list, fn_minimal_list):
        assigned_std_title = []
        fn = owner_fn.replace('_', '')
        title_arr = [u.remove_extra_spaces(t) for t in owner_titles.split('&')]

        fn_is_singular = fn in first_names or fn in unknown_relatives_sing
        fn_is_plural = fn in unknown_relatives_plur

        # Apply the plural title inherited from an earlier owner, if any.
        if title_to_propagate is not None:
            if fn_is_singular:
                assigned_std_title.append(title_plur_to_sing[title_to_propagate])
            elif fn_is_plural:
                assigned_std_title.append(title_to_propagate)
            elif fn != '-':
                print('First name format not found:', fn)

        # Then handle the titles written for this owner.
        for part in title_arr:
            if part in titles_plur:
                # From now on this plural title is the one carried forward.
                title_to_propagate = part
                if fn_is_singular:
                    assigned_std_title.append(title_plur_to_sing[part])
                elif fn_is_plural or fn == '-':
                    assigned_std_title.append(part)

            elif part in titles_sing:
                assigned_std_title.append(part)

            elif part == '-' and title_to_propagate is None:
                # A placeholder is only kept while no plural title is being propagated.
                assigned_std_title.append('-')

            elif part != '-':
                print("Title not found: ", part)

        processed_titles.append(' & '.join(assigned_std_title))

    # Build the result once, after the loop. titles_list always has at least one
    # element (str.split never returns an empty list), so this yields the same
    # value the per-iteration recomputation produced on its last pass.
    title_res = ' | '.join(processed_titles)
    if title_res.endswith(' | '):
        title_res = title_res[:-3]

    return title_res
    

In [None]:
# One {'uidx', 'title'} record per row whose owner_title is non-empty.
title_modifications_to_commit = []

dataset_titled_DF = dataset_DF[dataset_DF['owner_title'].str.len() > 0]
for _, record in dataset_titled_DF.iterrows():
    std_title = standardise_titles(u.text_to_minimal(record['owner_title']))
    first_name_minimal = u.text_to_minimal(record['owner_first_name'])

    # Plural-title propagation only applies when the row holds several '|'-separated owners.
    if '|' in std_title:
        std_title = propagate_titles_plur(std_title, first_name_minimal)

    title_modifications_to_commit.append({
        'uidx': record['uidx'],
        'title': u.remove_extra_spaces(std_title).upper(),
    })

print("There are", len(title_modifications_to_commit), "total modifications.")

In [None]:
# Write every standardised title into the 'owner_title_std' column.
# NOTE(review): .loc uses the uidx value as an index label, so this assumes the
# frame's index labels coincide with 'uidx' - confirm for the step-7 file.
for modification in title_modifications_to_commit:
    row_label, title_value = modification['uidx'], modification['title']
    dataset_DF.loc[row_label, 'owner_title_std'] = title_value

## Mestieri standardisation

In [None]:
# Turn the loaded [{'key': ..., 'value': ...}, ...] records into a plain lookup dict.
# If a key occurs more than once, the last record wins.
mestieri_to_std = {entry['key']: entry['value'] for entry in mestieri_to_std_}

In [None]:
def standardise_mestieri(mestieri_text):
    """Map every '|'-separated mestiere in `mestieri_text` through `mestieri_to_std`.

    Each piece is cleaned with `u.remove_extra_spaces`. '-' is kept as '-', a
    mapped mestiere is replaced by its standard form, and an unmapped one is
    reported on stdout and left out. The kept values are joined with ' | '.

    NOTE(review): leaving unmapped entries out means the result can have fewer
    '|'-separated entries than the input - confirm that downstream consumers do
    not rely on positional alignment with the other owner columns.
    """
    kept = []
    for mestiere in map(u.remove_extra_spaces, mestieri_text.split('|')):
        if mestiere != '-' and mestiere not in mestieri_to_std:
            print('Mestiere not found in std: ', mestiere)
            continue
        kept.append('-' if mestiere == '-' else mestieri_to_std[mestiere])

    return ' | '.join(kept)

In [None]:
# One {'uidx', 'mestiere'} record per row whose owner_mestiere is non-empty.
mestieri_modifications_to_commit = []

dataset_mestieri_DF = dataset_DF[dataset_DF['owner_mestiere'].str.len() > 0]
for _, record in dataset_mestieri_DF.iterrows():
    std_mestiere = standardise_mestieri(u.text_to_minimal(record['owner_mestiere']))

    mestieri_modifications_to_commit.append({
        'uidx': record['uidx'],
        'mestiere': u.remove_extra_spaces(std_mestiere).upper(),
    })

print("There are", len(mestieri_modifications_to_commit), "total modifications.")

In [None]:
# Write every standardised mestiere into the 'owner_mestiere_std' column.
# NOTE(review): .loc uses the uidx value as an index label, so this assumes the
# frame's index labels coincide with 'uidx' - confirm for the step-7 file.
for modification in mestieri_modifications_to_commit:
    row_label, mestiere_value = modification['uidx'], modification['mestiere']
    dataset_DF.loc[row_label, 'owner_mestiere_std'] = mestiere_value

-------------------------------------------------------------------------------------------------

In [None]:
# Persist the frame, including the new *_group / *_std columns, as the step-8 file.
# With orient='records' each row is written as one JSON object and the frame's
# index is not part of the output, so no `index` argument is passed ('uidx' is
# already written as a regular column).
dataset_DF.to_json(
    '../data_catastici/data_post-processing/pipeline_steps/catastici_1741_step8.json',
    orient='records',
)

-------------------------------------------------------------------------------------------------

In [None]:
import time

# Print the local wall-clock time at which the notebook finished running.
finished_at = time.strftime("%H:%M:%S", time.localtime())
print("Current time:", finished_at)