# Data Cleaning

This notebook handles the data cleaning process for the datasets, documenting the process at the same time.

In [1]:
# Libraries
import json
import pandas as pd
import numpy as np
import re
from pathlib import Path


## Raw Datasets

The raw datasets are stored in the `data/raw` directory. The datasets include:

- `bautismos.csv`: Baptism records
- `matrimonios.csv`: Marriage records
- `entierros.csv`: Burial records

In [2]:
# Load the three raw register exports (baptisms, marriages, burials).
# The paths are relative, so they resolve against the kernel's working directory.
BAUTISMOS_RAW = pd.read_csv("../data/raw/bautismos.csv")
MATRIMONIOS_RAW = pd.read_csv("../data/raw/matrimonios.csv")
ENTIERROS_RAW = pd.read_csv("../data/raw/entierros.csv")

# Preview the first baptism rows to confirm the file parsed as expected.
BAUTISMOS_RAW.head()

Unnamed: 0,Secuencia,Unidad Documental Compuesta (a la que pertenece),Identificador (es recomendable seguir una secuencia numeral como la mostrada en los ejemplos),Título (incluir un título breve para cada documento),Folio inicial del documento (convertir como se muestra abajo),Folio final del documento (convertir como se muestra abajo),Imagen inicial (estos valores serán añadidos cuando comienze el proceso de revisión de imágenes),Imagen final (estos valores serán añadidos cuando comienze el proceso de revisión de imágenes),Tipo de evento,Fecha aaaa-mm-dd,...,Condición de la madrina,Lugar de bautizo,Notas adicionales del documento,Descriptor Geográfico 1,Descriptor Geográfico 2,Descriptor Geográfico 3,Descriptor Geográfico 4,5,Características físicas (Estado de conservación de los materiales físicos),Historia de revisión (de los materiales digitalizados)
0,1.0,APAucará LB L001,B001,Bautizo. Domingo. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-04,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
1,2.0,APAucará LB L001,B002,Bautizo. Dominga. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-06,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
2,3.0,APAucará LB L001,B003,Bautizo. Bartola. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-07,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
3,4.0,APAucará LB L001,B004,Bautizo. Francisca,3v,3v,IMG_7000b,IMG_7000b,Bautizo,1790-10-20,...,,"Aucara, iglesia",Abreviatura poco visible en el margen,Aucara,,,,,Regular,Registrado por Edwin Gonzales en 2023
4,5.0,APAucará LB L001,B005,Bautizo. Pedro,3v,3v,IMG_7000b,IMG_7000b,Bautizo,1790-10-20,...,,"Aucara, iglesia",Margen roto y manchado de tinta,Aucara,,,,,Regular,Registrado por Edwin Gonzales en 2023


## Column Harmonization

Rename the columns in the datasets to ensure consistency across different data sources.

Values are mapped using the `json` files located in the `data/mappings` directory.

In [3]:
from utils.ColumnManager import ColumnManager

In [4]:
# One JSON mapping file per dataset; each is handed to ColumnManager together
# with the matching raw frame.
bautismoMapping = Path("../data/mappings/bautismosMapping.json")
matrimonioMapping = Path("../data/mappings/matrimoniosMapping.json")
entierroMapping = Path("../data/mappings/entierrosMapping.json")

# A single ColumnManager instance is reused for the three datasets.
column_manager = ColumnManager()

# The *_RAW frames are kept under their own names; the harmonized results are
# bound to new *_HARMONIZED names.
# NOTE(review): whether harmonize_columns returns a new frame or mutates its
# input is not visible from this notebook - confirm in utils.ColumnManager.
BAUTISMOS_HARMONIZED = column_manager.harmonize_columns(BAUTISMOS_RAW, bautismoMapping)
MATRIMONIOS_HARMONIZED = column_manager.harmonize_columns(MATRIMONIOS_RAW, matrimonioMapping)
ENTIERROS_HARMONIZED = column_manager.harmonize_columns(ENTIERROS_RAW, entierroMapping)

# Preview the baptisms with their harmonized column names.
BAUTISMOS_HARMONIZED.head()

Unnamed: 0,id,file,identifier,title,start_folio,end_folio,start_image,end_image,event_type,event_date,...,godmother_social_condition,event_place,event_additional_notes,event_geographic_descriptor_1,event_geographic_descriptor_2,event_geographic_descriptor_3,event_geographic_descriptor_4,event_other,record_physical_characteristics,revision_history
0,1.0,APAucará LB L001,B001,Bautizo. Domingo. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-04,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
1,2.0,APAucará LB L001,B002,Bautizo. Dominga. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-06,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
2,3.0,APAucará LB L001,B003,Bautizo. Bartola. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-07,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
3,4.0,APAucará LB L001,B004,Bautizo. Francisca,3v,3v,IMG_7000b,IMG_7000b,Bautizo,1790-10-20,...,,"Aucara, iglesia",Abreviatura poco visible en el margen,Aucara,,,,,Regular,Registrado por Edwin Gonzales en 2023
4,5.0,APAucará LB L001,B005,Bautizo. Pedro,3v,3v,IMG_7000b,IMG_7000b,Bautizo,1790-10-20,...,,"Aucara, iglesia",Margen roto y manchado de tinta,Aucara,,,,,Regular,Registrado por Edwin Gonzales en 2023


### Reduce DataFrames to their relevant columns

In [5]:
# Load the per-dataset lists of columns to keep (indexed below by the keys
# 'bautizo', 'matrimonio' and 'entierro').
# The context manager closes the file handle deterministically, and the
# explicit UTF-8 encoding avoids depending on the platform default.
with open("../data/mappings/usefulColumnsMapping.json", encoding="utf-8") as useful_columns_file:
    useful_columns = json.load(useful_columns_file)


In [6]:
# Bautismos useful columns

# Select the useful columns and drop the columns that are entirely empty in a
# single chained, non-mutating step. Calling dropna(inplace=True) on the frame
# produced by the column selection mutates a derived object, which pandas can
# flag with SettingWithCopyWarning.
BAUTISMOS_HARMONIZED = (
    BAUTISMOS_HARMONIZED[useful_columns['bautizo']]
    .dropna(axis=1, how='all')
)

# Show which columns survived the reduction.
BAUTISMOS_HARMONIZED.columns

Index(['file', 'identifier', 'event_type', 'event_date', 'baptized_name',
       'baptized_hometown', 'baptized_birth_date',
       'baptized_legitimacy_status', 'father_name', 'father_lastname',
       'father_social_condition', 'mother_name', 'mother_lastname',
       'mother_social_condition', 'parents_social_condition', 'godfather_name',
       'godfather_lastname', 'godfather_social_condition', 'godmother_name',
       'godmother_lastname', 'godmother_social_condition', 'event_place',
       'event_geographic_descriptor_1', 'event_geographic_descriptor_2',
       'event_geographic_descriptor_3', 'event_geographic_descriptor_4'],
      dtype='object')

In [7]:
# Matrimonios useful columns

# Select the useful columns and drop the columns that are entirely empty in a
# single chained, non-mutating step. Calling dropna(inplace=True) on the frame
# produced by the column selection mutates a derived object, which pandas can
# flag with SettingWithCopyWarning.
MATRIMONIOS_HARMONIZED = (
    MATRIMONIOS_HARMONIZED[useful_columns['matrimonio']]
    .dropna(axis=1, how='all')
)

# Show which columns survived the reduction.
MATRIMONIOS_HARMONIZED.columns

Index(['file', 'identifier', 'event_type', 'event_date', 'groom_name',
       'groom_lastname', 'groom_social_condition', 'groom_marital_status',
       'groom_age', 'groom_hometown', 'groom_resident_in',
       'groom_legitimacy_status', 'groom_father_name', 'groom_father_lastname',
       'groom_father_social_condition', 'groom_mother_name',
       'groom_mother_lastname', 'groom_mother_social_condition', 'bride_name',
       'bride_lastname', 'bride_social_condition', 'bride_marital_status',
       'bride_age', 'bride_hometown', 'bride_resident_in',
       'bride_legitimacy_status', 'bride_father_name', 'bride_father_lastname',
       'bride_father_social_condition', 'bride_mother_name',
       'bride_mother_lastname', 'bride_mother_social_condition',
       'godparent_1_name', 'godparent_1_lastname',
       'godparent_1_social_condition', 'godparent_2_name',
       'godparent_2_lastname', 'godparent_2_social_condition',
       'godparent_3_name', 'godparent_3_lastname', 'witness_1_

In [8]:
# Entierros useful columns

# Select the useful columns and drop the columns that are entirely empty in a
# single chained, non-mutating step. Calling dropna(inplace=True) on the frame
# produced by the column selection mutates a derived object, which pandas can
# flag with SettingWithCopyWarning.
ENTIERROS_HARMONIZED = (
    ENTIERROS_HARMONIZED[useful_columns['entierro']]
    .dropna(axis=1, how='all')
)

# Show which columns survived the reduction.
ENTIERROS_HARMONIZED.columns

Index(['file', 'identifier', 'event_type', 'event_date', 'doctrine',
       'event_place', 'deceased_name', 'deceased_lastname', 'deceased_age',
       'deceased_hometown', 'deceased_condition', 'deceased_marital_status',
       'deceased_legitimacy_status', 'father_name', 'father_lastname',
       'mother_name', 'mother_lastname', 'husband_name', 'wife_name',
       'burial_place', 'event_geographic_descriptor_1',
       'event_geographic_descriptor_2', 'event_geographic_descriptor_3',
       'event_geographic_descriptor_4'],
      dtype='object')

## Replace empty or placeholder values with `NaN`



In [9]:
# Replace cells that carry no textual information with NumPy NaN
def replace_empty_with_na(df):
    """
    Replace placeholder strings with np.nan.

    Every cell of ``df`` is visited, but only ``str`` values are candidates
    for replacement; any other value is returned unchanged. A string counts
    as a placeholder when, after stripping surrounding whitespace and
    lower-casing, it is one of: '', '-', '--', 'n/a', 'na', 'null', 'none'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape with placeholder strings replaced by
        ``np.nan``.
    """
    # Cells are compared after strip().lower(), so every entry here must be
    # lowercase; a capitalised 'None' entry could never match.
    placeholders = {'', '-', '--', 'n/a', 'na', 'null', 'none'}

    def clean_cell(val):
        if isinstance(val, str) and val.strip().lower() in placeholders:
            return np.nan
        return val

    # DataFrame.map was added in pandas 2.1; older releases expose the same
    # element-wise operation as applymap.
    elementwise = df.map if hasattr(df, "map") else df.applymap
    return elementwise(clean_cell)


In [10]:
# Turn placeholder strings into NaN in each dataset. The helper returns a new
# frame, so each name is rebound to the cleaned result.
BAUTISMOS_HARMONIZED = replace_empty_with_na(BAUTISMOS_HARMONIZED)
MATRIMONIOS_HARMONIZED = replace_empty_with_na(MATRIMONIOS_HARMONIZED)
ENTIERROS_HARMONIZED = replace_empty_with_na(ENTIERROS_HARMONIZED)

## Dates Normalization

Ensure dates are in a consistent format across all datasets. The dates should be in the format `YYYY-MM-DD`.

In [11]:
from actions.normalizers.DatesNormalizer import DateNormalizer

In [12]:
# Normalize the baptism event dates, write them back over the original
# column, then display the column to inspect the result.
bautismos_event_dates = DateNormalizer(BAUTISMOS_HARMONIZED['event_date']).normalize()
BAUTISMOS_HARMONIZED['event_date'] = bautismos_event_dates
BAUTISMOS_HARMONIZED['event_date']

0       1790-10-04
1       1790-10-06
2       1790-10-07
3       1790-10-20
4       1790-10-20
           ...    
6336    1888-12-10
6337    1888-12-11
6338    1888-12-12
6339    1888-12-15
6340    1888-12-16
Name: event_date, Length: 6341, dtype: object

In [13]:
# Normalize the marriage event dates, write them back over the original
# column, then display the column to inspect the result.
matrimonios_event_dates = DateNormalizer(MATRIMONIOS_HARMONIZED['event_date']).normalize()
MATRIMONIOS_HARMONIZED['event_date'] = matrimonios_event_dates
MATRIMONIOS_HARMONIZED['event_date']

0       1816-12-06
1       1816-12-12
2       1817-03-05
3       1817-03-10
4       1817-03-12
           ...    
1714    1907-10-27
1715    1908-01-13
1716    1908-01-15
1717    1908-02-15
1718    1908-03-17
Name: event_date, Length: 1719, dtype: object

In [14]:
# Normalize the burial event dates, write them back over the original
# column, then display the column to inspect the result.
entierros_event_dates = DateNormalizer(ENTIERROS_HARMONIZED['event_date']).normalize()
ENTIERROS_HARMONIZED['event_date'] = entierros_event_dates
ENTIERROS_HARMONIZED['event_date']

0       1846-10-06
1       1846-10-07
2       1846-11-02
3       1846-12-08
4       1847-02-23
           ...    
2193    1920-10-12
2194    1920-10-19
2195    1920-10-19
2196    1920-10-20
2197    1920-10-21
Name: event_date, Length: 2198, dtype: object

## Age Inferring

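Infer an approximate birth date for each individual from the age recorded in the document and the date of the event (baptism, marriage, burial). The inferred date replaces the recorded value in the corresponding column.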

In [15]:
from actions.generators.AgeInferrer import AgeInferrer

In [16]:
# Build an AgeInferrer from the baptism event dates and run it over the
# recorded birth-date column, overwriting that column with the result.
# NOTE(review): the displayed output contains calendar dates, so infer_all
# appears to resolve each entry to a date relative to the event - confirm
# against actions.generators.AgeInferrer.
bautismos_age_inferrer = AgeInferrer(BAUTISMOS_HARMONIZED['event_date'])
BAUTISMOS_HARMONIZED['baptized_birth_date'] = bautismos_age_inferrer.infer_all(
    BAUTISMOS_HARMONIZED['baptized_birth_date']
)

# Show event and birth dates side by side for a visual check.
BAUTISMOS_HARMONIZED[['event_date', 'baptized_birth_date']]

Unnamed: 0,event_date,baptized_birth_date
0,1790-10-04,1790-08-04
1,1790-10-06,1790-08-04
2,1790-10-07,1790-08-04
3,1790-10-20,1790-10-15
4,1790-10-20,1790-10-19
...,...,...
6336,1888-12-10,1888-12-09
6337,1888-12-11,1888-12-07
6338,1888-12-12,1888-12-06
6339,1888-12-15,1888-11-30


In [17]:
# Consistency check: a baptism cannot happen before the birth, so flag every
# row whose event_date is earlier than its baptized_birth_date.
# Values that cannot be parsed become NaT (errors='coerce'); comparisons with
# NaT evaluate to False, so those rows are never flagged.
event_dates = pd.to_datetime(BAUTISMOS_HARMONIZED['event_date'], errors='coerce')
birth_dates = pd.to_datetime(BAUTISMOS_HARMONIZED['baptized_birth_date'], errors='coerce')
invalid_mask = event_dates < birth_dates

if invalid_mask.any():
    print("Found invalid records:")
    print(BAUTISMOS_HARMONIZED.loc[invalid_mask, ['event_date', 'baptized_birth_date']])

Found invalid records:
      event_date baptized_birth_date
135   1792-03-29          1792-04-08
290   1794-01-01          1794-01-27
671   1797-07-15          1797-07-24
2814  1900-04-01          1900-04-09


> Unfortunately, these records list a birth date that is later than the baptism date, so at least one of the two dates was recorded incorrectly. Fixing them requires checking the original records.

In [18]:
# Run the inference for the groom and then the bride. A new AgeInferrer is
# built from the marriage event dates for each column, and the result replaces
# the recorded age in that column.
# NOTE(review): the displayed output contains calendar dates under the *_age
# column names - confirm the intended semantics in AgeInferrer.
for age_column in ('groom_age', 'bride_age'):
    MATRIMONIOS_HARMONIZED[age_column] = AgeInferrer(
        MATRIMONIOS_HARMONIZED['event_date']
    ).infer_all(MATRIMONIOS_HARMONIZED[age_column])

# Show the event date next to both inferred columns.
MATRIMONIOS_HARMONIZED[['event_date', 'groom_age', 'bride_age']]


Unnamed: 0,event_date,groom_age,bride_age
0,1816-12-06,,
1,1816-12-12,,
2,1817-03-05,,
3,1817-03-10,,
4,1817-03-12,,
...,...,...,...
1714,1907-10-27,1882-11-01,1880-11-01
1715,1908-01-13,1880-01-19,1879-01-19
1716,1908-01-15,1886-01-19,1888-01-19
1717,1908-02-15,1883-02-20,1886-02-19


In [19]:
# Build an AgeInferrer from the burial event dates and run it over the
# recorded age of the deceased, overwriting that column with the result.
# NOTE(review): the displayed output contains calendar dates under the
# deceased_age column name - confirm the intended semantics in AgeInferrer.
entierros_age_inferrer = AgeInferrer(ENTIERROS_HARMONIZED['event_date'])
ENTIERROS_HARMONIZED['deceased_age'] = entierros_age_inferrer.infer_all(
    ENTIERROS_HARMONIZED['deceased_age']
)

# Show the event date next to the inferred column.
ENTIERROS_HARMONIZED[['event_date', 'deceased_age']]

Unnamed: 0,event_date,deceased_age
0,1846-10-06,
1,1846-10-07,1821-10-13
2,1846-11-02,1766-11-21
3,1846-12-08,1806-12-18
4,1847-02-23,1797-03-06
...,...,...
2193,1920-10-12,
2194,1920-10-19,
2195,1920-10-19,
2196,1920-10-20,


## Names Normalization

Standardize names across datasets to ensure consistency. This includes normalizing first names, last names, and any other relevant name fields.

In [20]:
from actions.normalizers.NamesNormalizer import NamesNormalizer

In [21]:
def normalize_names_columns(series):
    """
    Clean one column of names.

    A new NamesNormalizer is created on every call and its ``clean_series``
    method is applied to ``series``; whatever that method returns is passed
    straight back to the caller.
    """
    return NamesNormalizer().clean_series(series)

In [22]:
# Name columns expected in the baptism records.
names_columns = [
    'baptized_name',
    'father_name', 'father_lastname',
    'mother_name', 'mother_lastname',
    'godfather_name', 'godfather_lastname',
    'godmother_name', 'godmother_lastname',
]

# Entirely empty columns were dropped earlier, so some expected columns may be
# absent. Resolve the available subset once and use it for both the
# normalization and the final display; selecting a missing column for display
# would otherwise raise KeyError even though the loop tolerated it.
bautismos_names_present = [
    name for name in names_columns if name in BAUTISMOS_HARMONIZED.columns
]

for col in bautismos_names_present:
    BAUTISMOS_HARMONIZED[col] = normalize_names_columns(BAUTISMOS_HARMONIZED[col])

# Display the normalized name columns.
BAUTISMOS_HARMONIZED[bautismos_names_present]

Unnamed: 0,baptized_name,father_name,father_lastname,mother_name,mother_lastname,godfather_name,godfather_lastname,godmother_name,godmother_lastname
0,domingo,lucas,ayquipa,sevastiana,quispe,vicente,guamani,,
1,dominga,juan,lulia,jospha,gomes,ignacio,varientos,,
2,bartola,jacinto,quispe,juliana,chinchay,,,rotonda,pocco
3,francisca,juan,cuebas,clemenzia,manco,,,ysabel,guillen
4,pedro,santos,manxo,baleriana,arango,,,josefa,santiago
...,...,...,...,...,...,...,...,...,...
6336,leocadio,miguel,pacheco,rosa,huarcaya,josé julián,bendezú,,
6337,mariano concepcion,facundo,vega,silvestra,urbano,fernando,mancco,,
6338,ambrosio,ysidro,ccasane,rita,palomino,juan,tito,,
6339,francisco,mariano,lopez,leocadia,medina,feliciano,dias,,


In [23]:
# Name columns expected in the marriage records.
matrimonios_names_columns = [
    'groom_name', 'groom_lastname',
    'groom_father_name', 'groom_father_lastname',
    'groom_mother_name', 'groom_mother_lastname',
    'bride_name', 'bride_lastname',
    'bride_father_name', 'bride_father_lastname',
    'bride_mother_name', 'bride_mother_lastname',
    'godparent_1_name', 'godparent_1_lastname',
    'godparent_2_name', 'godparent_2_lastname',
    'godparent_3_name', 'godparent_3_lastname',
    'witness_1_name', 'witness_1_lastname',
    'witness_2_name', 'witness_2_lastname',
    'witness_3_name', 'witness_3_lastname',
    'witness_4_name', 'witness_4_lastname'
]

# Entirely empty columns were dropped earlier, so some expected columns may be
# absent. Resolve the available subset once and use it for both the
# normalization and the final display; selecting a missing column for display
# would otherwise raise KeyError even though the loop tolerated it.
matrimonios_names_present = [
    name for name in matrimonios_names_columns if name in MATRIMONIOS_HARMONIZED.columns
]

for col in matrimonios_names_present:
    MATRIMONIOS_HARMONIZED[col] = normalize_names_columns(MATRIMONIOS_HARMONIZED[col])

# Display the normalized name columns.
MATRIMONIOS_HARMONIZED[matrimonios_names_present]

Unnamed: 0,groom_name,groom_lastname,groom_father_name,groom_father_lastname,groom_mother_name,groom_mother_lastname,bride_name,bride_lastname,bride_father_name,bride_father_lastname,...,godparent_3_name,godparent_3_lastname,witness_1_name,witness_1_lastname,witness_2_name,witness_2_lastname,witness_3_name,witness_3_lastname,witness_4_name,witness_4_lastname
0,josé manl manuel,de la roca,acencio,roca,leonor,guerrero,juana,rodrigues,pedro,rodrigues,...,,,agustin,castro,mariano,castro,juan,baldes,,
1,esteban,castillo,matheo,castillo,ma maria,torres,ambrocia,tasqui,pedro,tasqui,...,,,pedro,manco,carlos,canto,pedro,guamani,,
2,alexandro,ramires,leonor,romani,franca francisca,paucar,sipriana,coillo,cristobal,coillo,...,,,marcelo,llamuca,julian,urbano,antonio,urbano,,
3,jose,cuchu,acencio,cuchu,baleriana,antay,cacimira,flores,,,...,,,pablo,roque,antonio,urbano,cristobal,coillo,,
4,domingo,tito,,,marcela,guauya,petrona,guallpatuiru,agustin,guallpatuiru,...,,,marcelo,llamuca,antonio,guamani,mariano,guallpatuiru,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1714,patrocinio,chinchay,miguel,chinchay,andrea,polanco,lorenza,quispe,gervacio,quispe,...,,,juan de dios,barrientos,manuel,espinosa,crisostomo,pumarino,,
1715,gerónimo,cucho,ambrocio,cucho,gertrudis,serrano,teresa,jimenes,aniseto,jimenes,...,,,victor,saravia,mateo,aiquipa,felix,cucho,,
1716,josé,coro,felix,coro,natividad,cucho,emilia,huamani,patricio,huamani,...,,,pablo,de la cruz,victor,saravia,marcelo,ramos,,
1717,pedro,gutierres,ruperto,gutierrez,micaila,oscco,juliana,huarcaya,hilario,huarcaya,...,,,rafael,delgado,josé,vivanco,agustin,vicente,,


In [24]:
# Name columns expected in the burial records.
entierros_names_columns = [
    "deceased_name", "deceased_lastname",
    "father_name", "father_lastname",
    "mother_name", "mother_lastname",
    "husband_name", "wife_name",
]

# Entirely empty columns were dropped earlier, so some expected columns may be
# absent. Resolve the available subset once and use it for both the
# normalization and the final display; selecting a missing column for display
# would otherwise raise KeyError even though the loop tolerated it.
entierros_names_present = [
    name for name in entierros_names_columns if name in ENTIERROS_HARMONIZED.columns
]

for col in entierros_names_present:
    ENTIERROS_HARMONIZED[col] = normalize_names_columns(ENTIERROS_HARMONIZED[col])

# Display the normalized name columns.
ENTIERROS_HARMONIZED[entierros_names_present]

Unnamed: 0,deceased_name,deceased_lastname,father_name,father_lastname,mother_name,mother_lastname,husband_name,wife_name
0,julian,xavies,,,,,,mercedes lupa
1,joce,raime,,,,,,francisca cucho
2,martina,condori,,,,,luciano ccoyllo,
3,dorotea,ccoyllo,,,,,josé espinosa,
4,maría,romani,,,,,mariano huallpatuiro,
...,...,...,...,...,...,...,...,...
2193,nieves,huallpatuero,patrocinio,huallpatuero,teresa,urbano,,
2194,sinforiano,huamani,eustaquio,huamani,martina,llamoca,,gregoria ccoillo
2195,salomé,condori,sebastian,condori,anacla,roque,,
2196,maría,chinchay,abdon,chinchay,floriza,lopez,,


## Place Recognition

This step uses named-entity recognition (NER) to extract place entities from the text fields in the datasets.

In [25]:
from actions.extractors import placeRecognition

# One PlaceExtractor instance, shared by the three place-extraction cells below.
# NOTE(review): construction cost (e.g. loading an NER model) is not visible
# from this notebook - confirm in actions.extractors.placeRecognition.
extractor = placeRecognition.PlaceExtractor()

In [26]:
# Place-bearing columns expected in the baptism records.
bautismos_place_columns = [
    'baptized_hometown', 'event_place', 'event_geographic_descriptor_1',
    'event_geographic_descriptor_2', 'event_geographic_descriptor_3',
    'event_geographic_descriptor_4'
]

# Entirely empty columns were dropped earlier, so resolve the subset that
# actually exists once. The snapshot, the extraction loop and the final
# display all use it, which keeps the unguarded selections from raising
# KeyError when a column is missing.
bautismos_places_present = [
    name for name in bautismos_place_columns if name in BAUTISMOS_HARMONIZED.columns
]

# Keep an independent copy of the untouched place text before it is
# overwritten; the raw values are exported later in the notebook.
BAUTISMOS_PLACES_RAW = BAUTISMOS_HARMONIZED[bautismos_places_present].copy()

for col in bautismos_places_present:
    BAUTISMOS_HARMONIZED[col] = extractor.extract_places_per_row(BAUTISMOS_HARMONIZED[col])

# Display the extracted places.
BAUTISMOS_HARMONIZED[bautismos_places_present]

Unnamed: 0,baptized_hometown,event_place,event_geographic_descriptor_1,event_geographic_descriptor_2,event_geographic_descriptor_3,event_geographic_descriptor_4
0,,Pampamarca,,Pampamarca,,
1,,Pampamarca,,Pampamarca,,
2,,Pampamarca,,Pampamarca,,
3,,Aucara,,,,
4,,Aucara,,,,
...,...,...,...,...,...,...
6336,,Aucará,,,,
6337,,Aucará,,,,
6338,,Aucará,,Mayobamba,,
6339,,Aucará,,Huaicahuacho,,


In [27]:
# Place-bearing columns expected in the marriage records.
matrimonios_place_columns = [
    'groom_hometown',
    'groom_resident_in',
    'bride_hometown', 'bride_resident_in',
    'event_place', 'event_geographic_descriptor_1', 'event_geographic_descriptor_2',
    'event_geographic_descriptor_3', 'event_geographic_descriptor_4',
    'event_geographic_descriptor_5', 'event_geographic_descriptor_6'
]

# Entirely empty columns were dropped earlier, so resolve the subset that
# actually exists once. The snapshot, the extraction loop and the final
# display all use it, which keeps the unguarded selections from raising
# KeyError when a column is missing.
matrimonios_places_present = [
    name for name in matrimonios_place_columns if name in MATRIMONIOS_HARMONIZED.columns
]

# Keep an independent copy of the untouched place text before it is
# overwritten; the raw values are exported later in the notebook.
MATRIMONIOS_PLACES_RAW = MATRIMONIOS_HARMONIZED[matrimonios_places_present].copy()

for col in matrimonios_places_present:
    MATRIMONIOS_HARMONIZED[col] = extractor.extract_places_per_row(MATRIMONIOS_HARMONIZED[col])

# Display the extracted places.
MATRIMONIOS_HARMONIZED[matrimonios_places_present]

Unnamed: 0,groom_hometown,groom_resident_in,bride_hometown,bride_resident_in,event_place,event_geographic_descriptor_1,event_geographic_descriptor_2,event_geographic_descriptor_3,event_geographic_descriptor_4,event_geographic_descriptor_5,event_geographic_descriptor_6
0,Ciudad de Huamanga,Aucara,,,Aucara,,Huamanga,,,,
1,,,,,Aucara,,Colca,,,,
2,Pampamarca,,Pampamarca,,Aucara,,Pampamarca,,,,
3,Pampamarca,,Pampamarca,,Pampamarca|santa iglesia,,Pampamarca,,,,
4,,,,,Pampamarca|santa iglesia,,Pampamarca,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1714,Pampamarca,,Pampamarca,,Pampamarca,,Pampamarca,,,,
1715,,,,,,,,,,,
1716,,,,,,,,,,,
1717,,,,,Aucara,,,,,,


In [28]:
# Place-bearing columns expected in the burial records.
entierros_place_columns = [
    'event_place', 'deceased_hometown', 'burial_place', 'event_geographic_descriptor_1',
    'event_geographic_descriptor_2', 'event_geographic_descriptor_3',
    'event_geographic_descriptor_4'
]

# Entirely empty columns were dropped earlier, so resolve the subset that
# actually exists once. The snapshot, the extraction loop and the final
# display all use it, which keeps the unguarded selections from raising
# KeyError when a column is missing.
entierros_places_present = [
    name for name in entierros_place_columns if name in ENTIERROS_HARMONIZED.columns
]

# Keep an independent copy of the untouched place text before it is
# overwritten; the raw values are exported later in the notebook.
ENTIERROS_PLACES_RAW = ENTIERROS_HARMONIZED[entierros_places_present].copy()

for col in entierros_places_present:
    ENTIERROS_HARMONIZED[col] = extractor.extract_places_per_row(ENTIERROS_HARMONIZED[col])

# Display the extracted places.
ENTIERROS_HARMONIZED[entierros_places_present]

Unnamed: 0,event_place,deceased_hometown,burial_place,event_geographic_descriptor_1,event_geographic_descriptor_2,event_geographic_descriptor_3,event_geographic_descriptor_4
0,,,,,Lucanas,,
1,,,,,Lucanas,,
2,,,,,Lucanas,,
3,,,,,Lucanas,,
4,,,,,Lucanas,,
...,...,...,...,...,...,...,...
2193,Aucara,Santa Ana de Aucara,,,Santa Ana de Aucara,,
2194,Aucara,Pampamarca,,,Pampamarca,,
2195,Aucara,Santa Ana de Aucara,,,Santa Ana de Aucara,,
2196,Aucara,,,,,,


In [29]:
# Save the raw place strings, taken from the snapshots captured before
# place extraction overwrote the columns.

from utils import UniqueValues

# Feed the three pre-extraction snapshots to the extractor and request the
# result as a DataFrame.
# NOTE(review): judging by its name the helper collects distinct values across
# the frames - confirm the exact output schema in utils.UniqueValues.
uniqueValues = UniqueValues.UniqueValuesExtractor(
    [BAUTISMOS_PLACES_RAW, MATRIMONIOS_PLACES_RAW, ENTIERROS_PLACES_RAW]
).get_unique_values(return_dataframe=True)

# The output is written next to the raw inputs, without the index column.
uniqueValues.to_csv("../data/raw/raw_places.csv", index=False) # type: ignore

## Cleaning Audit

Basic audit to ensure that the data cleaning process has been successful.

## Save Cleaned Data

In [30]:
clean_data_folder = Path("../data/clean/")

# Create the output directory up front: to_csv does not create missing parent
# directories, so the export would fail on a fresh checkout.
clean_data_folder.mkdir(parents=True, exist_ok=True)

# Fill NaN values for consistency: every missing value is rewritten as np.nan
# before export.
BAUTISMOS_HARMONIZED = BAUTISMOS_HARMONIZED.fillna(value=np.nan)
MATRIMONIOS_HARMONIZED = MATRIMONIOS_HARMONIZED.fillna(value=np.nan)
ENTIERROS_HARMONIZED = ENTIERROS_HARMONIZED.fillna(value=np.nan)

# Write one CSV per dataset, without the positional index column.
BAUTISMOS_HARMONIZED.to_csv(clean_data_folder / "bautismos_clean.csv", index=False)
MATRIMONIOS_HARMONIZED.to_csv(clean_data_folder / "matrimonios_clean.csv", index=False)
ENTIERROS_HARMONIZED.to_csv(clean_data_folder / "entierros_clean.csv", index=False)

In [31]:
# Final preview of the cleaned baptism records.
BAUTISMOS_HARMONIZED.head()

Unnamed: 0,file,identifier,event_type,event_date,baptized_name,baptized_hometown,baptized_birth_date,baptized_legitimacy_status,father_name,father_lastname,...,godfather_lastname,godfather_social_condition,godmother_name,godmother_lastname,godmother_social_condition,event_place,event_geographic_descriptor_1,event_geographic_descriptor_2,event_geographic_descriptor_3,event_geographic_descriptor_4
0,APAucará LB L001,B001,Bautizo,1790-10-04,domingo,,1790-08-04,Hijo legitimo,lucas,ayquipa,...,guamani,,,,,Pampamarca,,Pampamarca,,
1,APAucará LB L001,B002,Bautizo,1790-10-06,dominga,,1790-08-04,Hija legitima,juan,lulia,...,varientos,,,,,Pampamarca,,Pampamarca,,
2,APAucará LB L001,B003,Bautizo,1790-10-07,bartola,,1790-08-04,Hija legitima,jacinto,quispe,...,,,rotonda,pocco,,Pampamarca,,Pampamarca,,
3,APAucará LB L001,B004,Bautizo,1790-10-20,francisca,,1790-10-15,Hija legitima,juan,cuebas,...,,,ysabel,guillen,,Aucara,,,,
4,APAucará LB L001,B005,Bautizo,1790-10-20,pedro,,1790-10-19,Hijo legitimo,santos,manxo,...,,,josefa,santiago,,Aucara,,,,


In [32]:
# Final preview of the cleaned marriage records.
MATRIMONIOS_HARMONIZED.head()

Unnamed: 0,file,identifier,event_type,event_date,groom_name,groom_lastname,groom_social_condition,groom_marital_status,groom_age,groom_hometown,...,witness_3_lastname,witness_4_name,witness_4_lastname,event_place,event_geographic_descriptor_1,event_geographic_descriptor_2,event_geographic_descriptor_3,event_geographic_descriptor_4,event_geographic_descriptor_5,event_geographic_descriptor_6
0,APAucará LM L001,M001,Matrimonio,1816-12-06,josé manl manuel,de la roca,"don, vecinos de este pueblo [Aucara]",soltero,,Ciudad de Huamanga,...,baldes,,,Aucara,,Huamanga,,,,
1,APAucará LM L001,M002,Matrimonio,1816-12-12,esteban,castillo,vecinos de esta doctrina [Aucara],soltero,,,...,guamani,,,Aucara,,Colca,,,,
2,APAucará LM L001,M003,Matrimonio,1817-03-05,alexandro,ramires,vecinos de dicho pueblo [Aucara],soltero,,Pampamarca,...,urbano,,,Aucara,,Pampamarca,,,,
3,APAucará LM L001,M004,Matrimonio,1817-03-10,jose,cuchu,vecinos de dicho [Pampamarca],soltero,,Pampamarca,...,coillo,,,Pampamarca|santa iglesia,,Pampamarca,,,,
4,APAucará LM L001,M005,Matrimonio,1817-03-12,domingo,tito,vecinos de dicho [Pampamarca],soltero,,,...,guallpatuiru,,,Pampamarca|santa iglesia,,Pampamarca,,,,


In [33]:
# Final preview of the cleaned burial records.
ENTIERROS_HARMONIZED.head()

Unnamed: 0,file,identifier,event_type,event_date,doctrine,event_place,deceased_name,deceased_lastname,deceased_age,deceased_hometown,...,father_lastname,mother_name,mother_lastname,husband_name,wife_name,burial_place,event_geographic_descriptor_1,event_geographic_descriptor_2,event_geographic_descriptor_3,event_geographic_descriptor_4
0,APAucará LD L001,E001,Entierro,1846-10-06,Parroquia de Aucará,,julian,xavies,,,...,,,,,mercedes lupa,,,Lucanas,,
1,APAucará LD L001,E002,Entierro,1846-10-07,Parroquia de Aucará,,joce,raime,1821-10-13,,...,,,,,francisca cucho,,,Lucanas,,
2,APAucará LD L001,E003,Entierro,1846-11-02,Parroquia de Aucará,,martina,condori,1766-11-21,,...,,,,luciano ccoyllo,,,,Lucanas,,
3,APAucará LD L001,E004,Entierro,1846-12-08,Parroquia de Aucará,,dorotea,ccoyllo,1806-12-18,,...,,,,josé espinosa,,,,Lucanas,,
4,APAucará LD L001,E005,Entierro,1847-02-23,Parroquia de Aucará,,maría,romani,1797-03-06,,...,,,,mariano huallpatuiro,,,,Lucanas,,
