# Data Cleaning

This notebook cleans the raw datasets and documents each step of the cleaning process along the way.

In [1]:
# Libraries

import pandas as pd
from pathlib import Path
from georesolver import PlaceResolver


In [2]:
# Custom modules
from utils.ColumnManager import ColumnManager

## Raw Datasets

The raw datasets are stored in the `data/raw` directory. The datasets include:

- `bautismos.csv`: Baptism records
- `matrimonios.csv`: Marriage records
- `entierros.csv`: Burial records

In [3]:
# Read the three raw CSV exports (baptisms, marriages, burials), in that order,
# and bind each resulting DataFrame to its own *_RAW name.
BAUTISMOS_RAW, MATRIMONIOS_RAW, ENTIERROS_RAW = (
    pd.read_csv(raw_csv)
    for raw_csv in (
        "../data/raw/bautismos.csv",
        "../data/raw/matrimonios.csv",
        "../data/raw/entierros.csv",
    )
)

# Preview the first rows of the baptism records.
BAUTISMOS_RAW.head()

Unnamed: 0,Secuencia,Unidad Documental Compuesta (a la que pertenece),Identificador (es recomendable seguir una secuencia numeral como la mostrada en los ejemplos),Título (incluir un título breve para cada documento),Folio inicial del documento (convertir como se muestra abajo),Folio final del documento (convertir como se muestra abajo),Imagen inicial (estos valores serán añadidos cuando comienze el proceso de revisión de imágenes),Imagen final (estos valores serán añadidos cuando comienze el proceso de revisión de imágenes),Tipo de evento,Fecha aaaa-mm-dd,...,Condición de la madrina,Lugar de bautizo,Notas adicionales del documento,Descriptor Geográfico 1,Descriptor Geográfico 2,Descriptor Geográfico 3,Descriptor Geográfico 4,5,Características físicas (Estado de conservación de los materiales físicos),Historia de revisión (de los materiales digitalizados)
0,1.0,APAucará LB L001,B001,Bautizo. Domingo. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-04,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
1,2.0,APAucará LB L001,B002,Bautizo. Dominga. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-06,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
2,3.0,APAucará LB L001,B003,Bautizo. Bartola. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-07,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
3,4.0,APAucará LB L001,B004,Bautizo. Francisca,3v,3v,IMG_7000b,IMG_7000b,Bautizo,1790-10-20,...,,"Aucara, iglesia",Abreviatura poco visible en el margen,Aucara,,,,,Regular,Registrado por Edwin Gonzales en 2023
4,5.0,APAucará LB L001,B005,Bautizo. Pedro,3v,3v,IMG_7000b,IMG_7000b,Bautizo,1790-10-20,...,,"Aucara, iglesia",Margen roto y manchado de tinta,Aucara,,,,,Regular,Registrado por Edwin Gonzales en 2023


## Column Harmonization

Rename the columns in the datasets to ensure consistency across different data sources.

Column names are mapped using the JSON files located in the `data/mappings` directory.

In [4]:
# A single ColumnManager instance is reused for all three datasets.
column_manager = ColumnManager()

# JSON mapping file for each dataset.
bautismoMapping = Path("../data/mappings/bautismosMapping.json")
matrimonioMapping = Path("../data/mappings/matrimoniosMapping.json")
entierroMapping = Path("../data/mappings/entierrosMapping.json")

# Harmonize each raw frame with its own mapping file
# (baptisms, then marriages, then burials).
BAUTISMOS_HARMONIZED = column_manager.harmonize_columns(
    BAUTISMOS_RAW,
    bautismoMapping,
)
MATRIMONIOS_HARMONIZED = column_manager.harmonize_columns(
    MATRIMONIOS_RAW,
    matrimonioMapping,
)
ENTIERROS_HARMONIZED = column_manager.harmonize_columns(
    ENTIERROS_RAW,
    entierroMapping,
)

# Preview the harmonized baptism records.
BAUTISMOS_HARMONIZED.head()

Unnamed: 0,id,file,identifier,title,start_folio,end_folio,start_image,end_image,event_type,date,...,godmother_social_condition,baptism_place,additional_notes,geographic_descriptor_1,geographic_descriptor_2,geographic_descriptor_3,geographic_descriptor_4,other,record_physical_characteristics,revision_history
0,1.0,APAucará LB L001,B001,Bautizo. Domingo. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-04,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
1,2.0,APAucará LB L001,B002,Bautizo. Dominga. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-06,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
2,3.0,APAucará LB L001,B003,Bautizo. Bartola. Tributarios,3r,3r,IMG_7000a,IMG_7000a,Bautizo,1790-10-07,...,,"Pampamarca, iglesia",,Aucara,Pampamarca,,,,Regular,Registrado por Edwin Gonzales en 2023
3,4.0,APAucará LB L001,B004,Bautizo. Francisca,3v,3v,IMG_7000b,IMG_7000b,Bautizo,1790-10-20,...,,"Aucara, iglesia",Abreviatura poco visible en el margen,Aucara,,,,,Regular,Registrado por Edwin Gonzales en 2023
4,5.0,APAucará LB L001,B005,Bautizo. Pedro,3v,3v,IMG_7000b,IMG_7000b,Bautizo,1790-10-20,...,,"Aucara, iglesia",Margen roto y manchado de tinta,Aucara,,,,,Regular,Registrado por Edwin Gonzales en 2023


### Reduce DataFrames to their relevant columns

In [5]:
# Bautismos useful columns
BAUTISMOS_RELEVANT_COLUMNS = [
    'date', 'baptized_name', 'hometown', 'birth_date', 'baptized_legitimacy_status',
    'father_name', 'father_lastname', 'father_social_condition', 'mother_name',
    'mother_lastname', 'mother_social_condition', 'parents_social_condition',
    'godfather_name', 'godfather_lastname', 'godfather_social_condition',
    'godmother_name', 'godmother_lastname', 'godmother_social_condition',
    'baptism_place', 'additional_notes', 'geographic_descriptor_1',
    'geographic_descriptor_2', 'geographic_descriptor_3',
    'geographic_descriptor_4', 'other'
]

# Keep only the relevant columns, then remove columns that are entirely empty.
# dropna() is chained so it returns a new frame; calling it with inplace=True
# on the result of a column selection mutates a derived frame and can trigger
# pandas' SettingWithCopyWarning.
# NOTE: this rebinds BAUTISMOS_HARMONIZED. If any listed column is dropped as
# empty, re-running this cell alone raises KeyError; re-run the harmonization
# cell first.
BAUTISMOS_HARMONIZED = (
    BAUTISMOS_HARMONIZED[BAUTISMOS_RELEVANT_COLUMNS]
    .dropna(axis=1, how='all')
)

BAUTISMOS_HARMONIZED.columns

Index(['date', 'baptized_name', 'hometown', 'birth_date',
       'baptized_legitimacy_status', 'father_name', 'father_lastname',
       'father_social_condition', 'mother_name', 'mother_lastname',
       'mother_social_condition', 'parents_social_condition', 'godfather_name',
       'godfather_lastname', 'godfather_social_condition', 'godmother_name',
       'godmother_lastname', 'godmother_social_condition', 'baptism_place',
       'additional_notes', 'geographic_descriptor_1',
       'geographic_descriptor_2', 'geographic_descriptor_3',
       'geographic_descriptor_4'],
      dtype='object')

In [6]:
# Matrimonios useful columns
MATRIMONIOS_RELEVANT_COLUMNS = [
    'date',
    # groom
    'groom_name', 'groom_lastname', 'groom_social_condition',
    'groom_marital_status', 'groom_age', 'groom_hometown',
    'groom_resident_in', 'groom_legitimacy_status',
    'groom_father_name', 'groom_father_lastname', 'groom_father_social_condition',
    'groom_mother_name', 'groom_mother_lastname', 'groom_mother_social_condition',
    # bride
    'bride_name', 'bride_lastname', 'bride_social_condition',
    'bride_marital_status', 'bride_age', 'bride_hometown',
    'bride_resident_in', 'bride_legitimacy_status',
    'bride_father_name', 'bride_father_lastname', 'bride_father_social_condition',
    'bride_mother_name', 'bride_mother_lastname', 'bride_mother_social_condition',
    # godparents
    'godparent_1_name', 'godparent_1_lastname', 'godparent_1_social_condition',
    'godparent_2_name', 'godparent_2_lastname', 'godparent_2_social_condition',
    'godparent_3_name', 'godparent_3_lastname', 'godparent_3_social_condition',
    # witnesses
    'witness_1_name', 'witness_1_lastname',
    'witness_2_name', 'witness_2_lastname',
    'witness_3_name', 'witness_3_lastname',
    'witness_4_name', 'witness_4_lastname',
    # places
    'marriage_place', 'geographic_descriptor_1', 'geographic_descriptor_2',
    'geographic_descriptor_3', 'geographic_descriptor_4',
    'geographic_descriptor_5', 'geographic_descriptor_6',
]

# Keep only the relevant columns, then remove columns that are entirely empty.
# dropna() is chained so it returns a new frame; calling it with inplace=True
# on the result of a column selection mutates a derived frame and can trigger
# pandas' SettingWithCopyWarning.
# NOTE: this rebinds MATRIMONIOS_HARMONIZED. If any listed column is dropped as
# empty, re-running this cell alone raises KeyError; re-run the harmonization
# cell first.
MATRIMONIOS_HARMONIZED = (
    MATRIMONIOS_HARMONIZED[MATRIMONIOS_RELEVANT_COLUMNS]
    .dropna(axis=1, how='all')
)

MATRIMONIOS_HARMONIZED.columns

Index(['date', 'groom_name', 'groom_lastname', 'groom_social_condition',
       'groom_marital_status', 'groom_age', 'groom_hometown',
       'groom_resident_in', 'groom_legitimacy_status', 'groom_father_name',
       'groom_father_lastname', 'groom_father_social_condition',
       'groom_mother_name', 'groom_mother_lastname',
       'groom_mother_social_condition', 'bride_name', 'bride_lastname',
       'bride_social_condition', 'bride_marital_status', 'bride_age',
       'bride_hometown', 'bride_resident_in', 'bride_legitimacy_status',
       'bride_father_name', 'bride_father_lastname',
       'bride_father_social_condition', 'bride_mother_name',
       'bride_mother_lastname', 'bride_mother_social_condition',
       'godparent_1_name', 'godparent_1_lastname',
       'godparent_1_social_condition', 'godparent_2_name',
       'godparent_2_lastname', 'godparent_2_social_condition',
       'godparent_3_name', 'godparent_3_lastname', 'witness_1_name',
       'witness_1_lastname', 'witne

In [7]:
# Entierros useful columns
ENTIERROS_RELEVANT_COLUMNS = [
    'date', 'doctrine', 'place',
    'deceased_name', 'deceased_lastname', 'age', 'hometown',
    'condition', 'marital_status', 'deceased_legitimacy_status',
    'father_name', 'father_lastname', 'mother_name', 'mother_lastname',
    'husband_name', 'wife_name',
    'cause_of_death', 'burial_place',
    'geographic_descriptor_1', 'geographic_descriptor_2',
    'geographic_descriptor_3', 'geographic_descriptor_4',
]

# Keep only the relevant columns, then remove columns that are entirely empty.
# dropna() is chained so it returns a new frame; calling it with inplace=True
# on the result of a column selection mutates a derived frame and can trigger
# pandas' SettingWithCopyWarning.
# NOTE: this rebinds ENTIERROS_HARMONIZED. If any listed column is dropped as
# empty, re-running this cell alone raises KeyError; re-run the harmonization
# cell first.
ENTIERROS_HARMONIZED = (
    ENTIERROS_HARMONIZED[ENTIERROS_RELEVANT_COLUMNS]
    .dropna(axis=1, how='all')
)

ENTIERROS_HARMONIZED.columns

Index(['date', 'doctrine', 'place', 'deceased_name', 'deceased_lastname',
       'age', 'hometown', 'condition', 'marital_status',
       'deceased_legitimacy_status', 'father_name', 'father_lastname',
       'mother_name', 'mother_lastname', 'husband_name', 'wife_name',
       'cause_of_death', 'burial_place', 'geographic_descriptor_1',
       'geographic_descriptor_2', 'geographic_descriptor_3',
       'geographic_descriptor_4'],
      dtype='object')