In [102]:
import json
import yaml
import pandas as pd
from tqdm import tqdm
import re
import numpy as np

## Load data and tokens

In [15]:
with open('data/entities.json', 'r') as f:
    data = json.load(f)

In [65]:
with open('data/tokens.yml', 'r') as f:
    tokens = yaml.safe_load(f)
    
token_to_col = {}
for col in tokens.keys():
    token = tokens[col]['start']
    token_to_col[token] = col

## Preprocessing

In [70]:
def blank_dict(tokens=tokens):
    d = {}
    for col in list(tokens.keys()):
        d[col] = None
    return d

def split_by_token(line, token_to_col=token_to_col):
    tokens_to_split = list(token_to_col.keys())
    pattern = '(' + '|'.join(re.escape(token) for token in tokens_to_split) + ')'
    
    splits = re.split(pattern, line)
    # Filter out empty strings and trim kept strings
    splits = [part.strip() for part in splits if part]
    return splits

def split_to_dict(split, token_to_col=token_to_col, dict_split=None):
    if dict_split is None:
        dict_split = blank_dict()
    for i in range(0, len(split), 2):
        token = split[i]
        element = split[i+1]
        column = token_to_col[token]
        dict_split[column] = element
    return dict_split

In [89]:
df_dict = {}
counter = 0
for key in tqdm(data.keys()): 
    for line in data[key].split('\n'):
        try:
            split = split_by_token(line)
            split_dict = split_to_dict(split)
            df_dict[counter] = split_dict
                    
            counter += 1
        except:
            pass

100%|██████████| 1218/1218 [00:00<00:00, 2578.85it/s]


In [107]:
X = pd.DataFrame().from_dict(df_dict, orient='index').drop(columns=['surname_household']).fillna(value=np.nan)
y = pd.DataFrame().from_dict(df_dict, orient='index')['surname_household'].apply(
                                lambda x: 1 if x is not None else 0
                            )

In [112]:
X

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25436,,1887,,,,Annunziata,épouse,idem,,idem,,,Berni-Laureti
25437,,1914,,,,Primo,fils,idem,,idem,,,Berni
25438,,,,,,,,,,,,,
25439,,,,,,,,,,,,,


## Classification