In [1]:
%%capture capt
!pip install -r requirements.txt

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output
import pandas as pd
from tqdm import tqdm

import json
import yaml
import re

# Opt in to pandas' future "no silent downcasting" behaviour.
# NOTE(review): presumably set to avoid the FutureWarning raised by the
# `.fillna` call used when the DataFrame is built — confirm.
pd.set_option('future.no_silent_downcasting', True)

# DATA

## Load data and tokens

In [3]:
# Load the raw entities file into `data`.
# The encoding is pinned to UTF-8 so that accented characters decode the same
# way on every platform instead of depending on the locale's default encoding.
with open('../data/entities.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Load the token configuration into `tokens`.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default encoding.
with open('../data/tokens.yml', 'r', encoding='utf-8') as f:
    tokens = yaml.safe_load(f)

# Reverse lookup: each column's 'start' token -> the column name it introduces.
token_to_col = {spec['start']: col for col, spec in tokens.items()}

## Preprocessing

In [5]:
def blank_dict(tokens=tokens):
    """Build an empty record with one entry per key of `tokens`.

    Parameters
    ----------
    tokens : dict, optional
        Mapping whose keys become the keys of the record. Defaults to the
        module-level `tokens` object bound when this function was defined.

    Returns
    -------
    dict
        A new dict mapping every key of `tokens` to None.
    """
    return {column: None for column in tokens.keys()}

def split_by_token(line, token_to_col=token_to_col):
    """Cut `line` at every known token, keeping the tokens themselves.

    The keys of `token_to_col` are regex-escaped and joined into a single
    capturing alternation, so `re.split` returns the tokens interleaved with
    the text found between them.

    Parameters
    ----------
    line : str
        The text to split.
    token_to_col : dict, optional
        Mapping whose keys are the tokens to split on.

    Returns
    -------
    list of str
        The non-empty pieces, each stripped of surrounding whitespace.
        Emptiness is tested *before* stripping, so a piece made only of
        whitespace is kept and comes back as ''.
    """
    alternation = '|'.join(map(re.escape, token_to_col.keys()))
    pieces = re.split('(' + alternation + ')', line)

    cleaned = []
    for piece in pieces:
        if piece:
            cleaned.append(piece.strip())
    return cleaned

def split_to_dict(split, token_to_col=token_to_col, dict_split=None):
    """Convert an alternating ``[token, value, token, value, ...]`` list into a record.

    Parameters
    ----------
    split : list of str
        Items at even positions are read as tokens, the item following each
        token is read as its value.
    token_to_col : dict, optional
        Mapping from token to the column name the value is stored under.
    dict_split : dict, optional
        Record to fill in place. When None, a fresh record is created with
        `blank_dict()`.

    Returns
    -------
    dict
        The filled record (the same object as `dict_split` when one is given).

    Raises
    ------
    IndexError
        If a token is the last item of `split` (no value follows it).
    KeyError
        If an item in a token position is not a key of `token_to_col`.
    """
    record = blank_dict() if dict_split is None else dict_split

    position = 0
    while position < len(split):
        # Read both items before the lookup so a missing value is reported
        # (IndexError) ahead of an unknown token (KeyError).
        token, element = split[position], split[position + 1]
        record[token_to_col[token]] = element
        position += 2
    return record

In [6]:
# Parse every line of every entry in `data` into one record (dict) per line.
# `df_dict` maps a running integer (0, 1, 2, ...) to each parsed record.
df_dict = {}
counter = 0
n_skipped = 0  # lines that do not follow the token/value layout
for key in tqdm(data.keys()):
    for line in data[key].split('\n'):
        try:
            split = split_by_token(line)
            split_dict = split_to_dict(split)
        except (KeyError, IndexError):
            # Best effort: a line whose pieces do not alternate token/value
            # (unknown token or token without a value) is skipped, but it is
            # counted so the loss is visible. Any other exception is a real
            # error and is allowed to propagate.
            n_skipped += 1
            continue
        df_dict[counter] = split_dict
        counter += 1

if n_skipped:
    print(f'Skipped {n_skipped} malformed line(s) out of {counter + n_skipped}.')

100%|██████████| 1218/1218 [00:00<00:00, 1909.05it/s]


In [7]:
# One row per parsed record; the None placeholders become NaN.
# `from_dict` is a classmethod, so it is called on the class directly.
df = pd.DataFrame.from_dict(df_dict, orient='index').fillna(value=np.nan)

# Drop the rows in which every column is missing (a blank input line yields
# such a row). Rows are selected by index *label* throughout, so the filter
# stays correct even if the index is not 0..n-1.
all_missing = df.isna().all(axis=1)
indices_to_remove = df.index[all_missing].tolist()

df = df.loc[~all_missing]

In [8]:
# Features: every column except the one the target is derived from.
X = df.drop(columns=['surname_household'])
# Target: 1 where `surname_household` is present, 0 where it is missing
# (vectorised equivalent of applying `0 if pd.isna(x) else 1` to each value).
y = df['surname_household'].notna().astype('int64')

In [11]:
# Render the first ten feature rows as a LaTeX `tabular` string.
X.head(10).to_latex()

'\\begin{tabular}{llllllllllllll}\n\\toprule\n & age & birth_date & civil_status & education_level & employer & firstname & link & lob & maiden_name & nationality & observation & occupation & surname \\\\\n\\midrule\n0 & 25 & NaN & Garçon & NaN & NaN & Cyrille & NaN & NaN & NaN & française & NaN & menuisier & Breton \\\\\n1 & 30 & NaN & Garçon & NaN & NaN & Auguste & NaN & NaN & NaN & Piémontaise & NaN & vitrier & NaN \\\\\n2 & 24 & NaN & Garçon & NaN & NaN & Pierre & NaN & NaN & NaN & Piémontaise & NaN & vitrier & NaN \\\\\n3 & 48 & NaN & Homme marié & NaN & NaN & Alexandre & NaN & NaN & NaN & française & NaN & prop re & NaN \\\\\n4 & 30 & NaN & NaN & NaN & NaN & Zélie & sa fe & NaN & NaN & française & NaN & prop re & Vignat \\\\\n5 & 24 & NaN & Fille & NaN & NaN & Caroline & NaN & NaN & NaN & française & NaN & domestique & Houy \\\\\n6 & 24 & NaN & Fille & NaN & NaN & Esther & NaN & NaN & NaN & française & NaN & fe de chambre & Violet \\\\\n7 & 46 & NaN & Garçon & NaN & NaN & Françoi