In [1]:
import json
import yaml
import pandas as pd
from tqdm import tqdm
import re
import numpy as np

pd.set_option('future.no_silent_downcasting', True)

## Load data and tokens

In [2]:
with open('data/entities.json', 'r') as f:
    data = json.load(f)

In [3]:
with open('data/tokens.yml', 'r') as f:
    tokens = yaml.safe_load(f)
    
token_to_col = {}
for col in tokens.keys():
    token = tokens[col]['start']
    token_to_col[token] = col

## Preprocessing

In [4]:
def blank_dict(tokens=tokens):
    d = {}
    for col in list(tokens.keys()):
        d[col] = None
    return d

def split_by_token(line, token_to_col=token_to_col):
    tokens_to_split = list(token_to_col.keys())
    pattern = '(' + '|'.join(re.escape(token) for token in tokens_to_split) + ')'
    
    splits = re.split(pattern, line)
    # Filter out empty strings and trim kept strings
    splits = [part.strip() for part in splits if part]
    return splits

def split_to_dict(split, token_to_col=token_to_col, dict_split=None):
    if dict_split is None:
        dict_split = blank_dict()
    for i in range(0, len(split), 2):
        token = split[i]
        element = split[i+1]
        column = token_to_col[token]
        dict_split[column] = element
    return dict_split

In [5]:
df_dict = {}
counter = 0
for key in tqdm(data.keys()): 
    for line in data[key].split('\n'):
        try:
            split = split_by_token(line)
            split_dict = split_to_dict(split)
            df_dict[counter] = split_dict
                    
            counter += 1
        except:
            pass

100%|██████████| 1218/1218 [00:00<00:00, 3295.41it/s]


In [6]:
df = pd.DataFrame().from_dict(df_dict, orient='index').fillna(value=np.nan)

indices_to_remove = []
for i in range(len(df)):
    if np.all(df.iloc[i].isna()):
        indices_to_remove.append(i)

df = df.loc[~df.index.isin(indices_to_remove)]

In [7]:
X = df.drop(columns=['surname_household'])
y = df['surname_household'].apply(
                                lambda x: 0 if pd.isna(x) else 1
                            )

In [8]:
X

Unnamed: 0,age,birth_date,civil_status,education_level,employer,firstname,link,lob,maiden_name,nationality,observation,occupation,surname
0,25,,Garçon,,,Cyrille,,,,française,,menuisier,Breton
1,30,,Garçon,,,Auguste,,,,Piémontaise,,vitrier,
2,24,,Garçon,,,Pierre,,,,Piémontaise,,vitrier,
3,48,,Homme marié,,,Alexandre,,,,française,,prop re,
4,30,,,,,Zélie,sa fe,,,française,,prop re,Vignat
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25433,,1869,,,,Marie,chef,Pailharès,,idem,,,
25434,,1863,,,Cara,Marie,chef,St Naz en Royans,,idem,,ouv chaus res,
25435,,1886,,,Baretto,Nello,chef,Castel,,italienne,,manoeuvre,
25436,,1887,,,,Annunziata,épouse,idem,,idem,,,Berni-Laureti


## Gradient Boosing

In [25]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import log_loss

#### Ordinal Encoding

In [10]:
enc = OrdinalEncoder()
X_encoded = enc.fit_transform(X)

In [21]:
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y, test_size=0.20, stratify=y)

### Gradient Boosting

In [26]:
param_grid = {
    "learning_rate": [1e-1, 1e-2, 1e-3],
    "max_leaf_nodes": [15, 31, 100],
    "max_depth": [None, 5, 10, 20],
    "min_samples_leaf": [10, 20, 50],
    "l2_regularization": [0.0, 0.1, 1.0]
}

model = HistGradientBoostingClassifier(max_iter=10_000, early_stopping=True, class_weight='balanced', validation_fraction=0.2)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1, verbose=10)

grid_search.fit(X_train_enc, y_train_enc)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV 1/3; 7/324] START l2_regularization=0.0, learning_rate=0.1, max_depth=None, max_leaf_nodes=100, min_samples_leaf=10
[CV 1/3; 7/324] END l2_regularization=0.0, learning_rate=0.1, max_depth=None, max_leaf_nodes=100, min_samples_leaf=10;, score=0.995 total time=   0.8s
[CV 1/3; 21/324] START l2_regularization=0.0, learning_rate=0.1, max_depth=10, max_leaf_nodes=15, min_samples_leaf=50
[CV 1/3; 21/324] END l2_regularization=0.0, learning_rate=0.1, max_depth=10, max_leaf_nodes=15, min_samples_leaf=50;, score=0.996 total time=   0.8s
[CV 2/3; 36/324] START l2_regularization=0.0, learning_rate=0.1, max_depth=20, max_leaf_nodes=100, min_samples_leaf=50
[CV 2/3; 36/324] END l2_regularization=0.0, learning_rate=0.1, max_depth=20, max_leaf_nodes=100, min_samples_leaf=50;, score=0.996 total time=   0.9s
[CV 3/3; 45/324] START l2_regularization=0.0, learning_rate=0.01, max_depth=None, max_leaf_nodes=100, min_samples_leaf=50
[CV 3/3;

In [32]:
# Print the best parameters found
print("Best parameters:", grid_search.best_params_)

Best parameters: {'l2_regularization': 0.0, 'learning_rate': 0.01, 'max_depth': None, 'max_leaf_nodes': 100, 'min_samples_leaf': 20}


In [27]:
# Get the best model from the grid search
clf = grid_search.best_estimator_
clf.fit(X_train_enc, y_train_enc)

In [28]:
#clf = HistGradientBoostingClassifier(learning_rate=5e-3, max_leaf_nodes=100, max_iter=10_000, early_stopping=True, class_weight='balanced', validation_fraction=0.2)

In [29]:
print(f"Train accuracy: {clf.score(X_train_enc, y_train_enc):.3f}")
print(f"Test accuracy: {clf.score(X_test_enc, y_test_enc):.3f}")

Train accuracy: 0.999
Test accuracy: 0.997


In [30]:
print(f'Train CE loss = {log_loss(y_train_enc, clf.predict_proba(X_train_enc)):.4f}')
print(f'Test CE loss = {log_loss(y_test_enc, clf.predict_proba(X_test_enc)):.4f}')

Train CE loss = 0.0028
Test CE loss = 0.0103


## Encode through pre-trained model

In [71]:
from transformers import BartTokenizer, BartModel

In [72]:
X_str = np.array([' '.join([str(x) for x in X.iloc[i].dropna(inplace=False).values]) for i in range(len(X))], dtype=str)

In [73]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartModel.from_pretrained('facebook/bart-base')

In [74]:
inputs

{'input_ids': tensor([[    0,  1244,  4974,  3381,   261, 22055,  4061,  6664,   260, 19393,
          1496,  5765,   354,   906,  5811,  1054,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [87]:
inputs = tokenizer(X_str[10], return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

In [88]:
last_hidden_states.cpu().detach().numpy().squeeze().shape

(21, 768)

In [86]:
a.shape

(17, 768)