In [98]:
import json
from transformers import PreTrainedTokenizerFast, CamembertForTokenClassification, AutoModelForTokenClassification
import yaml
import re
from datasets import load_dataset, load_metric,Dataset
import datasets
from sklearn.model_selection import train_test_split
import importlib
import ast
import pandas as pd

In [29]:
with open('tokens.yml') as f:
    tokens = yaml.load(f, Loader=yaml.FullLoader)


def get_labels(tokens):
    tags = {}
    for token in tokens:
        tags[tokens[token]['start']] = token
    return tags

tag2literal = get_labels(tokens)

literal2tag=[]
for t in tag2literal.keys():
    literal2tag.append(tag2literal[t])


In [30]:
tag2literal

{'Ⓐ': 'age',
 'Ⓑ': 'birth_date',
 'Ⓒ': 'civil_status',
 'Ⓓ': 'education_level',
 'Ⓔ': 'employer',
 'Ⓕ': 'firstname',
 'Ⓗ': 'link',
 'Ⓘ': 'lob',
 'Ⓙ': 'maiden_name',
 'Ⓚ': 'nationality',
 'Ⓛ': 'observation',
 'Ⓜ': 'occupation',
 'Ⓞ': 'surname',
 'Ⓟ': 'surname_household'}

In [31]:
literal2tag

['age',
 'birth_date',
 'civil_status',
 'education_level',
 'employer',
 'firstname',
 'link',
 'lob',
 'maiden_name',
 'nationality',
 'observation',
 'occupation',
 'surname',
 'surname_household']

In [145]:
df=pd.read_csv('data.csv')

In [147]:
dataset=load_dataset('csv',data_files='data.csv')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [148]:
features=datasets.Features(
                {
                    "text": datasets.Sequence(datasets.Value("string")),
                    "labels": datasets.Sequence(
                        datasets.features.ClassLabel(
                            num_classes=len(literal2tag),
                            names=literal2tag,
                        )
                    ),  
                    
                }
            )

In [149]:
features

{'text': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'labels': Sequence(feature=ClassLabel(names=['age', 'birth_date', 'civil_status', 'education_level', 'employer', 'firstname', 'link', 'lob', 'maiden_name', 'nationality', 'observation', 'occupation', 'surname', 'surname_household'], id=None), length=-1, id=None)}

In [150]:
dataset = dataset['train'].map(lambda example: {"text": ast.literal_eval(example["text"]), "labels": ast.literal_eval(example["labels"])}, features=features)

Map:   0%|          | 0/23054 [00:00<?, ? examples/s]

In [151]:
dataset=dataset.train_test_split(test_size=0.2)

In [152]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 18443
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 4611
    })
})

In [39]:
tokenizer=PreTrainedTokenizerFast.from_pretrained('distilbert-base-uncased')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [40]:
tokenizer_french=PreTrainedTokenizerFast.from_pretrained('camembert-base')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'CamembertTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [41]:
tokenizer_french.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [21]:
label_all_tokens = False
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["text"], padding=True,truncation=True, is_split_into_words=True,return_tensors='pt')
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
dataset['train'][0:1]

{'text': [['Guyot', 'Pierre', 'cultivateur']], 'labels': [[13, 5, 11]]}

In [22]:
tokenized_datasets=dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/20059 [00:00<?, ? examples/s]

Map:   0%|          | 0/5015 [00:00<?, ? examples/s]

In [24]:
tokenized_datasets['train'][0:3]

{'text': [['Lothon', 'Clarisse Julie', 'épouse', '51'],
  ['Bessé', 'André', 'idem', '1901', 'française'],
  ['Vigreux', 'Louise', 'ép']],
 'labels': [[-100,
   12,
   -100,
   5,
   -100,
   -100,
   -100,
   6,
   -100,
   0,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100],
  [-100,
   12,
   -100,
   5,
   6,
   -100,
   1,
   9,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100],
  [-100,
   12,
   -100,
   -100,
   5,
   6,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100,
   -100]],
 'input_ids': [[101,
   2843,
   8747,
   18856,
   23061,
   3366,
   7628,
   4958,
   15441,
   4868,
   102,
   0,
   0,

In [25]:
model=AutoModelForTokenClassification.from_pretrained('camembert-base', num_labels=14)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20059
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5015
    })
})

In [27]:
model(input_ids=tokenized_datasets['train']['input_ids'][0:1],attention_mask=tokenized_datasets['train']['attention_mask'][0:1],token_type_ids=tokenized_datasets['train']['token_type_ids'][0:1])

AttributeError: 'list' object has no attribute 'size'