# Preprocessing of the data

In [3]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfTransformer
import json

# Chargement des donn√©es d'entra√Ænement
with open("./data/NER-TRAINING.jsonlines", 'r') as f:
    training_data_raw = [json.loads(l) for l in list(f)]

training_data = pd.DataFrame(training_data_raw).dropna()

# Chargement des donn√©es de validation
with open("./data/NER-VALIDATION.jsonlines", 'r') as f:
    validation_data_raw = [json.loads(l) for l in list(f)]

validation_data = pd.DataFrame(validation_data_raw).dropna()

# Affichage des premi√®res lignes des donn√©es d'entra√Ænement
print(training_data.head())

# S√©paration des tokens et des √©tiquettes
X_train = training_data.tokens
y_train = training_data.ner_tags
print(X_train.shape, y_train.shape)

   unique_id                                             tokens  \
0       6506  [Later, in, May, of, 2010, within, a, Pakistan...   
1       5221  [In, 2008, ,, Tom, Donahue, ,, a, senior, Cent...   
2       1923  [On, the, spectrum, of, state, responsibility,...   
3       5905  [If, we, observe, the, network, communications...   
4       3114  [The, regime's, CSTIA, relies, on, Russia, as,...   

                                            ner_tags  
0  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
1  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
3  [O, O, O, O, O, O, O, O, O, O, B-Entity, O, B-...  
4      [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]  
(4876,) (4876,)


In [4]:
unique_tags = set(tag for tags_list in training_data["ner_tags"] for tag in tags_list)

# Creation du dictionnaire
tag_to_idx = {index: tag for index, tag in enumerate(unique_tags)}
idx_to_tag = {tag: index for index, tag in enumerate(unique_tags)}
print(idx_to_tag)

training_data["ner_tags_numeric"] = training_data["ner_tags"].apply(lambda tags_list: [idx_to_tag[tag] for tag in tags_list])
validation_data["ner_tags_numeric"] = validation_data["ner_tags"].apply(lambda tags_list: [idx_to_tag[tag] for tag in tags_list])
# Afficher le r√©sultat
print(training_data.head())

{'B-Modifier': 0, 'B-Entity': 1, 'I-Action': 2, 'I-Modifier': 3, 'O': 4, 'I-Entity': 5, 'B-Action': 6}
   unique_id                                             tokens  \
0       6506  [Later, in, May, of, 2010, within, a, Pakistan...   
1       5221  [In, 2008, ,, Tom, Donahue, ,, a, senior, Cent...   
2       1923  [On, the, spectrum, of, state, responsibility,...   
3       5905  [If, we, observe, the, network, communications...   
4       3114  [The, regime's, CSTIA, relies, on, Russia, as,...   

                                            ner_tags  \
0  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...   
1  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...   
2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...   
3  [O, O, O, O, O, O, O, O, O, O, B-Entity, O, B-...   
4      [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]   

                                    ner_tags_numeric  
0  [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...  
1  [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4

In [5]:
from transformers import AutoTokenizer

# Chargement du mod√®le et du tokenizer
bert_model = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_model)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def align_labels_with_tokens(labels, word_ids):
    """
    Aligner les √©tiquettes avec les tokens apr√®s tokenisation.
    """
    aligned_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            aligned_labels.append(label)
        elif word_id is None:
            aligned_labels.append(-100)
        else:
            label = labels[word_id]
            if label % 2 == 1:
                label = label + 1
            aligned_labels.append(label)

    return aligned_labels

In [7]:
def tokenize_and_align_labels(row, tokenizer):
    """
    Tokenise les tokens d'une ligne et aligne les √©tiquettes NER.
    """
    # Tokenisation
    tokenized_input = tokenizer(
        row['tokens'], truncation=True, is_split_into_words=True, padding=True
    )

    # Alignement des √©tiquettes avec les sous-tokens
    word_ids = tokenized_input.word_ids()
    labels = row['ner_tags']
    numeric_labels = row['ner_tags_numeric']
    
    aligned_labels = align_labels_with_tokens(numeric_labels, word_ids)

    # Ajouter les labels align√©s
    tokenized_input['labels'] = aligned_labels

    return tokenized_input

In [8]:
# Application de la fonction sur chaque ligne des donn√©es d'entra√Ænement et de validation
training_data['tokenized'] = training_data.apply(
    lambda row: tokenize_and_align_labels(row, tokenizer), axis=1
)
validation_data['tokenized'] = validation_data.apply(
    lambda row: tokenize_and_align_labels(row, tokenizer), axis=1
)

In [9]:
print(training_data)
print(validation_data['tokenized'])

      unique_id                                             tokens  \
0          6506  [Later, in, May, of, 2010, within, a, Pakistan...   
1          5221  [In, 2008, ,, Tom, Donahue, ,, a, senior, Cent...   
2          1923  [On, the, spectrum, of, state, responsibility,...   
3          5905  [If, we, observe, the, network, communications...   
4          3114  [The, regime's, CSTIA, relies, on, Russia, as,...   
...         ...                                                ...   
4871        654  [For, example, :, The, malware, beacons, locat...   
4872       1739  [The, analysed, code, suggests, that, even, fi...   
4873        892  [The, second, component, of, the, entry, point...   
4874       4274  [APT28, made, at, least, two, specific, attemp...   
4875         48  [Sometimes, ,, both, backdoors, are, run, in, ...   

                                               ner_tags  \
0     [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...   
1     [O, O, O, O, O, O, O, O, O, O, O, O

In [10]:
# Pr√©paration des donn√©es pour l'entra√Ænement
train_data_ready = training_data.copy()
train_data_ready["input_ids"] = train_data_ready["tokenized"].apply(lambda x: x["input_ids"])
train_data_ready["attention_mask"] = train_data_ready["tokenized"].apply(lambda x: x["attention_mask"])
train_data_ready["labels"] = train_data_ready["tokenized"].apply(lambda x: x["labels"])
train_data_ready = train_data_ready.drop(columns=["tokenized", "tokens", "ner_tags", "ner_tags_numeric"])

valid_data_ready = validation_data.copy()
valid_data_ready["input_ids"] = valid_data_ready["tokenized"].apply(lambda x: x["input_ids"])
valid_data_ready["attention_mask"] = valid_data_ready["tokenized"].apply(lambda x: x["attention_mask"])
valid_data_ready["labels"] = valid_data_ready["tokenized"].apply(lambda x: x["labels"])
valid_data_ready = valid_data_ready.drop(columns=["tokenized", "tokens", "ner_tags", "ner_tags_numeric"])

from transformers import DataCollatorForTokenClassification

In [11]:
from datasets import Dataset

# Conversion des DataFrames en datasets Hugging Face
train_dataset = Dataset.from_pandas(train_data_ready)
valid_dataset = Dataset.from_pandas(valid_data_ready)

# Fonction pour paddder et tronquer les s√©quences
def pad_and_truncate(examples, max_length=50):
    padded_examples = {
        "input_ids": [],
        "attention_mask": [],
        "labels": []
    }
    for input_ids, attention_mask, labels in zip(examples["input_ids"], examples["attention_mask"], examples["labels"]):
        input_ids = input_ids[:max_length]
        attention_mask = attention_mask[:max_length]
        labels = labels[:max_length]

        input_ids += [0] * (max_length - len(input_ids))
        attention_mask += [0] * (max_length - len(attention_mask))
        labels += [-100] * (max_length - len(labels))

        padded_examples["input_ids"].append(input_ids)
        padded_examples["attention_mask"].append(attention_mask)
        padded_examples["labels"].append(labels)
    
    return padded_examples

# Appliquer la fonction de padding et de troncature
train_dataset = pad_and_truncate(train_dataset)
valid_dataset = pad_and_truncate(valid_dataset)

# Reconvertir en dataset Hugging Face
train_dataset = Dataset.from_dict(train_dataset)
valid_dataset = Dataset.from_dict(valid_dataset)

In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)
batch = data_collator([train_dataset[i] for i in range(2)])

In [13]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4876
})

In [14]:
valid_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1044
})

In [15]:
import evaluate

metric = evaluate.load('seqeval')
label_names = [key for key in tag_to_idx.values()]
print(label_names)

labels = training_data['ner_tags_numeric'][1542]
labels = [label_names[i] for i in labels]
print(labels)

['B-Modifier', 'B-Entity', 'I-Action', 'I-Modifier', 'O', 'I-Entity', 'B-Action']
['B-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'I-Entity', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Action', 'O', 'B-Entity', 'I-Entity', 'O']


In [16]:
predictions = labels.copy()
predictions[2] = "I-Entity"

metric.compute(predictions=[predictions], references=[labels])

{'Action': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'Entity': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(2)},
 'overall_precision': np.float64(1.0),
 'overall_recall': np.float64(1.0),
 'overall_f1': np.float64(1.0),
 'overall_accuracy': 1.0}

# Creation of the model

In [17]:
import numpy as np
from sklearn.metrics import classification_report


def compute_metrics(eval_preds):
  logits, labels = eval_preds

  predictions = np.argmax(logits, axis=-1)

  true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

  true_predictions = [[label_names[p] for p,l in zip(prediction, label) if l!=-100]
                      for prediction, label in zip(predictions, labels)]

  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  return {"precision": all_metrics['overall_precision'],
          "recall": all_metrics['overall_recall'],
          "f1": all_metrics['overall_f1'],
          "accuracy": all_metrics['overall_accuracy']}

In [18]:
from transformers import AutoModelForTokenClassification, TrainingArguments

# Chargement du mod√®le pour la classification de tokens
model = AutoModelForTokenClassification.from_pretrained(bert_model, id2label=idx_to_tag, label2id=tag_to_idx)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Configuration des param√®tres d'entra√Ænement
training_args = TrainingArguments(
    output_dir="./bert-base-uncased",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=6,
    weight_decay=0.01
)






In [20]:
from transformers import Trainer

# Cr√©ation du Trainer et d√©marrage de l'entra√Ænement
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [21]:
print(valid_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1044
})


In [22]:
print(train_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4876
})


# Training of the model

In [23]:
trainer.train()

 14%|‚ñà‚ñé        | 500/3660 [22:53<2:07:46,  2.43s/it]

{'loss': 0.5689, 'grad_norm': 14.196165084838867, 'learning_rate': 1.726775956284153e-05, 'epoch': 0.82}


                                                    
 17%|‚ñà‚ñã        | 610/3660 [28:49<1:58:24,  2.33s/it]

{'eval_loss': 0.36765238642692566, 'eval_precision': 0.4633870511733107, 'eval_recall': 0.5393221454425798, 'eval_f1': 0.49847931873479323, 'eval_accuracy': 0.854612622565944, 'eval_runtime': 79.2822, 'eval_samples_per_second': 13.168, 'eval_steps_per_second': 1.652, 'epoch': 1.0}


 27%|‚ñà‚ñà‚ñã       | 1000/3660 [44:25<1:37:04,  2.19s/it]

{'loss': 0.3521, 'grad_norm': 11.602906227111816, 'learning_rate': 1.4535519125683062e-05, 'epoch': 1.64}


                                                     
 33%|‚ñà‚ñà‚ñà‚ñé      | 1220/3660 [53:27<1:20:29,  1.98s/it]

{'eval_loss': 0.3149525821208954, 'eval_precision': 0.522237380627558, 'eval_recall': 0.629812438302073, 'eval_f1': 0.5710023866348447, 'eval_accuracy': 0.875431570225107, 'eval_runtime': 66.6631, 'eval_samples_per_second': 15.661, 'eval_steps_per_second': 1.965, 'epoch': 2.0}


 41%|‚ñà‚ñà‚ñà‚ñà      | 1500/3660 [1:03:30<1:15:57,  2.11s/it]

{'loss': 0.2382, 'grad_norm': 8.841859817504883, 'learning_rate': 1.1803278688524591e-05, 'epoch': 2.46}


                                                       
 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 1830/3660 [1:16:21<58:31,  1.92s/it]

{'eval_loss': 0.3970538377761841, 'eval_precision': 0.5945062132112492, 'eval_recall': 0.5982230997038499, 'eval_f1': 0.5963588650155814, 'eval_accuracy': 0.8841320259632648, 'eval_runtime': 66.7145, 'eval_samples_per_second': 15.649, 'eval_steps_per_second': 1.964, 'epoch': 3.0}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 2000/3660 [1:22:30<58:57,  2.13s/it]   

{'loss': 0.166, 'grad_norm': 0.010062937624752522, 'learning_rate': 9.071038251366122e-06, 'epoch': 3.28}


                                                     
 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 2440/3660 [1:39:13<38:57,  1.92s/it]

{'eval_loss': 0.4264163374900818, 'eval_precision': 0.6299804049640758, 'eval_recall': 0.6347482724580454, 'eval_f1': 0.632355351581708, 'eval_accuracy': 0.8893453942825577, 'eval_runtime': 66.7518, 'eval_samples_per_second': 15.64, 'eval_steps_per_second': 1.962, 'epoch': 4.0}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 2500/3660 [1:41:26<40:29,  2.09s/it]  

{'loss': 0.1281, 'grad_norm': 2.2324981689453125, 'learning_rate': 6.338797814207651e-06, 'epoch': 4.1}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 3000/3660 [1:59:08<23:14,  2.11s/it]

{'loss': 0.0844, 'grad_norm': 3.985865354537964, 'learning_rate': 3.6065573770491806e-06, 'epoch': 4.92}


                                                     
 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 3050/3660 [2:01:59<19:25,  1.91s/it]

{'eval_loss': 0.45357272028923035, 'eval_precision': 0.6048506359065365, 'eval_recall': 0.6729187232642316, 'eval_f1': 0.6370716510903426, 'eval_accuracy': 0.8887239331584036, 'eval_runtime': 66.5112, 'eval_samples_per_second': 15.697, 'eval_steps_per_second': 1.97, 'epoch': 5.0}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 3500/3660 [2:18:02<05:38,  2.12s/it]  

{'loss': 0.0666, 'grad_norm': 0.24097102880477905, 'learning_rate': 8.743169398907105e-07, 'epoch': 5.74}


                                                     
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3660/3660 [2:24:53<00:00,  1.92s/it]

{'eval_loss': 0.4937201738357544, 'eval_precision': 0.6195046439628483, 'eval_recall': 0.6584402764067128, 'eval_f1': 0.638379326846387, 'eval_accuracy': 0.8904847396768402, 'eval_runtime': 69.3918, 'eval_samples_per_second': 15.045, 'eval_steps_per_second': 1.888, 'epoch': 6.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3660/3660 [2:24:55<00:00,  2.38s/it]

{'train_runtime': 8695.4791, 'train_samples_per_second': 3.365, 'train_steps_per_second': 0.421, 'train_loss': 0.22160062555406915, 'epoch': 6.0}





TrainOutput(global_step=3660, training_loss=0.22160062555406915, metrics={'train_runtime': 8695.4791, 'train_samples_per_second': 3.365, 'train_steps_per_second': 0.421, 'total_flos': 746566746314400.0, 'train_loss': 0.22160062555406915, 'epoch': 6.0})

In [26]:
# Sauvegarder le mod√®le et le tokenizer
trainer.save_model("./saved_model_bert")
tokenizer.save_pretrained("./saved_model_bert")
trainer.save_state()
trainer.state.save_to_json("./saved_model_bert/trainer_state.json")

# Sauvegarder les arguments d'entra√Ænement
with open("./saved_model_bert/training_args.json", "w") as f:
	f.write(training_args.to_json_string())

# Generation of predictions

In [18]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import json

# Charger le mod√®le et le tokenizer enregistr√©s
model = AutoModelForTokenClassification.from_pretrained("./saved_model_bert")
tokenizer = AutoTokenizer.from_pretrained("./saved_model_bert")

# Charger les donn√©es de test
with open('./data/NER-TESTING.jsonlines', 'r') as f:
    ner_testing = [json.loads(line) for line in f]

# V√©rifier la structure des donn√©es de test
for entry in ner_testing:
    if 'tokens' not in entry:
        raise KeyError("Chaque entr√©e doit contenir la cl√© 'tokens'.")

# Pr√©parer les donn√©es de test
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    tokenized_inputs["word_ids"] = [tokenized_inputs.word_ids(i) for i in range(len(tokenized_inputs["input_ids"]))]
    return tokenized_inputs

# Convertir les donn√©es de test en Dataset
test_dataset = Dataset.from_pandas(pd.DataFrame(ner_testing))
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# D√©finir les arguments d'entra√Ænement
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=32,
)

# Cr√©er le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
)

# Faire des pr√©dictions
predictions, _, _ = trainer.predict(tokenized_test_dataset)

# Convertir les logits en √©tiquettes pr√©dites
predicted_labels = torch.argmax(torch.tensor(predictions), dim=-1).tolist()

# Aligner les √©tiquettes avec les tokens
def align_labels_with_tokens(predicted_labels, word_ids):
    aligned_labels = []
    previous_word_id = None

    for label, word_id in zip(predicted_labels, word_ids):
        if word_id is None or word_id == previous_word_id:
            # Ignore special tokens or sub-tokens
            continue
        aligned_labels.append(label)
        previous_word_id = word_id

    return aligned_labels

# Obtenir le dictionnaire id2label
id2label = model.config.id2label

# Aligner les √©tiquettes pr√©dites avec les tokens et remplacer les nombres par les labels
results = []
for i, entry in enumerate(ner_testing):
    word_ids = tokenized_test_dataset[i]['word_ids']
    aligned_labels = align_labels_with_tokens(predicted_labels[i], word_ids)
    labeled_aligned_labels = [id2label[label] for label in aligned_labels]
    results.append({
        "unique_id": entry["unique_id"],
        "tokens": entry["tokens"],
        "ner_tags": labeled_aligned_labels
    })

# Sauvegarder les r√©sultats dans un fichier JSONLines
with open('ner_results_labeled.jsonlines', 'w') as f:
    for result in results:
        f.write(json.dumps(result) + '\n')

print("Le fichier JSONLines avec les ner_tags a √©t√© sauvegard√© avec succ√®s.")

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1046/1046 [00:00<00:00, 7876.15 examples/s]
  trainer = Trainer(
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 33/33 [00:37<00:00,  1.14s/it]


Le fichier JSONLines avec les ner_tags a √©t√© sauvegard√© avec succ√®s.


In [5]:
import json

# Charger les r√©sultats de pr√©diction
predictions = []
with open('./ner_results_labeled.jsonlines', 'r') as f:
    for line in f:
        try:
            predictions.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Erreur de d√©codage JSON √† la ligne: {line}")
            continue

# V√©rifier le nombre de tokens par rapport au nombre de labels et compter les labels diff√©rents de "O"
mismatched_entries = []
label_counts = []

for entry in predictions:
    num_tokens = len(entry['tokens'])
    num_labels = len(entry['ner_tags'])
    
    if num_tokens != num_labels:
        mismatched_entries.append({
            "unique_id": entry["unique_id"],
            "num_tokens": num_tokens,
            "num_labels": num_labels
        })
    
    # Compter les labels diff√©rents de "O"
    num_non_o_labels = sum(1 for label in entry['ner_tags'] if label != "O")
    label_counts.append({
        "unique_id": entry["unique_id"],
        "num_non_o_labels": num_non_o_labels
    })

# Afficher les r√©sultats
if mismatched_entries:
    print("Les entr√©es suivantes ont un nombre de tokens diff√©rent du nombre de labels :")
    for entry in mismatched_entries:
        print(f"Unique ID: {entry['unique_id']}, Tokens: {entry['num_tokens']}, Labels: {entry['num_labels']}")
else:
    print("Tous les r√©sultats ont un nombre de tokens √©gal au nombre de labels.")

# Afficher le nombre de labels diff√©rents de 'O'
print("\nNombre de labels diff√©rents de 'O' pour chaque entr√©e :")
for entry in label_counts:
    print(f"Unique ID: {entry['unique_id']}, Non-O Labels: {entry['num_non_o_labels']}")

Tous les r√©sultats ont un nombre de tokens √©gal au nombre de labels.

Nombre de labels diff√©rents de 'O' pour chaque entr√©e :
Unique ID: 1357, Non-O Labels: 0
Unique ID: 3016, Non-O Labels: 0
Unique ID: 6936, Non-O Labels: 23
Unique ID: 4538, Non-O Labels: 11
Unique ID: 4327, Non-O Labels: 0
Unique ID: 6622, Non-O Labels: 0
Unique ID: 3090, Non-O Labels: 0
Unique ID: 3299, Non-O Labels: 0
Unique ID: 5323, Non-O Labels: 0
Unique ID: 3431, Non-O Labels: 0
Unique ID: 4108, Non-O Labels: 0
Unique ID: 3784, Non-O Labels: 0
Unique ID: 1452, Non-O Labels: 12
Unique ID: 5593, Non-O Labels: 0
Unique ID: 5769, Non-O Labels: 0
Unique ID: 881, Non-O Labels: 0
Unique ID: 2360, Non-O Labels: 0
Unique ID: 322, Non-O Labels: 0
Unique ID: 347, Non-O Labels: 5
Unique ID: 6494, Non-O Labels: 0
Unique ID: 1465, Non-O Labels: 0
Unique ID: 1937, Non-O Labels: 0
Unique ID: 464, Non-O Labels: 12
Unique ID: 6561, Non-O Labels: 0
Unique ID: 4915, Non-O Labels: 0
Unique ID: 5582, Non-O Labels: 0
Unique ID: 1

In [6]:
# Afficher les premi√®res entr√©es
print(predictions[:5])

[{'unique_id': 1357, 'tokens': ['Stage', '3', 'exports', 'hundreds', 'of', 'methods', ',', 'organized', 'into', '12', 'different', 'major', 'groups', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'unique_id': 3016, 'tokens': ['These', 'campaigns', 'leverage', 'the', 'phenomenon', 'of', 'viral', ',', 'unverified', 'news', 'stories', 'that', 'tend', 'to', 'rapidly', 'propagate', 'via', 'social', 'media', ',', 'mobile', 'text', 'messaging', ',', 'and', 'other', 'electronic', 'communications', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'unique_id': 6936, 'tokens': ['Interestingly', ',', 'most', 'of', 'the', 'affected', 'victims', 'have', 'another', 'thing', 'in', 'common', '‚Äì', 'a', 'number', 'of', 'other', 'RATs', ',', 'file', 'stealing', 'trojans', 'or', 'keyloggers', 'were', 'detected', 'on', 'their', 'systems', 'on',

In [None]:
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from seqeval.scheme import IOB2
import numpy as np
import json

def pretty_print_dict(d, indent):
    res = ""
    for k, v in d.items():
        res += "\t"*indent + str(k) + "\n"
        if isinstance(v, dict):
            res += pretty_print_dict(v, indent+1)
        else:
            res += "\t"*(indent+1) + str(v) + "\n"
    print(res)
    return res
    
def compute_seqeval_jsonl(references_jsonl, predictions_jsonl, ref_col='ner_tags', pred_col='pred_ner_tags'):
    '''
    Computes the seqeval scores between two datasets loaded from jsonl (list of dicts with same keys).
    Sorts the datasets by 'unique_id' and verifies that the tokens match.
    '''
    # extract the tags and reverse the dict
    ref_dict = {k:[e[k] for e in references_jsonl] for k in references_jsonl[0].keys()}
    pred_dict = {k:[e[k] for e in predictions_jsonl] for k in predictions_jsonl[0].keys()}
        
    # sort by unique_id
    ref_idx = np.argsort(ref_dict['unique_id'])
    pred_idx = np.argsort(pred_dict['unique_id'])
    ref_ner_tags = np.array(ref_dict[ref_col], dtype=object)[ref_idx]
    pred_ner_tags = np.array(pred_dict[pred_col], dtype=object)[pred_idx]
    ref_tokens = np.array(ref_dict['tokens'], dtype=object)[ref_idx]
    pred_tokens = np.array(pred_dict['tokens'], dtype=object)[pred_idx]

    # check that tokens match
    #assert((ref_tokens==pred_tokens).all())
    
    
    # get report
    report = classification_report(y_true=ref_ner_tags, y_pred=pred_ner_tags, 
                                   scheme=IOB2, output_dict=True,
                                  )
    
    # extract values we care about
    report.pop("macro avg")
    report.pop("weighted avg")
    overall_score = report.pop("micro avg")

    seqeval_results = {
        type_name: {
            "precision": score["precision"],
            "recall": score["recall"],
            "f1": score["f1-score"],
            "suport": score["support"],
        }
        for type_name, score in report.items()
    }
    seqeval_results["overall_precision"] = overall_score["precision"]
    seqeval_results["overall_recall"] = overall_score["recall"]
    seqeval_results["overall_f1"] = overall_score["f1-score"]
    seqeval_results["overall_accuracy"] = accuracy_score(y_true=ref_ner_tags, y_pred=pred_ner_tags)    
    
    return(seqeval_results)


if __name__ == '__main__':

    # Pour les √©tudiants : indiquer le chemin vers le fichier NER-VALIDATION
    with open("./data/NER-VALIDATION.jsonlines", 'r') as f:
        references_jsonl = [json.loads(l) for l in list(f)]

    # Pour les √©tudiants : indiquer ici le chemin vers votre fichier de pr√©diction sur le jeu de validation
    with open("./ner_results_labeled.jsonlines", 'r') as f:
        pred_jsonl = [json.loads(l) for l in list(f)]


    res = compute_seqeval_jsonl(references_jsonl, pred_jsonl, ref_col = 'ner_tags', pred_col='ner_tags')
    pretty_print_dict(res, 0)


KeyError: 'ner_tags'