In [1]:
import os
import numpy as np
import pandas as pd
import sklearn
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

def get_text_file(file_path):
    """Read the file at `file_path` as UTF-8 text and return its whole content.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The complete file content as a single string.
    """
    with open(file_path, mode='r', encoding='utf8') as handle:
        content = handle.read()
    return content
    
def array_sentences(paragraph):
    """Split `paragraph` into one whitespace-stripped string per line.

    Every newline-terminated segment yields one entry (blank lines yield
    an empty string). Text after the last newline is also returned when it
    contains something other than whitespace, so a file that does not end
    with a newline no longer loses its final sentence.

    Args:
        paragraph: Text whose lines are separated by '\n'.

    Returns:
        A list of stripped line strings, in their original order.
    """
    # split() always returns at least one element, so the unpacking is safe;
    # `remainder` is whatever follows the final '\n' (or the whole text when
    # there is no newline at all).
    *terminated, remainder = paragraph.split('\n')
    sentences_clean = [segment.strip() for segment in terminated]

    # Keep the unterminated tail only when it carries real text, so input
    # that ends with a newline produces exactly the same list as before.
    remainder = remainder.strip()
    if remainder:
        sentences_clean.append(remainder)

    return sentences_clean

# Load the input sentences, one per line of 'text.txt'.
# The file is read as plain text instead of with `pd.read_fwf`: that reader
# targets fixed-width tables and, by default, infers column boundaries from
# whitespace that lines up across rows, which can split sentences into
# several columns. Blank lines are skipped and each line is stripped.
# The result is a single-column DataFrame labelled 0, so `file_content[0]`
# yields the sentences.
file_content = pd.DataFrame({
    0: [
        line.strip()
        for line in get_text_file('text.txt').splitlines()
        if line.strip()
    ]
})
print(file_content)

In [2]:

# Repository id of the token-classification checkpoint to load
model_name = "Jean-Baptiste/camembert-ner-with-dates"

# 'do_lower_case' is intended to turn all the text into lowercase.
# NOTE(review): `is_split_into_words` is normally given when calling the
# tokenizer, not when loading it, and `do_lower_case` may be ignored by this
# tokenizer class - confirm that these two options have any effect here.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True, is_split_into_words=True)

model = AutoModelForTokenClassification.from_pretrained(model_name)

# Put the model in evaluation mode; as the last expression of the cell,
# the returned module is also displayed.
model.eval()


CamembertForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNo

In [3]:
# Pipelines

# Named-entity recognition using the model and tokenizer loaded above
ner = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# English -> French translation
en_to_fr = pipeline(
    "translation_en_to_fr",
    model="Helsinki-NLP/opus-mt-en-fr",
)

# Zero-shot classification
classifier = pipeline(
    "zero-shot-classification",
    model="BaptisteDoyen/camembert-base-xnli",
)

In [6]:
# Column 0 of the loaded frame holds the input sentences
texts = file_content[0]

# Run the NER pipeline on each sentence and print the sentence followed by
# every entity found, with an empty line between sentences.
for text in texts:
    print(text)
    nes = ner(text)

    for ne in nes:
        print(ne)

    print("")


Je souhaite aller à Bordeaux le 10 novembre.
{'entity_group': 'LOC', 'score': 0.98633105, 'word': 'Bordeaux', 'start': 19, 'end': 28}
{'entity_group': 'DATE', 'score': 0.99520713, 'word': 'le 10 novembre', 'start': 28, 'end': 43}

Je cherche des trains pour aller à Perpignan le 14 Novembre à partir de 18h.
{'entity_group': 'LOC', 'score': 0.99137115, 'word': 'Perpignan', 'start': 34, 'end': 44}
{'entity_group': 'DATE', 'score': 0.9951699, 'word': 'le 14 Novembre', 'start': 44, 'end': 59}
{'entity_group': 'DATE', 'score': 0.9930231, 'word': '18h', 'start': 71, 'end': 75}

J'ai besoin d'arriver à Marseille le 25/11/2021 avant 12h.
{'entity_group': 'LOC', 'score': 0.9871303, 'word': 'Marseille', 'start': 23, 'end': 33}
{'entity_group': 'DATE', 'score': 0.9949444, 'word': 'le 25/11/2021', 'start': 33, 'end': 47}
{'entity_group': 'DATE', 'score': 0.9860611, 'word': '12h', 'start': 53, 'end': 57}

Je souhaite trouver des trains pour aller à Cannes le 31/11/2021 en passant par Marseille.
{'en