## Install libraries

In [None]:
!pip install spacy
!python -m spacy download fr_core_news_sm
!pip install pandas
!pip install scikit-learn

## Import libraries

In [2]:
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.training.example import Example
import os

## Load spaCy model and add NER labels

In [3]:
nlp = spacy.load('fr_core_news_sm')
ner = nlp.get_pipe('ner')

In [4]:
label_names = ["departure", "arrival", "transit", "departure_day", "departure_date", "departure_time"]

for label_name in label_names:
    ner.add_label(label_name)

## Load and prepare data

In [5]:
df = pd.read_json("../data/transport_french.json")

# Split the data into training and testing sets
df_data, df_data_test = train_test_split(df, test_size=0.2, random_state=42)
df.head()

Unnamed: 0,transit,departure_day,departure_time,departure_date,sentence,departure,arrival
0,Beaugency,,,,Je mets le cap sur Tieffenbach-Struth en parta...,Paris-St-Lazare,Tieffenbach-Struth
1,,mardi,19h39,1 avril,Est-ce possible d'aller de Lafarge à Sannois l...,Lafarge,Sannois
2,Les Sables-d'Olonne,,,,Mon itinéraire prévu est de Trouville-Deauvill...,Trouville-Deauville,Argentan
3,Écouflant,,,,Je traverse Écouflant en me rendant de Gunsbac...,Gunsbach-Griesbach,Ugine
4,Le Toec,dimanche,23h37,25 mai,Je planifie de démarrer de Portets le dimanch...,Portets,Nurieux


## Prepare training data

In [6]:
train_data = []

for index, row in df_data.iterrows():
    sentence = row["sentence"].lower()

    entities = [
        ("departure", row["departure"]),
        ("arrival", row["arrival"]),
        ("transit", row["transit"]),
        ("departure_day", row["departure_day"]),
        ("departure_date", row["departure_date"]),
        ("departure_time", row["departure_time"]),
    ]

    annotations = {"entities": []}

    for entity_label, entity_text in entities:
        if entity_text:
            start_pos = sentence.find(entity_text.lower())
            end_pos = start_pos + len(entity_text)
            if not any(start <= start_pos < end or start < end_pos <= end for start, end, _ in annotations["entities"]):
                annotations["entities"].append((start_pos, end_pos, entity_label))

    train_data.append((sentence, annotations))

train_data[0]

('mon itinéraire prévu est de bellegarde à dreuil-lès-amiens, avec un arrêt à givors.',
 {'entities': [(28, 38, 'departure'),
   (41, 58, 'arrival'),
   (76, 82, 'transit')]})

## Train model

In [7]:
for text, annotations in train_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    nlp.update([example], drop=0.5)

In [8]:
model_path = "../model/ner_transport_model"

if not os.path.exists(model_path):
    nlp.to_disk(model_path)

## Evaluation

In [16]:
my_nlp = spacy.load("../model/ner_transport_model")

correct_predictions = 0
total_predictions = len(df_data_test)

for _, row in df_data_test.iterrows():
    my_doc = my_nlp(row["sentence"])

    is_prediction_true = all(
        ent is None or getattr(ent, 'text', '').lower() == str(row[ent.label_]).lower()
        for ent in my_doc.ents
    )

    correct_predictions += is_prediction_true

accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 94.40%


## Inference

In [18]:
my_nlp = spacy.load("../model/ner_transport_model")

text = "le mercredi 27 novembre, je pars de paris pour marseille à 9h, en passant par lyon"

# Process sample text with the trained model
doc = my_nlp(text)

# Display named entities in the processed text
for ent in doc.ents:
    print(f"Entity : {ent.text}, Label : {ent.label_}")

Entity : mercredi, Label : departure_day
Entity : 27 novembre, Label : departure_date
Entity : paris, Label : departure
Entity : marseille, Label : arrival
Entity : 9h, Label : departure_time
Entity : lyon, Label : transit
