In [None]:
!pip install spacy
!python -m spacy download fr_core_news_sm
!pip install pandas
!pip install scikit-learn

## Import libraries

In [5]:
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.training.example import Example
import os

## Load spaCy model and add NER labels

In [6]:
nlp = spacy.load('fr_core_news_sm')
ner = nlp.get_pipe('ner')

In [7]:
ner.add_label("departure")
ner.add_label("arrival")
ner.add_label("transit")
ner.add_label("departure_time")

1

## Load and prepare data

In [9]:
df = pd.read_json("../data/transport_french.json")

# Split the data into training and testing sets
df_data, df_data_test = train_test_split(df, test_size=0.2, random_state=42)
df.head()

Unnamed: 0,transit,departure_time,sentence,departure,arrival
0,Melun,mercredi,Je réserve un billet de St-Thégonnec à St-Andr...,St-Thégonnec,St-André-de-Cubzac
1,Mézin,samedi,Je traverse Mézin en me rendant de Courtenay à...,Courtenay,Ste-Eulalie-Carbon-Blanc
2,Aimargues,,Je cherche un itinéraire de Rougebarre à Clell...,Rougebarre,Clelles-Mens
3,,dimanche,Je me rends dimanche de Mézy à Lille-St-Sauveur.,Mézy,Lille-St-Sauveur
4,Feuquières-Fressenneville,,"Tournon-St-Martin est mon point de départ, je ...",Tournon-St-Martin,Turckheim


## Prepare training data

In [10]:
train_data = []

for index, row in df_data.iterrows():
    sentence = row["sentence"]
    departure = row["departure"]
    arrival = row["arrival"]
    transit = row["transit"]
    departure_time = row["departure_time"]

    annotations = {"entities": []}

    # Define annotations for named entities
    if departure:
        start_pos = sentence.find(departure)
        end_pos = start_pos + len(departure)
        annotations["entities"].append((start_pos, end_pos, "departure"))

    if arrival:
        start_pos = sentence.find(arrival)
        end_pos = start_pos + len(arrival)
        # Check for overlap with existing entities
        if not any(start <= start_pos < end or start < end_pos <= end for start, end, _ in annotations["entities"]):
            annotations["entities"].append((start_pos, end_pos, "arrival"))

    if transit:
        start_pos = sentence.find(transit)
        end_pos = start_pos + len(transit)
        if not any(start <= start_pos < end or start < end_pos <= end for start, end, _ in annotations["entities"]):
            annotations["entities"].append((start_pos, end_pos, "transit"))

    if departure_time:
        start_pos = sentence.find(departure_time)
        end_pos = start_pos + len(departure_time)
        annotations["entities"].append((start_pos, end_pos, "departure_time"))

    train_data.append((sentence, annotations))

train_data[0]

('Je me rends de Pierre-Buffière à La Joux.',
 {'entities': [(15, 30, 'departure'), (33, 40, 'arrival')]})

## Train model

In [11]:
for text, annotations in train_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    nlp.update([example], drop=0.5)

In [16]:
model_path = "../model/ner_transport_model"

if not os.path.exists(model_path):
    nlp.to_disk(model_path)

## Evaluation

In [22]:
my_nlp = spacy.load("../model/ner_transport_model")

correct_predictions = 0
total_predictions = 0

for index, row in df_data_test.iterrows():
    my_doc = my_nlp(row["sentence"])
    isPredictionTrue = True
    for ent in my_doc.ents:
        if ent.label_ == "departure" and ent.text != row["departure"]:
            isPredictionTrue = False
        elif ent.label_ == "arrival" and ent.text != row["arrival"]:
            isPredictionTrue = False
        elif ent.label_ == "transit" and ent.text != row["transit"]:
            isPredictionTrue = False
        elif ent.label_ == "departure_time" and ent.text != row["departure_time"]:
            isPredictionTrue = False
    if isPredictionTrue:
        correct_predictions += 1

    total_predictions += 1

accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.92


## Inference

In [25]:
my_nlp = spacy.load("../model/ner_transport_model")

text = "en passant par Lyon, je pars de Paris le 27 novembre à 9h pour Marseille"

# Process sample text with the trained model
doc = my_nlp(text)

# Display named entities in the processed text
for ent in doc.ents:
    print(f"Entity : {ent.text}, Label : {ent.label_}")

Entity : Lyon, Label : transit
Entity : Paris, Label : departure
Entity : Marseille, Label : arrival
Entity : 27 novembre à 9h, Label : departure_time
