In [None]:
!pip install spacy
!python -m spacy download fr_core_news_sm
!pip install pandas
!pip install scikit-learn

## Import libraries

In [2]:
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.training.example import Example
import os

## Load spaCy model and add NER labels

In [3]:
nlp = spacy.load('fr_core_news_sm')
ner = nlp.get_pipe('ner')

In [4]:
ner.add_label("departure")
ner.add_label("arrival")
ner.add_label("transit")
ner.add_label("departure_day")
ner.add_label("departure_date")
ner.add_label("departure_time")

1

## Load and prepare data

In [9]:
df = pd.read_json("../data/transport_french.json")

# Split the data into training and testing sets
df_data, df_data_test = train_test_split(df, test_size=0.2, random_state=42)
df.head()

Unnamed: 0,transit,departure_day,departure_time,departure_date,sentence,departure,arrival
0,Beaugency,,,,Je mets le cap sur Tieffenbach-Struth en parta...,Paris-St-Lazare,Tieffenbach-Struth
1,,mardi,19h 39,1 avril,Est-ce possible d'aller de Lafarge à Sannois l...,Lafarge,Sannois
2,Les Sables-d'Olonne,,,,Mon itinéraire prévu est de Trouville-Deauvill...,Trouville-Deauville,Argentan
3,Écouflant,,,,Je traverse Écouflant en me rendant de Gunsbac...,Gunsbach-Griesbach,Ugine
4,Le Toec,dimanche,23h 37,25 mai,Je planifie de démarrer de Portets le dimanch...,Portets,Nurieux


## Prepare training data

In [10]:
train_data = []

for index, row in df_data.iterrows():
    sentence = row["sentence"]
    departure = row["departure"]
    arrival = row["arrival"]
    transit = row["transit"]
    departure_day = row["departure_day"]
    departure_date = row["departure_date"]
    departure_time = row["departure_time"]

    annotations = {"entities": []}

    # Define annotations for named entities
    if departure:
        start_pos = sentence.find(departure)
        end_pos = start_pos + len(departure)
        annotations["entities"].append((start_pos, end_pos, "departure"))

    if arrival:
        start_pos = sentence.find(arrival)
        end_pos = start_pos + len(arrival)
        # Check for overlap with existing entities
        if not any(start <= start_pos < end or start < end_pos <= end for start, end, _ in annotations["entities"]):
            annotations["entities"].append((start_pos, end_pos, "arrival"))

    if transit:
        start_pos = sentence.find(transit)
        end_pos = start_pos + len(transit)
        if not any(start <= start_pos < end or start < end_pos <= end for start, end, _ in annotations["entities"]):
            annotations["entities"].append((start_pos, end_pos, "transit"))

    if departure_day:
        start_pos = sentence.find(departure_day)
        end_pos = start_pos + len(departure_day)
        annotations["entities"].append((start_pos, end_pos, "departure_day"))

    if departure_date:
        start_pos = sentence.find(departure_date)
        end_pos = start_pos + len(departure_date)
        annotations["entities"].append((start_pos, end_pos, "departure_date"))

    if departure_time:
        start_pos = sentence.find(departure_time)
        end_pos = start_pos + len(departure_time)
        annotations["entities"].append((start_pos, end_pos, "departure_time"))

    train_data.append((sentence, annotations))

train_data[0]

('Mon itinéraire prévu est de Bellegarde à Dreuil-lès-Amiens, avec un arrêt à Givors.',
 {'entities': [(28, 38, 'departure'),
   (41, 58, 'arrival'),
   (76, 82, 'transit')]})

## Train model

In [11]:
for text, annotations in train_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    nlp.update([example], drop=0.5)

In [12]:
model_path = "../model/ner_transport_model"

if not os.path.exists(model_path):
    nlp.to_disk(model_path)

## Evaluation

In [21]:
my_nlp = spacy.load("../model/ner_transport_model")

correct_predictions = 0
total_predictions = 0

for index, row in df_data_test.iterrows():
    my_doc = my_nlp(row["sentence"])
    isPredictionTrue = True
    for ent in my_doc.ents:
        if ent.label_ == "departure" and ent.text != row["departure"]:
            isPredictionTrue = False
        elif ent.label_ == "arrival" and ent.text != row["arrival"]:
            isPredictionTrue = False
        elif ent.label_ == "transit" and ent.text != row["transit"]:
            isPredictionTrue = False
        elif ent.label_ == "departure_day" and ent.text != row["departure_day"]:
            isPredictionTrue = False
        elif ent.label_ == "departure_date" and ent.text != row["departure_date"]:
            isPredictionTrue = False
        elif ent.label_ == "departure_time" and ent.text != row["departure_time"]:
            isPredictionTrue = False
    if isPredictionTrue:
        correct_predictions += 1

    total_predictions += 1

accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.94


## Inference

In [22]:
my_nlp = spacy.load("../model/ner_transport_model")

text = "le mercredi 27 novembre, je pars de Paris pour Marseille à 9h, en passant par Lyon, "

# Process sample text with the trained model
doc = my_nlp(text)

# Display named entities in the processed text
for ent in doc.ents:
    print(f"Entity : {ent.text}, Label : {ent.label_}")

Entity : mercredi, Label : departure_day
Entity : 27 novembre, Label : departure_date
Entity : Paris, Label : departure
Entity : Marseille, Label : arrival
Entity : 9h,, Label : departure_time
Entity : Lyon, Label : transit
