In [31]:
import pandas as pd

In [32]:
# Load the semicolon-separated dataset into a DataFrame.
data = pd.read_csv("dataset.csv", sep=";")

In [33]:
# Training examples as (text, {"entities": [(start, end, label), ...]}) tuples;
# filled from the dataset rows and consumed by the training loop.
train_data = []

In [None]:
# Values of the `origin` column that mark rows to skip (not usable as trip examples).
_NON_TRIP_LABELS = ("NOT_FRENCH", "NOT_TRIP")


def _find_span(lowered_text, name):
    """Locate ``name`` in ``lowered_text`` case-insensitively.

    Returns the ``(start, end)`` character span of the first occurrence, or
    ``None`` when ``name`` is not a string (e.g. NaN from an empty CSV field),
    is blank, or does not occur in the text.
    """
    if not isinstance(name, str):
        return None
    needle = name.strip().lower()
    if not needle:
        # An empty needle would "match" at offset 0 and yield a zero-length entity.
        return None
    start = lowered_text.find(needle)
    if start < 0:
        return None
    return start, start + len(needle)


def _build_example(text, origin, destination, detours):
    """Build one ``(text, {"entities": [...]})`` training tuple from a dataset row.

    Returns ``None`` when the row cannot be used: it is flagged as
    NOT_FRENCH / NOT_TRIP, a field is missing, the origin or destination is
    not found in the text, or the origin does not end before the destination
    starts.
    """
    if not isinstance(text, str) or not isinstance(origin, str):
        return None
    if origin in _NON_TRIP_LABELS:
        return None

    lowered = text.lower()
    if len(lowered) != len(text):
        # Offsets are computed on the lowercased text but applied to the
        # original; skip the rare strings whose length changes when lowercased.
        return None

    origin_span = _find_span(lowered, origin)
    destination_span = _find_span(lowered, destination)
    if origin_span is None or destination_span is None:
        return None
    if origin_span[1] > destination_span[0]:
        # Only keep sentences where the origin is mentioned before the destination.
        return None

    taken = [origin_span, destination_span]
    entities = [
        (*origin_span, "ORIGIN"),
        (*destination_span, "DESTINATION"),
    ]

    # `detours` is a comma-separated string, or NaN when the row has none.
    if isinstance(detours, str):
        for detour in detours.split(","):
            span = _find_span(lowered, detour)
            if span is None:
                continue
            # Entity spans must not overlap each other.
            if any(span[0] < end and start < span[1] for start, end in taken):
                continue
            taken.append(span)
            entities.append((*span, "DETOUR"))

    return text, {"entities": entities}


# Rebuilt from scratch on every run so that re-executing the cell does not
# duplicate examples.
train_data = [
    example
    for example in map(
        _build_example,
        data['text'],
        data['origin'],
        data['destination'],
        data['detours'],
    )
    if example is not None
]

In [2]:
import spacy
from spacy.training.example import Example
import random



In [36]:
# Create a blank spaCy pipeline for the "fr" language code; the NER component
# is added to it separately.
nlp = spacy.blank("fr")

In [37]:
# Reuse the pipeline's NER component when it already exists; otherwise append a new one.
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)

In [38]:
# Register the three entity labels used in the training annotations.
ner.add_label("ORIGIN")
ner.add_label("DESTINATION")
# Last expression of the cell: its return value is what the notebook displays.
ner.add_label("DETOUR")

1

In [39]:
# Initialize the pipeline's weights and get the optimizer passed to nlp.update.
# `begin_training` is the deprecated pre-v3 name (it emits a warning and
# forwards to `initialize`); this notebook already uses the spaCy v3 API
# (`spacy.training.example.Example`), so call `initialize` directly.
optimizer = nlp.initialize()

In [40]:
# Train for 60 passes over the data, updating the model one example at a time.
for itn in range(60):
    losses = {}  # filled in by nlp.update over the current pass
    random.shuffle(train_data)  # in-place shuffle: new example order on each pass
    for sample_text, gold in train_data:
        doc = nlp.make_doc(sample_text)
        nlp.update(
            [Example.from_dict(doc, gold)],
            losses=losses,
            drop=0.5,
            sgd=optimizer,
        )
    print(f"Itération {itn} - Losses: {losses}")




Itération 0 - Losses: {'ner': 11969.149000367523}
Itération 1 - Losses: {'ner': 5189.011326624429}
Itération 2 - Losses: {'ner': 4475.519438747725}
Itération 3 - Losses: {'ner': 3860.095450588652}
Itération 4 - Losses: {'ner': 3476.853364620073}
Itération 5 - Losses: {'ner': 3072.7157165446274}
Itération 6 - Losses: {'ner': 2914.55863748885}
Itération 7 - Losses: {'ner': 2646.7531106838887}
Itération 8 - Losses: {'ner': 2564.025007900898}
Itération 9 - Losses: {'ner': 2525.6928007663914}
Itération 10 - Losses: {'ner': 2378.3990443473813}
Itération 11 - Losses: {'ner': 2207.2721563137716}
Itération 12 - Losses: {'ner': 2121.6514079720614}
Itération 13 - Losses: {'ner': 2025.8923942965098}
Itération 14 - Losses: {'ner': 2000.3460953482615}
Itération 15 - Losses: {'ner': 1939.918488698682}
Itération 16 - Losses: {'ner': 1936.5745040469978}
Itération 17 - Losses: {'ner': 1811.6058338802627}
Itération 18 - Losses: {'ner': 1917.677065441407}
Itération 19 - Losses: {'ner': 1803.4393595372612}

In [None]:
# Iteration counts tried so far (argument of range): 10, 20

In [18]:
# Save the pipeline to the "model_test" directory (path relative to the working directory).
nlp.to_disk("model_test")

In [3]:
# Load the pipeline from the "model_test" directory and rebind `nlp` to it;
# test_trip_request reads this global.
nlp = spacy.load('./model_test')

In [4]:
from langdetect import detect, LangDetectException

In [5]:
def is_french(text):
    """Return True when langdetect identifies ``text`` as French.

    Non-string or blank input is reported as not French without calling the
    detector. A detection failure (``LangDetectException``) also yields False.
    """
    if not isinstance(text, str) or not text.strip():
        return False

    # langdetect is non-deterministic by default, so a short sentence can be
    # classified differently from one run to the next; pinning the seed makes
    # the result repeatable.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text) == "fr"
    except LangDetectException:
        return False

In [6]:
from unidecode import unidecode
import re

In [7]:
def test_trip_request(text):
    """Extract the origin, destination and detours of a trip request.

    Returns:
        "NOT_FRENCH" when ``is_french(text)`` is false;
        "NOT_TRIP" when the model does not yield both an ORIGIN and a
        DESTINATION entity;
        otherwise a tuple ``(normalized_text, origin, destination, detours)``
        where ``detours`` is a list of entity texts.
    """
    if not is_french(text):
        return "NOT_FRENCH"

    # Normalize before tagging: transliterate with unidecode, lowercase, then
    # drop every character that is neither a word character nor whitespace.
    normalized = re.sub(r'[^\w\s]', '', unidecode(text).lower())

    # When a label occurs several times, the last ORIGIN / DESTINATION wins.
    endpoints = {"ORIGIN": None, "DESTINATION": None}
    stops = []
    for entity in nlp(normalized).ents:
        if entity.label_ == "DETOUR":
            stops.append(entity.text)
        elif entity.label_ in endpoints:
            endpoints[entity.label_] = entity.text

    if not (endpoints["ORIGIN"] and endpoints["DESTINATION"]):
        return "NOT_TRIP"
    return (normalized, endpoints["ORIGIN"], endpoints["DESTINATION"], stops)

In [9]:
def show_infos(origin, destination, detours):
    """Print the departure, the arrival and the detours of a trip.

    Args:
        origin: departure name.
        destination: arrival name.
        detours: iterable of detour names (strings). ``None`` or an empty
            iterable is printed as "Aucun".
    """
    print(f'Depart: {origin}')
    print(f'Arrivée: {destination}')
    detour_names = list(detours) if detours is not None else []
    detours_sentence = ", ".join(detour_names) if detour_names else "Aucun"
    # The explicit "\n" leaves a blank line after each trip.
    print(f'Détours: {detours_sentence}\n')

In [8]:
# Sample sentences for a manual check of test_trip_request: trips with and
# without a detour, the destination named before the origin, a French sentence
# that is not a trip request, and an English sentence.
test_phrases = [
    "Je veux aller de paris à lyon",
    "J'aimerai aller de lille à nice",
    "Voyage de rouen jusqu'à nice",
    "Quel est le trajet de toulouse à bordeaux ?",
    "Je veux aller de paris à lyon en passant par nice",
    "En passant par toulouse, je veux aller de paris à lyon",
    "J'aime bien les restaurants de paris",
    "What time is it in Paris ?",
    "Quel est le trajet de strasbourg à bordeaux ?",
    "Quel est le trajet de bordeaux à strasbourg en passant par lyon ?",
    "Comment me rendre à strasbourg depuis nice ?",
    "En passant par Lyon, j'aimerai aller à Nice depuis Strasbourg",
]

In [9]:
# Run every sample sentence through test_trip_request and print the raw result.
# The phrase is lowercased before the call; test_trip_request lowercases it
# again after the language check.
for phrase in test_phrases:
    print(f"sentence: {phrase}")
    print(test_trip_request(phrase.lower()))

sentence: Je veux aller de paris à lyon
('je veux aller de paris a lyon', 'paris', 'lyon', [])
sentence: J'aimerai aller de lille à nice
('jaimerai aller de lille a nice', 'lille', 'nice', [])
sentence: Voyage de rouen jusqu'à nice
('voyage de rouen jusqua nice', 'rouen', 'nice', [])
sentence: Quel est le trajet de toulouse à bordeaux ?
('quel est le trajet de toulouse a bordeaux ', 'toulouse', 'bordeaux', [])
sentence: Je veux aller de paris à lyon en passant par nice
('je veux aller de paris a lyon en passant par nice', 'paris', 'lyon', ['nice'])
sentence: En passant par toulouse, je veux aller de paris à lyon
('en passant par toulouse je veux aller de paris a lyon', 'paris', 'lyon', ['toulouse'])
sentence: J'aime bien les restaurants de paris
NOT_TRIP
sentence: What time is it in Paris ?
NOT_FRENCH
sentence: Quel est le trajet de strasbourg à bordeaux ?
('quel est le trajet de strasbourg a bordeaux ', 'strasbourg', 'bordeaux', [])
sentence: Quel est le trajet de bordeaux à strasbour

In [14]:
# Single-sentence check. The phrase is normalized by hand (unidecode, lowercase,
# punctuation removed) before the call, so the language check inside
# test_trip_request sees the normalized text; the function then applies the
# same normalization again.
phrase = "Voyage de rouen jusqu'à nice"
phrase = unidecode(phrase).lower()
phrase = re.sub(r'[^\w\s]', '', phrase)
test_trip_request(phrase)

('voyage de rouen jusqua nice', 'rouen', 'nice', [])