In [1]:
import pandas as pd

In [64]:
# Load the annotated sentences from a ';'-separated CSV file.
# The cells below read the columns 'text', 'origin', 'destination' and 'detours'.
data = pd.read_csv("datasets/spacy_ds.csv", delimiter=';')

In [65]:
# Accumulator for training samples; each item appended later has the form
# (text, {"entities": [(start_char, end_char, label), ...]}).
train_data = []

In [None]:
def _locate(haystack, needle):
    """Return the (start, end) character span of the first occurrence of
    `needle` in `haystack`, or None when `needle` is empty or absent.

    Both arguments are expected to be lower-cased by the caller so that the
    comparison is case-insensitive.
    """
    if not needle:
        # str.find("") returns 0, which would yield a zero-length entity.
        return None
    start = haystack.find(needle)
    if start < 0:
        return None
    return start, start + len(needle)


def _overlaps(span, other):
    """Return True when the two half-open (start, end) spans share a character."""
    return span[0] < other[1] and other[0] < span[1]


# Convert every CSV row into a (text, {"entities": [...]}) training sample.
for _, row in data.iterrows():
    text = row['text']
    origin = row['origin']
    destination = row['destination']
    detours = row['detours'] if pd.notna(row['detours']) else ""

    # Missing cells are read as NaN (a float); str.find() would raise on them.
    if not all(isinstance(value, str) for value in (text, origin, destination)):
        continue
    if origin == "NOT_TRIP":
        continue

    # Offsets are searched in the lower-cased text, so the needles must be
    # lower-cased as well or capitalised place names would never match.
    lowered_text = text.lower()
    origin_span = _locate(lowered_text, origin.lower())
    destination_span = _locate(lowered_text, destination.lower())
    if origin_span is None or destination_span is None:
        continue

    # NOTE(review): only rows where the origin ends before the destination
    # starts are kept, so "à X depuis Y" sentences are dropped — confirm this
    # is intended and not just an overlap check.
    if origin_span[1] > destination_span[0]:
        continue

    detour_positions = []
    for detour in detours.split(","):
        detour_span = _locate(lowered_text, detour.strip().lower())
        if detour_span is None:
            continue
        # A detour must not overlap the origin, the destination or any
        # detour that was already accepted for this sentence.
        occupied = [origin_span, destination_span] + [
            (start, end) for start, end, _ in detour_positions
        ]
        if any(_overlaps(detour_span, other) for other in occupied):
            continue
        detour_positions.append((detour_span[0], detour_span[1], "DETOUR"))

    entities = [
        (origin_span[0], origin_span[1], "ORIGIN"),
        (destination_span[0], destination_span[1], "DESTINATION"),
    ] + detour_positions

    train_data.append((text, {"entities": entities}))

In [None]:
import spacy
from spacy.training.example import Example
import random

In [67]:
# Create a blank French pipeline and make sure it has an NER component.
nlp = spacy.blank("fr")

if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Register the three entity labels produced when building train_data.
for label in ("ORIGIN", "DESTINATION", "DETOUR"):
    ner.add_label(label)

# spaCy v3 renamed begin_training() to initialize(); the old name only
# forwards to it after emitting a deprecation warning.
optimizer = nlp.initialize()

In [None]:
# Ten passes over the training samples, updating the pipeline one sample at a time.
for itn in range(10):
    # Shuffle in place so each pass sees the samples in a different order.
    random.shuffle(train_data)
    # Loss values are accumulated per pass and reported once it is finished.
    losses = {}
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update(
            [example],
            sgd=optimizer,
            drop=0.5,
            losses=losses,
        )
    print(f"Itération {itn} - Losses: {losses}")

In [94]:
# Serialize the current pipeline to the "models/spacy_model" directory.
nlp.to_disk("models/spacy_model")

In [12]:
# Rebind `nlp` to a pipeline loaded from disk; the functions below use this one.
# NOTE(review): './model_test' is not the directory written by
# nlp.to_disk("models/spacy_model") — confirm which model is meant to be evaluated.
nlp = spacy.load('./model_test')

In [69]:
from langdetect import detect, LangDetectException

In [70]:
def is_french(text):
    """Return True when langdetect classifies `text` as French ("fr").

    Returns False when langdetect raises LangDetectException.
    """
    # langdetect is non-deterministic by default, so the same sentence can be
    # classified differently between calls. Pinning the factory seed makes
    # the result reproducible.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text) == "fr"
    except LangDetectException:
        return False

In [71]:
from unidecode import unidecode
import re

In [72]:
def test_trip_request(text):
    """Extract a trip (origin, destination, detours) from a sentence.

    Returns:
        "NOT_FRENCH" when is_french(text) is false;
        "NOT_TRIP" when the model did not yield both an ORIGIN and a
        DESTINATION entity;
        otherwise the tuple (normalized_text, origin, destination, detours),
        where detours is a list of DETOUR entity texts in document order.
    """
    if not is_french(text):
        return "NOT_FRENCH"

    # Normalise: transliterate to ASCII, lower-case, then drop every character
    # that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', unidecode(text).lower())

    # For ORIGIN / DESTINATION the last entity with that label wins.
    endpoints = {"ORIGIN": None, "DESTINATION": None}
    detours = []
    for span in nlp(text).ents:
        label = span.label_
        if label in endpoints:
            endpoints[label] = span.text
        elif label == "DETOUR":
            detours.append(span.text)

    origin = endpoints["ORIGIN"]
    destination = endpoints["DESTINATION"]
    if not (origin and destination):
        return "NOT_TRIP"
    return (text, origin, destination, detours)

In [73]:
def show_infos(origin, destination, detours):
    """Print a three-line summary of a trip.

    Args:
        origin: departure place, printed after "Depart: ".
        destination: arrival place, printed after "Arrivée: ".
        detours: sequence of detour names; they are printed comma-separated,
            or "Aucun" is printed when the sequence is empty or None.
    """
    print(f'Depart: {origin}')
    print(f'Arrivée: {destination}')
    # str.join replaces the manual index loop; a falsy value (empty or None)
    # falls back to the "Aucun" placeholder.
    detours_sentence = ", ".join(detours) if detours else "Aucun"
    # The explicit "\n" leaves a blank line after the summary.
    print(f'Détours: {detours_sentence}\n')

In [88]:
# Sample sentences for a manual check of test_trip_request.
# They mix several phrasings: origin before destination, destination before
# origin ("à ... depuis ..."), detours at the start or end of the sentence,
# and one English sentence.
test_phrases = [
    "Je veux aller de paris à vendenheim",
    "J'aimerai aller de lille à nice",
    "Voyage de rouen jusqu'à nice",
    "Quel est le trajet de toulouse à bordeaux ?",
    "Je veux aller de paris à lyon en passant par nice",
    "En passant par toulouse, je veux aller de paris à lyon",
    "What time is it in Paris ?",
    "Quel est le trajet de strasbourg à bordeaux ?",
    "Quel est le trajet de bordeaux à strasbourg en passant par lyon ?",
    "Comment me rendre à strasbourg depuis nice ?",
    "En passant par Lyon, j'aimerai aller à Nice depuis Strasbourg",
]

In [None]:
# Echo each sample sentence, then the result for its lower-cased form.
for phrase in test_phrases:
    print(f"sentence: {phrase}")
    outcome = test_trip_request(phrase.lower())
    print(outcome)

In [None]:
# Ad-hoc check on a single sentence where the destination precedes the origin;
# the bare last expression makes the notebook display the returned value.
phrase = "j'aimerais me rendre à lyon depuis strasbourg"
test_trip_request(phrase)