In [2]:
import sys, urllib, zipfile
import urllib.request

%load_ext autoreload
%autoreload 2

sys.path.append("../")
from P10_02_luis.utils import *
from notebook import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [None]:
from datatypes_date_time import Timex, TimexDateHelpers

In [None]:
# Display the `types` attribute of a Timex built from an ISO date string
Timex("2021-05-12").types

In [None]:
# NOTE(review): `from_dt` and `timex` are not assigned in any earlier visible cell
# (`from_dt` is first assigned further down) — confirm the intended run order.
# Shift `from_dt` by every duration component of `timex`; falsy components count as 0.
from_dt + relativedelta(**{
    unit: int(getattr(timex, unit) or 0)
    for unit in ("years", "months", "weeks", "days", "hours", "minutes", "seconds")
})

In [None]:
# Render `from_dt` as a YYYY-MM-DD string
from_dt.strftime("%Y-%m-%d")

In [None]:
# Smaller of the two values (NOTE(review): `to_dt` is only assigned in a later cell — confirm run order)
min(from_dt, to_dt)

In [None]:
# NOTE(review): `daterange` and `now` are not defined in the visible cells above — confirm run order.
# Build the start date, falling back to the matching field of `now` for every falsy component.
from_dt = datetime(
    daterange.year or now.year,
    daterange.month or now.month,
    daterange.day_of_month or now.day,
)
from_dt

In [None]:
# End date = start date shifted by the number of days carried by `daterange`
# (`daterange.days` is converted with int() before being used as a day count)
to_dt = from_dt + timedelta(days=int(daterange.days))
to_dt

In [None]:
from recognizers_date_time import recognize_datetime

In [None]:
# Run the English date/time recognizer on "1 week" and show the resolution of the first result
# (this scratch cell overwrites the shared `tmp` variable)
tmp = recognize_datetime("1 week", "English")
tmp[0].resolution

In [None]:
# Same probe with a "between ... and ..." expression; only the first result is inspected
tmp = recognize_datetime("between the 12th and 19th of september", "English")
tmp[0].resolution

In [None]:
# Same probe with two alternative dates joined by "or"; only the first result is inspected
tmp = recognize_datetime("the 12th or the 19th of september", "English")
tmp[0].resolution

In [None]:
# Same probe with a date expressed relative to "the next year"; only the first result is inspected
tmp = recognize_datetime("september 12th of the next year", "English")
tmp[0].resolution

In [None]:
# Load the environment-variable file (override=True is passed to load_dotenv)
load_dotenv("../P10_03_luis/.env", override=True)

# Read the LUIS settings from the environment (os.getenv returns None when a variable is unset)
LUIS_AUTH_KEY = os.getenv("LUIS_AUTH_KEY")
LUIS_AUTH_ENDPOINT = os.getenv("LUIS_AUTH_ENDPOINT")

LUIS_PRED_KEY = os.getenv("LUIS_PRED_KEY")
LUIS_PRED_ENDPOINT = os.getenv("LUIS_PRED_ENDPOINT")

LUIS_APP_ID = os.getenv("LUIS_APP_ID")

In [None]:
# NOTE(review): `get_env` is not defined in this notebook — presumably provided by the
# star imports of the first cell; confirm what it loads from the .env file.
get_env("../P10_03_luis/.env")

def get_prediction(is_staging: bool, utterance: str, verbose: bool = False) -> dict:
    """Return the prediction made by the LUIS application for one utterance.

    Args:
        is_staging: query the "Staging" slot when True, the "Production" slot otherwise.
        utterance: text sent to the application.
        verbose: forwarded as-is to the prediction call. Defaults to False so that
            callers may omit it (a later cell of this notebook does).

    Returns:
        The prediction response converted to a plain dict with ``as_dict()``.
    """
    # Slot to query
    slot_name = "Staging" if is_staging else "Production"

    # Runtime client built from the prediction endpoint and key read from the environment
    client_runtime = LUISRuntimeClient(LUIS_PRED_ENDPOINT, CognitiveServicesCredentials(LUIS_PRED_KEY))

    # The prediction request carries the utterance as a plain string; wrapping it in a
    # list would make the SDK send the list's string representation instead of the text.
    pred = client_runtime.prediction.get_slot_prediction(
        LUIS_APP_ID,
        slot_name,
        {"query": utterance},
        verbose=verbose
    )

    return pred.as_dict()

In [None]:
# Query the production slot with a long free-form request and pretty-print the verbose response
pprint_dict(get_prediction(
    is_staging=False,
    utterance="hey buddy. i have some work to do looking to domesticate a savage little tea cup yorkie in pittsburgh i am available between the 12th and 19th of september i can leave from wherever, so long as it costs me under 3700. I will leave from porto alegre.",
    verbose=True
))

# Introduction

## Le jeu de données

In [None]:
# Link to the dataset archive and name of the file to extract from it
DATASET_URL = "https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+10%C2%A0-+D%C3%A9veloppez+un+chatbot+pour+r%C3%A9server+des+vacances/frames.zip"
DATASET_FN = "frames.json"

# Skip the download when the JSON file has already been extracted.
# os.path.isfile (unlike os.listdir) does not raise when JSON_PATH does not exist yet.
if os.path.isfile(os.path.join(JSON_PATH, DATASET_FN)):
    print("Tous les fichiers sont bien présents.")
else:
    print("Téléchargement des données en cours...")

    try:
        # Download the .zip to a temporary file and extract only the JSON file
        archive_path, _ = urllib.request.urlretrieve(DATASET_URL)
        with zipfile.ZipFile(archive_path, "r") as f:
            f.extract(DATASET_FN, JSON_PATH)
    finally:
        # Remove the temporary file even when the download or the extraction failed
        urllib.request.urlcleanup()

    print("Téléchargement des données terminé.")

# Chargement des ressources

## Chargement du workspace

In [None]:
# Load the existing Azure Machine Learning workspace from its configuration
ws = Workspace.from_config()

## Chargement du magasin des données

In [None]:
# Get the workspace's default datastore
datastore = ws.get_default_datastore()

# Exploration et analyse des données

## Chargement du fichier

In [None]:
# Load the frames from the location the download cell extracts them to
# (same JSON_PATH / DATASET_FN pair), reading the JSON explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open(os.path.join(JSON_PATH, DATASET_FN), encoding="utf-8") as f:
    frames = json.load(f)

In [None]:
# NOTE(review): this displays the entire first dialogue; consider a shorter preview
frames[0]

In [None]:
# Top-level keys of a dialogue
list(frames[0].keys())

In [None]:
# Keys of a single turn (first turn of the first dialogue)
list(frames[0]["turns"][0].keys())

## Analyse des textes

In [None]:
# Print the first dialogue: a separator line followed by the turn's text (empty when absent)
for turn in frames[0]["turns"]:
    print("-" * 80, turn.get("text", ""), sep="\n")

In [None]:
# Print the opening utterance of each of the first 10 dialogues
for i in range(10):
    opening_turn = frames[i].get("turns", [])[0]
    print(opening_turn.get("text", ""))

## Analyse des labels

In [None]:
# Pretty-print the "acts_without_refs" labels of the first turn of the first dialogue
pprint_dict(frames[0]["turns"][0]["labels"]["acts_without_refs"])

In [None]:
# For every argument key found in the first turn of each dialogue,
# collect the set of distinct values it takes.
keys = defaultdict(set)
for dialogue in frames:
    for act in dialogue["turns"][0]["labels"]["acts_without_refs"]:
        for arg in act["args"]:
            try:
                keys[arg["key"]].add(arg.get("val", ""))
            except (KeyError, TypeError):
                # Argument without a "key" field, or with an unhashable value: show it and
                # keep scanning. Any other error is unexpected and is no longer swallowed.
                print(arg)

In [None]:
# Number of distinct values collected per argument key
{k: len(v) for k, v in keys.items()}

In [None]:
# Distinct values collected for the "intent" key
keys["intent"]

## Transformation des données

In [None]:
# Name of the intent that represents a flight-booking request
intent_name = "book_flight"

# Mapping from the dataset's labels to the LUIS entity names
label_to_entity = dict(
    or_city="from_city",
    dst_city="to_city",
    str_date="from_dt",
    end_date="to_dt",
    budget="budget",
)

In [None]:
# Convert one turn of the dataset into a labelled utterance for LUIS
# (first turn of the first dialogue, used here as a sanity check)
res = turn_to_luis_utterance(frames[0]["turns"][0], intent_name, label_to_entity)

pprint_dict(res)

In [None]:
# Convert the dataset's user turns for LUIS
df = user_turns_to_luis_ds(
    frames,
    intent_name,
    label_to_entity
)

df.shape

In [None]:
# Preview of the first converted rows
df.head()

In [None]:
# Summary statistics of the word count, the total entity count and the per-entity counts
entity_count_columns = [f"{entity}_nb" for entity in label_to_entity.values()]
df[["text_word_nb", "entity_total_nb"] + entity_count_columns].describe().T

In [None]:
# Keep first user turns whose total entity count equals the number of mapped entities minus one
# NOTE(review): the "- 1" is unexplained — confirm why utterances carrying every entity are excluded
book_flight_utterances = df[(df["user_turn_id"] == 0) & (df["entity_total_nb"] == len(label_to_entity) - 1)]
len(book_flight_utterances)

In [None]:
# First user turns that carry no entity at all
none_utterances = df[(df["user_turn_id"] == 0) & (df["entity_total_nb"] == 0)]
len(none_utterances)

In [None]:
# Down-sample the entity-free utterances to the size of the book_flight set (seeded for reproducibility)
# NOTE(review): this overwrites `none_utterances` in place, so the unsampled frame is lost after this cell
none_utterances = none_utterances.sample(len(book_flight_utterances), random_state=RANDOM_SEED)
len(none_utterances)

In [None]:
# Stack both subsets into a single frame (original row indices are kept)
utterances = pd.concat([book_flight_utterances, none_utterances])

## Split des données

In [None]:
# Seeded random 70 % sample used as the training split
utterances_train = utterances.sample(frac=0.7, random_state=RANDOM_SEED)

In [None]:
# The test split is every row whose index label is not in the training sample
utterances_test_idx = utterances.index.difference(utterances_train.index)
utterances_test = utterances.loc[utterances_test_idx]

In [None]:
# Shapes of the two splits
utterances_train.shape, utterances_test.shape

In [None]:
# Keep only the three columns listed below and turn the rows into a list of dicts.
# NOTE(review): `utterances_train` changes from a DataFrame to a list here, so this
# cell cannot be re-run without re-running the split first.
utterances_train = utterances_train[["text", "intentName", "entityLabels"]].to_dict("records")

In [None]:
# First training record
pprint_dict(utterances_train[0])

In [None]:
# Same conversion for the test split (DataFrame -> list of dicts; not re-runnable on its own)
utterances_test = utterances_test[["text", "intentName", "entityLabels"]].to_dict("records")

In [None]:
from typing import List, Dict

from pydantic import BaseModel

In [None]:
class EntityBaseModel(BaseModel):
    """Entity label exposed as ``entity`` / ``startPos`` / ``endPos`` (+ optional children).

    The constructor expects the source keys ``entityName``, ``startCharIndex`` and
    ``endCharIndex`` and copies them onto the model's own field names; a missing
    source key raises ``KeyError``.
    """

    entity: str
    startPos: int
    endPos: int
    children: List["EntityBaseModel"] = []

    def __init__(self, **kwargs):
        # Copy the source-format keys onto the model's field names before validation
        kwargs.update(
            entity=kwargs["entityName"],
            startPos=kwargs["startCharIndex"],
            endPos=kwargs["endCharIndex"],
        )
        super().__init__(**kwargs)

class TestUterranceBaseModel(BaseModel):
    """Labelled utterance exposed as ``text`` / ``intent`` / ``entities``.

    The constructor expects the source keys ``intentName`` and ``entityLabels`` and
    copies them onto the model's own field names; a missing source key raises ``KeyError``.
    """

    text: str
    intent: str
    entities: List[EntityBaseModel]

    def __init__(self, **kwargs):
        # Copy the source-format keys onto the model's field names before validation
        kwargs.update(
            intent=kwargs["intentName"],
            entities=kwargs["entityLabels"],
        )
        super().__init__(**kwargs)
        
def conv_example_to_utterance_format(utterance):
    """Convert one labelled example (dict) into the dict form of TestUterranceBaseModel.

    `utterance` must provide the keys the model's constructor reads
    (``text``, ``intentName``, ``entityLabels``).
    """
    model = TestUterranceBaseModel(**utterance)
    return model.dict()

In [None]:
# Wrap the converted test examples under the "LabeledTestSetUtterances" key.
# NOTE(review): `utterances_test` changes from a list to a dict here, so this cell is not re-runnable on its own.
utterances_test = {
    "LabeledTestSetUtterances": [
        conv_example_to_utterance_format(example) for example in utterances_test
    ]
}

In [None]:
# First converted test utterance
utterances_test["LabeledTestSetUtterances"][0]

## Enregistrement des datasets

In [None]:
# Write both splits to a throw-away directory, then upload that directory to the datastore
with tempfile.TemporaryDirectory() as tmp_dir_name:
    # The training examples go through the same conversion as the test examples
    payload_by_file = {
        "utterances_train.json": list(map(conv_example_to_utterance_format, utterances_train)),
        "utterances_test.json": utterances_test,
    }
    for file_name, payload in payload_by_file.items():
        file_path = os.path.join(tmp_dir_name, file_name)
        with open(file_path, "w") as f:
            json.dump(payload, f)

    # Upload every file under a folder named after today's date
    ds = Dataset.File.upload_directory(
        tmp_dir_name,
        target=(datastore, "utterances/" + datetime.now().strftime("%Y_%m_%d")),
        overwrite=True,
        show_progress=True
    )

In [None]:
# Register the uploaded files as the "utterances" dataset, creating a new version each time.
# NOTE(review): `ds` is rebound to the result of its own `register` call, so re-running
# this cell registers again from the already-registered object.
ds = ds.register(
    workspace=ws,
    name="utterances",
    description="Train and test utterances",
    create_new_version=True
)

In [None]:
# Version of the dataset returned by the registration above
ds.version

## Chargement des données

In [None]:
# Download the registered "utterances" dataset into a throw-away directory and read both splits back
with tempfile.TemporaryDirectory() as tmp_dir_name:
    dataset = Dataset.get_by_name(ws, name='utterances')
    dataset.download(target_path=tmp_dir_name, overwrite=False)

    def _read_json(file_name):
        """Parse one JSON file of the downloaded directory."""
        with open(os.path.join(tmp_dir_name, file_name)) as f:
            return json.load(f)

    utterances_train = _read_json("utterances_train.json")
    utterances_test = _read_json("utterances_test.json")

In [None]:
# First training utterance reloaded from the dataset. The reload cell stores the data in
# `utterances_train`; `train_json` is never assigned in this notebook.
utterances_train[0]

In [None]:
# First test utterance reloaded from the dataset. The reload cell stores the data in
# `utterances_test`; `test_json` is never assigned in this notebook.
utterances_test["LabeledTestSetUtterances"][0]

# Création du modèle LUIS

In [None]:
from azure.cognitiveservices.language.luis.authoring import LUISAuthoringClient
from azure.cognitiveservices.language.luis.authoring.models import ApplicationCreateObject, AzureAccountInfoObject, LuisApp
from azure.cognitiveservices.language.luis.runtime import LUISRuntimeClient
from msrest.authentication import CognitiveServicesCredentials

import json, time, uuid

In [None]:
from dotenv import load_dotenv, set_key

load_dotenv("../P10_03_luis/.env")

In [None]:
# Authoring credentials (used to build and train the app); None when the variable is unset
LUIS_AUTH_KEY = os.getenv("LUIS_AUTH_KEY")
LUIS_AUTH_ENDPOINT = os.getenv("LUIS_AUTH_ENDPOINT")

# Prediction credentials (used by the runtime client); None when the variable is unset
LUIS_PRED_KEY = os.getenv("LUIS_PRED_KEY")
LUIS_PRED_ENDPOINT = os.getenv("LUIS_PRED_ENDPOINT")

In [None]:
# On s'authentifie
client = LUISAuthoringClient(LUIS_AUTH_ENDPOINT, CognitiveServicesCredentials(LUIS_AUTH_KEY))

In [None]:
# We use a UUID to avoid name collisions.
app_name = "p10-luis-app"
app_version = "0.1"
intent_name = "book_flight"

In [None]:
# define app basics
app_def = ApplicationCreateObject(name=app_name, initial_version_id=app_version, culture='en-us')

# create app
app_id = client.apps.add(app_def)

# get app id - necessary for all other changes
print("Created LUIS app with ID {}".format(app_id))

In [None]:
# Persist the new application id in the .env file under LUIS_APP_ID
set_key("../P10_03_luis/.env", "LUIS_APP_ID", app_id)

In [None]:
# Add the booking intent to this version of the application
client.model.add_intent(app_id, app_version, intent_name)

In [None]:
# Add Prebuilt entity
client.model.add_prebuilt(
    app_id,
    app_version,
    prebuilt_extractor_names=[
        "money",
        "datetimeV2",
        "geographyV2"
    ]
)

In [None]:
# Entity names to create in the app; each id starts empty and is filled in once the entity exists
entity_to_id = dict.fromkeys(
    ("from_city", "to_city", "from_dt", "to_dt", "budget"),
    "",
)

In [None]:
# Create every entity in the app and store the id returned for it
for entity_name in list(entity_to_id):
    entity_to_id[entity_name] = client.model.add_entity(app_id, app_version, name=entity_name)

entity_to_id

In [None]:
# add phrase list to app
client.features.add_phrase_list(
    app_id,
    app_version,
    {
        "enabledForAllModels": False,
        "isExchangeable": True,
        "name": "from_phrase_list",
        "phrases": "from,start at,begin from,leave"
    }
)

# add phrase list to app
client.features.add_phrase_list(
    app_id,
    app_version,
    {
        "enabledForAllModels": False,
        "isExchangeable": True,
        "name": "to_phrase_list",
        "phrases": "to,arrive,land at,go,going,stay,heading"
    }
)

In [None]:
# Attach the prebuilt geographyV2 model to the to_city entity, flagged as required
client.features.add_entity_feature(
    app_id,
    app_version,
    entity_to_id["to_city"],
    {"model_name": "geographyV2", "is_required": True}
)

# Attach the arrival phrase list as a feature of the same entity
client.features.add_entity_feature(
    app_id,
    app_version,
    entity_to_id["to_city"],
    {"feature_name": "to_phrase_list", "model_name": None}
)

In [None]:
# Attach the prebuilt geographyV2 model to the from_city entity, flagged as required
client.features.add_entity_feature(
    app_id,
    app_version,
    entity_to_id["from_city"],
    {"model_name": "geographyV2", "is_required": True}
)

# Attach the departure phrase list as a feature of the same entity
client.features.add_entity_feature(
    app_id,
    app_version,
    entity_to_id["from_city"],
    {"feature_name": "from_phrase_list", "model_name": None}
)

In [None]:
# Attach the prebuilt money model to the budget entity, flagged as required
client.features.add_entity_feature(
    app_id,
    app_version,
    entity_to_id["budget"],
    {"model_name": "money", "is_required": True}
)

In [None]:
# Short preview of the training utterances. The reload cell stores them in
# `utterances_train` (`train_json` is never assigned in this notebook), and
# dumping the whole list would flood the output.
utterances_train[:3]

In [None]:
# Upload the training utterances in chunks: the authoring API caps the number of
# examples accepted per batch call (100 at the time of writing — TODO confirm).
# The data lives in `utterances_train`; `train_utterances` is never assigned in this notebook.
# NOTE(review): confirm the stored training records use the label format `examples.batch` expects.
LUIS_BATCH_SIZE = 100

batch_results = []
for start in range(0, len(utterances_train), LUIS_BATCH_SIZE):
    batch_results.extend(
        client.examples.batch(
            app_id,
            app_version,
            utterances_train[start:start + LUIS_BATCH_SIZE],
            enable_nested_children=True
        )
    )

batch_results

In [None]:
# NOTE(review): `get_env` is not defined in this notebook — presumably it reloads the LUIS
# settings (including the freshly written LUIS_APP_ID) from the .env file; confirm.
get_env("../P10_03_luis/.env")

In [None]:
# Train this version of the app (`train` is not defined in this notebook — presumably a project helper)
train(app_version)

In [None]:
# Publish this version to the staging slot (`deploy` is not defined in this notebook — presumably a project helper)
deploy(app_version, is_staging=True)

In [None]:
# Query the staging slot with a sample booking request and pretty-print the response
pred = get_prediction(
    is_staging=True,
    utterance="I want to go from Paris to London the 25/12/2021 and stay one week. I have only 512€."
)

pprint_dict(pred)

In [None]:
# Evaluate the staging slot on the test set. The reload cell stores the test data in
# `utterances_test`; `test_utterances` is never assigned in this notebook.
res = evaluate(is_staging=True, utterances=utterances_test)
res

## Enregistrement des paramètres du modèle

In [None]:
# Fetch the parameters of this app version (`get_params` is not defined in this notebook — presumably a project helper)
params = get_params(app_version)

In [None]:
# Drop the example utterances from the saved parameters
params["utterances"] = []
# Store the version id as the same string used everywhere else in the notebook;
# a float literal would not round-trip for ids such as "0.10".
params["versionId"] = app_version

In [None]:
# Bundle the model parameters with the name and version of the dataset object held in `ds`
model_config = {
    "model": params,
    "dataset": {
        "name": ds.name,
        "version": ds.version,
    }
}

In [None]:
# Write the model configuration next to the LUIS project files as params.json
file_path = os.path.join("../P10_03_luis", "params.json")
with open(file_path, "w") as f:
    json.dump(model_config, f)

## Suppression du modèle

In [None]:
# `delete` is not defined in this notebook — presumably a project helper that removes the given version or app; confirm before running
delete(app_version)

In [None]:
# Resource group and workspace names of the Azure ML workspace
# (this dict is not referenced by any other visible cell)
azure_workspace = {
  "resourceGroup": "p10-ml-rg",
  "workspaceName": "p10-ml-ws"
}