In [1]:
from typing import List
from pathlib import Path
import pandas as pd
import random

from rasa.shared.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.shared.nlu.training_data.entities_parser import parse_training_example

### Load Entities

In [2]:
entities = pd.read_csv('.data/entities.csv')
doctors = entities[entities['type'] == 'doctor']['label'].values
services = entities[entities['type'] == 'service']['label'].values

### Helpers

In [3]:
def parse_entity(value : str, label : str) -> str:
    """
        Shortcut for add entity to examples sentences.
        
        Parameters
        ----------
        value : str
            Entity value.
        label : str
            Entity label.
        
        Return
        ------
        str
            Well formated entity.
        
        Examples
        -------- 
        >>> parse_entity('paris', 'loc')
        [paris](loc)
    """
    
    return f'[{value}]({label})'

def parse_entities(values : List[str], label : str) -> List[str]:
    """
        Shortcut to apply ''parse_entity'' on a ''values'' list with a single ''label''. 
        
        
        Parameters
        ----------
        values : str
            List of entities values.
        label : str
            Entities label to apply.
                
        Return
        ------
        List[str]
            Well formated entities.        
    """
    
    return [parse_entity(value, label) for value in values]

In [4]:
def generate_sents(sent : str, parts : List[List[str]], output : List[str]) -> None:
    """
    Generate sentences based on multiple parts lists.

    This is a recursive function. For each parts list, add each parts to the base sentence. 
    If it's the last parts list, add result to output.
    Else, call this function with new base sentence and updated parts list (removing the used list).

    Parameters
    ----------
    sent : str
        Base sentence. Start with empty string, recursivly get improved by each part.
    parts : list[list[str]]
        The multiple list of parts wich will improve recursivly the base sentence.
    output : list[str]
        Final sentences list. Get filled during the recursive process.

    """
    
    # if there is only 1 parts list remaning
    if len(parts) == 1:
         # for each part concat with base sentence and add it to output
        for part in parts[0]:
            output.append(f'{sent} {part}'.strip())            
    # if there is more than 1 parts list remaning
    else:
        # for each part concat with base sentence and call this function with new base sentence and updated parts list
        for part in parts[0]:
            generate_sents(f'{sent} {part}', parts[1:], output)

### Conjugate verbs

In [5]:
from bs4 import BeautifulSoup
import requests
import re

mode_temps = ['Indicative_Present', 'Indicative_Imperfect', 'Conditional_Present', 'Conditional_First past']
pronoms = [0, 2, 3, 5]

def is_bloc(tag):
    return tag.has_attr('class') and ('modeBloc' in tag.attrs['class'] or 'conjugBloc' in tag.attrs['class'])

def get_verb_conjugate(verb):
    sents = []
    res = requests.get(f'https://leconjugueur.lefigaro.fr/french/verb/{verb}.html')
    print(f'{verb} : {res.status_code}')
    soup = BeautifulSoup(res.text)
    mode = ''
    for tag in soup.find_all(is_bloc):
        if 'modeBloc' in tag.attrs['class']:
            mode = tag.text
        else:
            temps = tag.find('div', attrs={'class':'tempsBloc'})
            if temps and f'{mode}_{temps.text}' in mode_temps:
                text = list(tag.children)[1]
                text = re.sub(r'<\/?(b|p)>', '', str(text))
                sents += [sent for i, sent in enumerate(text.split('<br/>')) if i in pronoms]
    return sents
            

In [6]:
infinitive_verbs = ['demander', 'vouloir', 'reclamer', 'aimer', 'desirer', 'aspirer', 'esperer', 'exiger', 'souhaiter', 'rever']

verbs = {}
for verb in infinitive_verbs:
    verbs[verb] = get_verb_conjugate(verb)

demander : 200
vouloir : 200
reclamer : 200
aimer : 200
desirer : 200
aspirer : 200
esperer : 200
exiger : 200
souhaiter : 200
rever : 200


### Generate doctor sents

In [7]:
wants = []
for verb in ['vouloir', 'aimer', 'desirer', 'aspirer', 'souhaiter', 'esperer']:
    wants += verbs[verb]

for verb in ['reclamer', 'exiger', 'rever']:
    wants += [f'{v} de' for v in verbs[verb]]
    
for verb in ['demander']:
    wants += [f'{v} à' for v in verbs[verb]]
    

contacts = ['parler', 'causer', 'telephoner'] 
wichs = ['avec le docteur', 'avec la docteur', 'au docteur', 
 'avec le médecin', 'avec la médecin', 'au médecin',
 'avec Mr', 'avec Mme', 'à Monsieur', 'à Madame',
 'à Mr', 'à Mme', 'à Monsieur', 'à Madame']
targets = parse_entities([doc.split()[0] for doc in doctors] + [doc for doc in doctors], 'doctor')

In [9]:
doctor_sents = []
generate_sents('', [wants, contacts, wichs, targets], doctor_sents)
len(doctor_sents)

403200

In [10]:
rsents = list(doctor_sents)
random.shuffle(rsents)
rsents[:25]

['il rêvait de parler à Madame [ALONSO Enrique](doctor)',
 'il veut parler à Madame [BRYON Dominique](doctor)',
 'il espère parler au docteur [BENSAID Cherazade](doctor)',
 'je souhaitais parler à Madame [BRUNET](doctor)',
 'je rêve de telephoner à Mr [BRUNET](doctor)',
 'je réclamerais de causer à Mme [BENSAID](doctor)',
 'nous espérons parler à Mme [BAUER Bertrand](doctor)',
 'ils exigeraient de parler au médecin [BENSAID](doctor)',
 "j'aurais réclamé de parler au docteur [BADO Fabrice](doctor)",
 "j'aurais aimé causer avec le docteur [BRUNET](doctor)",
 "j'aurais demandé à causer à Madame [ALONSO](doctor)",
 'nous rêvons de causer à Monsieur [BRUNET](doctor)',
 'il réclamait de causer avec Mme [BELDA Eric](doctor)',
 'il souhaiterait telephoner avec Mr [BAUER Bertrand](doctor)',
 'nous exigeons de telephoner au médecin [BELMAGHNI Fatma](doctor)',
 'ils désireraient parler avec le docteur [ALONSO Enrique](doctor)',
 'il voulait causer avec le docteur [BOULET](doctor)',
 'ils souhaite

### Generate service sents

In [11]:
wichs = ['avec le service', 'au service', 
         'avec le secrétariat', 'au secrétariat', 
         'avec le secrétariat du service', 'au secrétariat du service', 
         'avec l\'acceuil', 'à l\'acceuil'
         'avec l\'acceuil du service', 'à l\'acceuil du service', 
        ]
targets = parse_entities([service.lower() for service in services], 'service')

In [13]:
service_sents = []
generate_sents('', [wants, contacts, wichs, targets], service_sents)
len(service_sents)

129600

In [14]:
rsents = list(service_sents)
random.shuffle(rsents)
rsents[:25]

['je souhaiterais causer au secrétariat [stomatologie](service)',
 "j'aurais souhaité causer au secrétariat du service [accueil](service)",
 "j'aurais réclamé de telephoner avec le service [ressources humaines](service)",
 'ils espèrent causer au secrétariat [rythmologie](service)',
 'ils demandaient à causer au service [ophtalmologie](service)',
 'nous voulions parler avec le service [orl](service)',
 "ils souhaitent causer avec l'acceuil [sophrologie](service)",
 "je demanderais à parler à l'acceuil du service [assistante hôtelière](service)",
 'nous souhaitions causer avec le service [urgence](service)',
 "je veux telephoner à l'acceuilavec l'acceuil du service [stomathérapie](service)",
 "j'aime causer au secrétariat du service [pédiatrie](service)",
 'il souhaiterait causer au secrétariat du service [chirurgie vasculaire](service)',
 "j'aime causer avec l'acceuil [planification](service)",
 'il exigeait de telephoner au secrétariat du service [orl](service)',
 "j'aurais exigé de t

### Make yaml files

In [15]:
data = RasaYAMLReader().reads(Path('.data/nlu.yml').read_text('utf-8'))
data.intent_examples

[0m

[<rasa.shared.nlu.training_data.message.Message at 0x247adadc6a0>,
 <rasa.shared.nlu.training_data.message.Message at 0x247ae485640>,
 <rasa.shared.nlu.training_data.message.Message at 0x247ae4908b0>]

In [16]:
data.lookup_tables[0]['elements'] = list(services)

In [17]:
data.lookup_tables[1]['elements'] = list([doc.split()[0] for doc in doctors] + [doc for doc in doctors])

In [18]:
data.training_examples.clear()

In [19]:
service_rsents = list(service_sents)
random.shuffle(service_rsents)

In [20]:
for sent in service_rsents[:10000]:
    data.training_examples.append(parse_training_example(sent, 'contact_service'))

In [21]:
doctor_rsents = list(doctor_sents)
random.shuffle(doctor_rsents)

In [22]:
for sent in doctor_rsents[:10000]:
    data.training_examples.append(parse_training_example(sent, 'contact_doctor'))

In [23]:
data.persist_nlu('data/nlu_0.yml')