In [1]:
from bs4 import BeautifulSoup
from pathlib import Path
import requests
import re

In [2]:
URL = 'https://leconjugueur.lefigaro.fr/french/verb'

def is_bloc(tag):
    return tag.has_attr('class') and ('modeBloc' in tag.attrs['class'] or 'conjugBloc' in tag.attrs['class'])

class Verb:
    def __init__(self, inf, mode_temps, pronoms, freq, prefix='', suffix=''):
        self.inf = inf
        self.mode_temps = mode_temps
        self.pronoms = pronoms
        self.freq = freq
        self.prefix = prefix
        self.suffix = suffix
        self.data = {}
        self.scrap()
    
    def scrap(self):
        res = requests.get(f'{URL}/{self.inf}.html')
        soup = BeautifulSoup(res.text)
        mode = ''
        for tag in soup.find_all(is_bloc):
            if 'modeBloc' in tag.attrs['class']:
                mode = tag.text
            else:
                temps = tag.find('div', attrs={'class':'tempsBloc'})
                if temps:
                    text = list(tag.children)[1]
                    text = re.sub(r'<\/?(b|p)>', '', str(text))
                    self.data[f'{mode}_{temps.text}'] = text.split('<br/>')
 
    def is_valid(self, mode_temp, pronom, freq):
        return mode_temp in self.mode_temps and pronom in self.pronoms and freq == self.freq
    
    def get_text(self, mode_temp, pronom):
        return f'{self.prefix}{self.data[mode_temp][pronom]}{self.suffix}'

In [3]:
FREQUENCES = ['commun', 'rare']

MODE_TEMPS_FULL = ['Indicative_Present', 'Indicative_Imperfect', 'Conditional_Present', 'Conditional_First past']
MODE_TEMPS_PRESENT = ['Indicative_Present', 'Conditional_Present']
MODE_TEMPS_CONDITIONAL = ['Conditional_Present', 'Conditional_First past']

PRONOMS_FULL = [0, 2, 3, 5]

verbs = [
    Verb('demander', MODE_TEMPS_FULL, PRONOMS_FULL, 'commun', suffix=' à'),
    Verb('pouvoir', MODE_TEMPS_PRESENT, PRONOMS_FULL, 'commun'),
    Verb('vouloir', MODE_TEMPS_FULL, PRONOMS_FULL, 'commun'),
    Verb('desirer', MODE_TEMPS_FULL, PRONOMS_FULL, 'commun'),
    Verb('souhaiter', MODE_TEMPS_FULL, PRONOMS_FULL, 'commun'),
    Verb('aimer', MODE_TEMPS_CONDITIONAL, PRONOMS_FULL, 'commun'),
    Verb('exiger', MODE_TEMPS_FULL, PRONOMS_FULL, 'rare', suffix=' de'),
    Verb('rever', MODE_TEMPS_FULL, PRONOMS_FULL, 'rare', suffix=' de'),
    Verb('reclamer', MODE_TEMPS_FULL, PRONOMS_FULL, 'rare', suffix=' de'),
    Verb('esperer', MODE_TEMPS_FULL, PRONOMS_FULL, 'rare'),
    Verb('aspirer', MODE_TEMPS_FULL, PRONOMS_FULL, 'rare', suffix=' à'),
]

In [4]:
def get_block(prefix, verbs, mode_temp, pronom, freq):
    block = []
    for v in verbs:
        if v.is_valid(mode_temp, pronom, freq):
            block.append(v.get_text(mode_temp, pronom))
    return f'~[{prefix}_{mode_temp.lower()}_{pronom}_{freq}]\n    ' + '\n    '.join(block)
    
blocks = []

for mt in MODE_TEMPS_FULL:
    for p in PRONOMS_FULL:
        for f in FREQUENCES:
            blocks.append(get_block('vouloir', verbs, mt, p, f))

In [5]:
x1 = .70
x2 = .1 / (1-x1)
x3 = .1 / (1-x2) / (1-x1)
print(f'x1 : {x1*100}, x2 : {x2*100}, x3 : {x3*100}, p4 : {(1-x1) * (1-x2) * (1-x3) * 100}')

x1 : 70.0, x2 : 33.33333333333333, x3 : 49.999999999999986, p4 : 10.000000000000005


In [6]:
#########################
### PROBA MODE_TEMPS ####
#########################
# indicative_present : 55 
# indicative_imperfect : 10
# conditional_present : 25 
# conditional_first : 10
# x1 = .55 
# x2 = .1 / (1-x1) = .2222
# x3 = .25 / (1-x2) / (1-x1) = .7142
#########################
##### PROBA PRONOMS #####
#########################
# je (0) : 70
# il/elle (2) : 10
# nous (2) : 10
# ils/elles (2) : 10
# x1 = .70
# x2 = .1 / (1-x1) = .3333
# x3 = .1 / (1-x2) / (1-x1) = .4999
#########################
### PROBA FREQUENCES ####
#########################
# commun : 80
# rare : 20

PROBS = {
    'indicative_present' : '55',  
    'indicative_imperfect' : '22',
    'conditional_present' : '72',
    0 : '70', 
    2 : '33',
    3 : '50',
    'commun' : '80',
}

def generate_rules(k, arr):
    res = ''
    last_rule = (f'{k}_{arr[-1]}', len(arr)-1)
    for _ in range(len(arr)-1):
        if last_rule[1] == 1:
            current_rule = (k, last_rule[1] - 1)
        else:
            current_rule = (f'{k}-{last_rule[1]}', last_rule[1] - 1)
        s = arr[last_rule[1] - 1]
        p = PROBS[s]
        res += f'~[{current_rule[0]}]\n'
        res += f'    ~[{k}_{s}?{last_rule[1]}-{k}/{p}] ~[{last_rule[0]}?!{last_rule[1]}-{k}]\n\n'
        last_rule = current_rule
    return res.strip()

rules = []
rules.append(generate_rules(f'vouloir', [mt.lower() for mt in MODE_TEMPS_FULL]))        
for mt in MODE_TEMPS_FULL:
    rules.append(generate_rules(f'vouloir_{mt.lower()}', PRONOMS_FULL))        
    for p in PRONOMS_FULL:
        rules.append(generate_rules(f'vouloir_{mt.lower()}_{p}', FREQUENCES))        

In [7]:
print(generate_rules('indicative_present', PRONOMS_FULL))

~[indicative_present-3]
    ~[indicative_present_3?3-indicative_present/50] ~[indicative_present_5?!3-indicative_present]

~[indicative_present-2]
    ~[indicative_present_2?2-indicative_present/33] ~[indicative_present-3?!2-indicative_present]

~[indicative_present]
    ~[indicative_present_0?1-indicative_present/70] ~[indicative_present-2?!1-indicative_present]


In [8]:
print(generate_rules('indicative_present_0', FREQUENCES))

~[indicative_present_0]
    ~[indicative_present_0_commun?1-indicative_present_0/80] ~[indicative_present_0_rare?!1-indicative_present_0]


In [9]:
print('\n\n'.join(blocks))

~[vouloir_indicative_present_0_commun]
    je demande à
    je peux
    je veux
    je désire
    je souhaite

~[vouloir_indicative_present_0_rare]
    j'exige de
    je rêve de
    je réclame de
    j'espère
    j'aspire à

~[vouloir_indicative_present_2_commun]
    il demande à
    il peut
    il veut
    il désire
    il souhaite

~[vouloir_indicative_present_2_rare]
    il exige de
    il rêve de
    il réclame de
    il espère
    il aspire à

~[vouloir_indicative_present_3_commun]
    nous demandons à
    nous pouvons
    nous voulons
    nous désirons
    nous souhaitons

~[vouloir_indicative_present_3_rare]
    nous exigeons de
    nous rêvons de
    nous réclamons de
    nous espérons
    nous aspirons à

~[vouloir_indicative_present_5_commun]
    ils demandent à
    ils peuvent
    ils veulent
    ils désirent
    ils souhaitent

~[vouloir_indicative_present_5_rare]
    ils exigent de
    ils rêvent de
    ils réclament de
    ils espèrent
    ils aspirent à

~[vouloir_indica

In [10]:
print('\n\n'.join(rules))

~[vouloir-3]
    ~[vouloir_conditional_present?3-vouloir/72] ~[vouloir_conditional_first past?!3-vouloir]

~[vouloir-2]
    ~[vouloir_indicative_imperfect?2-vouloir/22] ~[vouloir-3?!2-vouloir]

~[vouloir]
    ~[vouloir_indicative_present?1-vouloir/55] ~[vouloir-2?!1-vouloir]

~[vouloir_indicative_present-3]
    ~[vouloir_indicative_present_3?3-vouloir_indicative_present/50] ~[vouloir_indicative_present_5?!3-vouloir_indicative_present]

~[vouloir_indicative_present-2]
    ~[vouloir_indicative_present_2?2-vouloir_indicative_present/33] ~[vouloir_indicative_present-3?!2-vouloir_indicative_present]

~[vouloir_indicative_present]
    ~[vouloir_indicative_present_0?1-vouloir_indicative_present/70] ~[vouloir_indicative_present-2?!1-vouloir_indicative_present]

~[vouloir_indicative_present_0]
    ~[vouloir_indicative_present_0_commun?1-vouloir_indicative_present_0/80] ~[vouloir_indicative_present_0_rare?!1-vouloir_indicative_present_0]

~[vouloir_indicative_present_2]
    ~[vouloir_indicative_

In [11]:
Path('chatette/verbs.chatette').write_text('\n\n'.join(rules+blocks), encoding='utf-8')

9810

In [12]:
import pandas as pd
from slugify import slugify

In [13]:
entities = pd.read_csv('.data/entities.csv')
doctors = entities[entities['type'] == 'doctor']['label'].values
services = entities[entities['type'] == 'service']['label'].values

In [14]:
def indent(value, space_count=4):
    return (' ' * space_count) + value

def block(value):
    return f'~[{value}]'

def entry(value, train_size, test_size):
    return f"%[{value}]('train': '{train_size}', 'test': '{test_size}')"
    
def entity(value, label, role=''):
    if role == '':
        return f'\[{value}\]({label})'
    return f'\[{value}\]' + '\{' + f'"entity": "{label}", "role": "{role}"' + '\}'

In [15]:
def contact_doctor(doctor, train_size, test_size):
    lines = []
    slug = slugify(doctor).replace('-', '_')
    lines.append(entry(f'contact_doctor_{slug}', train_size, test_size))
    lines += contact_doctor_content(doctor)
    return lines

def contact_doctor_content(doctor):
    lines = []
    lastname = doctor.split()[0]
    lines.append(indent(block(f'contact doctor simple${entity(doctor, "doctor", "target")}')))
    lines.append(indent(block(f'contact doctor simple${entity(lastname, "doctor", "target")}')))
    lines.append(indent(block(f'contact doctor medium${entity(doctor, "doctor", "target")}')))
    lines.append(indent(block(f'contact doctor medium${entity(lastname, "doctor", "target")}')))
    return lines
    
def contact_service(service, train_size, test_size):
    lines = []
    slug = slugify(service).replace('-', '_')
    lines.append(entry(f'contact_service_{slug}', train_size, test_size))
    lines += contact_service_content(service)
    return lines
    
def contact_service_content(service):
    lines = []
    lines.append(indent(block(f'contact service simple${entity(service, "service", "target")}')))
    lines.append(indent(block(f'contact service medium${entity(service, "service", "target")}')))
    return lines

In [18]:
TRAIN_SIZE = 50
TEST_SIZE = 15
UNKOWN_TRAIN_SIZE = 200
UNKOWN_TEST_SIZE = 50
UNKOWN_SERVICE = ['lune', 'soleil', 'Paris', 'gouvernement', 'tour effeil', 'paradis', 'perpet les alouettes',
                 'clinique', 'hopital', 'charité', 'église', 'mer', 'caserne', 'restaurant', 'magasins', 'coiffeur', 'pizzeria',
                 'cabinet']
UNKOWN_DOCTOR = ['PRINCE Aladdin', 'EPRON Pierre', 'VALJEAN Jean', 'MACRON Emmanuel', 'DIEU Vivant', 'CHRIST Jesus', 'ZIDANE Zinedine',
                'CASTEX Jean', 'DUPONT philipe', 'CHRISTIE Agatha', 'D\'Arc Jeanne', 'SALDANA Ione', 'DANOURA Kobushi', 'WALKER helene', 
                 'HAMPTON Kelsie', 'CURRY Marie']

lines = []
for doctor in doctors:
    lines += contact_doctor(doctor, TRAIN_SIZE, TEST_SIZE)
    lines.append('\n')

for service in services:
    lines += contact_service(service, TRAIN_SIZE, TEST_SIZE)
    lines.append('\n')

    
lines.append(entry('contact_doctor_unknown', UNKOWN_TRAIN_SIZE, UNKOWN_TEST_SIZE))
for doctor in UNKOWN_DOCTOR:
    lines += contact_doctor_content(doctor)
lines.append('\n')

lines.append(entry('contact_service_unknown', UNKOWN_TRAIN_SIZE, UNKOWN_TEST_SIZE))
for service in UNKOWN_SERVICE:
    lines += contact_service_content(service)
lines.append('\n')

lines.append(entry('out_of_scope', UNKOWN_TRAIN_SIZE, UNKOWN_TEST_SIZE))
lines.append(indent(block('start noise')))
lines.append(indent(block('problems')))
lines.append(indent(block('from doctor')))
lines.append(indent(block('end noise')))
lines.append(indent(block('random')))
lines.append('\n')

    
lines.append('|doctors.chatette')
lines.append('|services.chatette')
lines.append('|noises.chatette')
lines.append('|verbs.chatette')
lines.append('|others.chatette')
    
print('\n'.join(lines))    
Path('chatette/main.chatette').write_text('\n'.join(lines), encoding='utf-8')

%[contact_doctor_aboab_jennifer]('train': '50', 'test': '15')
    ~[contact doctor simple$\[ABOAB Jennifer\]\{"entity": "doctor", "role": "target"\}]
    ~[contact doctor simple$\[ABOAB\]\{"entity": "doctor", "role": "target"\}]
    ~[contact doctor medium$\[ABOAB Jennifer\]\{"entity": "doctor", "role": "target"\}]
    ~[contact doctor medium$\[ABOAB\]\{"entity": "doctor", "role": "target"\}]


%[contact_doctor_aboucaya_jean_paul]('train': '50', 'test': '15')
    ~[contact doctor simple$\[ABOUCAYA Jean-Paul\]\{"entity": "doctor", "role": "target"\}]
    ~[contact doctor simple$\[ABOUCAYA\]\{"entity": "doctor", "role": "target"\}]
    ~[contact doctor medium$\[ABOUCAYA Jean-Paul\]\{"entity": "doctor", "role": "target"\}]
    ~[contact doctor medium$\[ABOUCAYA\]\{"entity": "doctor", "role": "target"\}]


%[contact_doctor_al_moussarih_abdallah]('train': '50', 'test': '15')
    ~[contact doctor simple$\[AL MOUSSARIH Abdallah\]\{"entity": "doctor", "role": "target"\}]
    ~[contact doctor s

28149

In [19]:
from rasa.shared.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.entities_parser import parse_training_example
import json

In [20]:
def correct_encoding(text):
    chars = [
        ('\xc3\xaa', 'ê'), # 0
        ('\xc3\xa9', 'é'), # 1
        ('\xc3\xa8', 'è'), # 2
        ('\xc3\xab', 'ë'), # 3
        ('\xc3\xa2', 'â'), # 5
        ('\xc3\xa4', 'ä'), # 6
        ('\xc3\xb4', 'ô'), # 7
        ('\xc3\xb6', 'ö'), # 8
        ('\xc3\xae', 'î'), # 9
        ('\xc3\xaf', 'ï'), # 10
        ('\xc3\xbb', 'û'), # 11
        ('\xc3\xbc', 'ü'), # 12
        ('\xc3\xb9', 'ù'), # 13
        ('\xc3', 'à'),     # 4
    ]
    for i, j in chars:
        text = text.replace(i, j)
    return text

In [22]:
#### path = Path('output/train/output.json')

def load_from_chatette(path):
    examples = json.loads(correct_encoding(path.read_text()))
    data = RasaYAMLReader().reads(Path('.data/nlu_rs_base.yml').read_text('utf-8'))
    data.training_examples.clear()

    for item in examples['rasa_nlu_data']['common_examples']:
        if 'contact_doctor_' in item['intent']:
            item['intent'] = item['intent'].replace('contact_doctor_', 'contact_doctor/')
        if 'contact_service_' in item['intent']:
            item['intent'] = item['intent'].replace('contact_service_', 'contact_service/')
        data.training_examples.append(parse_training_example(item['text'], item['intent']))

    for doctor in doctors:
        slug = slugify(doctor).replace('-', '_')
        data.responses.update({f'utter_contact_doctor/{slug}': [{'text': f'Bonjour, j\'ai compris que vous cherchiez à contacter le docteur {doctor}'}]})

    for service in services:
        slug = slugify(service).replace('-', '_')
        data.responses.update({f'utter_contact_service/{slug}': [{'text': f'Bonjour, j\'ai compris que vous cherchiez à contacter le service {service}'}]})

    data.responses.update({f'utter_contact_doctor/unknown': [{'text': f'Bonjour, j\'ai compris que vous cherchiez à contacter un docteur. Je n\'ai pas compris lequel'}]})
    data.responses.update({f'utter_contact_service/unknown': [{'text': f'Bonjour, j\'ai compris que vous cherchiez à contacter un service. Je n\'ai pas compris lequel'}]})
    data.responses.update({f'utter_out_of_scope': [{'text': f'Bonjour, je ne suis pas entrainé a répondre à votre demande !'}]})
    return data

In [23]:
data = load_from_chatette(Path('output/train/output.json'))
data.persist('data')

[0m

{'training_data': 'training_data.yml'}

In [24]:
data = load_from_chatette(Path('output/test/output.json'))
data.persist('tests')

{'training_data': 'training_data.yml'}