In [1]:
import spacy
import random
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.util import minibatch
from spacy.util import compounding


In [2]:
!python3 -m spacy download fr_core_news_sm
import fr_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')


In [3]:
# Load the French pipeline from the model package imported above.
nlp = fr_core_news_sm.load()

In [4]:
# Read the labelled dataset (one sentence and one intent per row) from a path
# relative to the notebook's working directory, then preview the first rows.
training_df = pd.read_json('data/training_set.json')
training_df.head()

Unnamed: 0,intent,sentence
0,irrelevant,"850€ maximum pour le loyer, à partir de janvie..."
1,irrelevant,D'imprimer
2,purchase,Le meilleur cabriolet hybrid moins de 5m10 min...
3,find-hotel,en ce moment je cher un location pour les vaca...
4,irrelevant,c'est possible de t'utiliser la nuit ?


In [5]:
# (rows, columns) of the training frame.
training_df.shape

(6035, 2)

In [6]:
# Column dtypes and non-null counts, to check for missing values.
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6035 entries, 0 to 6034
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   intent    6035 non-null   object
 1   sentence  6035 non-null   object
dtypes: object(2)
memory usage: 94.4+ KB


In [7]:
# Number of sentences per intent label, to see how balanced the classes are.
training_df["intent"].value_counts()

irrelevant           3852
purchase              613
find-restaurant       469
find-around-me        383
find-hotel            316
find-train            143
find-flight           142
provide-showtimes     117
Name: intent, dtype: int64

In [8]:
# Split the frame into model inputs (raw sentences) and targets (intent labels).
train_x = training_df["sentence"]
train_y = training_df["intent"]

In [9]:
# spaCy's French stop words, materialised as a list.
# NOTE(review): `x in stop_words` is a linear scan on a list; a set would be
# faster, but confirm no other cell needs list semantics before changing it.
stop_words = list(STOP_WORDS)

In [10]:
# ASCII punctuation characters as one string.
# NOTE: because this is a str, `token in punct` is a substring test, not a
# per-character membership test.
punct = string.punctuation

In [11]:
# Matches a text emoticon at the start of a string, followed by whitespace,
# one of "!", ".", "?" or the end of the string. Four alternatives:
#   1. :name:                       (e.g. ":smile:")
#   2. <3, </3, <\3                 (hearts)
#   3. reversed faces, mouth first  (e.g. "(:", "D:", "|-:")
#   4. regular faces, eyes first    (e.g. ":)", ";-D", "8)")
# The mouth class in alternative 3 uses a literal "D"; the previous "\D"
# matched ANY non-digit character, so plain tokens such as "a:" were treated
# as emoticons.
emoticon_regex = re.compile(r"""(\:\w+\:|\<[\/\\]?3|[\(\)\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)""")

In [12]:
def text_data_cleaning(sentence):
    """Lemmatise a sentence and drop tokens that carry no content.

    The sentence is run through the module-level ``nlp`` pipeline. Each token
    is replaced by its lower-cased, stripped lemma (or by its lower-cased
    surface form when the lemma is the "-PRON-" placeholder). A lemma is then
    discarded when it is:

    * empty (e.g. produced by a whitespace-only token),
    * made up solely of characters from ``punct`` (so "," but also "..." or "?!"),
    * listed in ``stop_words``,
    * recognised by ``emoticon_regex``.

    Args:
        sentence: Raw text to clean.

    Returns:
        list[str]: The remaining lemmas, in their original order.
    """
    doc = nlp(sentence)

    lemmas = [
        token.lower_ if token.lemma_ == "-PRON-" else token.lemma_.lower().strip()
        for token in doc
    ]

    cleaned_tokens = []
    for lemma in lemmas:
        # Explicit emptiness check: the old `lemma in punct` substring test only
        # removed empty strings because "" is a substring of every string.
        if not lemma:
            continue
        # Per-character test so multi-character punctuation runs are removed too.
        if all(char in punct for char in lemma):
            continue
        if lemma in stop_words:
            continue
        if emoticon_regex.match(lemma) is not None:
            continue
        cleaned_tokens.append(lemma)
    return cleaned_tokens
text_data_cleaning("   Je veux bien , tu peux trouver un site de carte postale pour la fête des mères :)")

['vouloir', 'pouvoir', 'trouver', 'site', 'carte', 'postal', 'fête', 'mère']

In [13]:
def df2list(text_df, label_df):
    """Pair each text with its label as ``(text, {'cats': label})`` tuples.

    Rows are read by position (``.iloc``), so the index labels of the two
    inputs are ignored.

    Args:
        text_df: pandas Series of texts.
        label_df: pandas Series of labels, read at the same positions.

    Returns:
        list[tuple]: One ``(text, {'cats': label})`` tuple per row of ``text_df``.
    """
    pairs = []
    for position in range(len(text_df)):
        annotation = {'cats': label_df.iloc[position]}
        pairs.append((text_df.iloc[position], annotation))
    return pairs

In [14]:
# List of (sentence, {'cats': intent}) tuples built from the two training Series.
training_list = df2list(train_x, train_y)

In [15]:
# Append a text-classification component to the pipeline.
# The guard makes the cell safe to re-run: adding a second component under
# the same name raises an error, so an existing one is reused instead.
if "textcat" in nlp.pipe_names:
    text_cat = nlp.get_pipe("textcat")
else:
    text_cat = nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"})
    nlp.add_pipe(text_cat, last=True)
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

In [16]:
# Register every distinct intent as a label of the text categorizer.
# Sorting fixes the label order; iterating a bare set of strings can yield a
# different order in each kernel session.
for label in sorted(set(training_df["intent"])):
    text_cat.add_label(label)

In [None]:
# Train the text categorizer on `training_list`, whose items are
# (sentence, {'cats': intent}) tuples.
N_ITERATIONS = 10

# Seeds Python's RNG so the shuffle order is repeatable (model initialisation
# is not covered by this seed).
random.seed(0)

textcat_labels = nlp.get_pipe("textcat").labels

# Only the text categorizer is trained. The other components are disabled so
# that begin_training() does not re-initialise their pretrained weights.
other_pipes = [name for name in nlp.pipe_names if name != "textcat"]

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    for itn in range(N_ITERATIONS):
        # Shuffle the training data
        random.shuffle(training_list)
        losses = {}

        # Batch sizes grow from 4 to 32; single-example batches make each
        # pass over the data needlessly slow.
        for batch in spacy.util.minibatch(training_list, size=spacy.util.compounding(4.0, 32.0, 1.001)):
            # Raw strings are enough here: nlp.update() tokenises them itself.
            texts = [text for text, _ in batch]
            # The textcat gold format maps EVERY label to a truth value.
            # Wrapping the stored {'cats': intent} dict in another "cats" key
            # would leave all registered labels without a gold value.
            annotations = [
                {"cats": {label: label == entities["cats"] for label in textcat_labels}}
                for _, entities in batch
            ]
            nlp.update(texts, annotations, sgd=optimizer, losses=losses)

        # Report the loss after every pass; `itn % 20 == 0` fired only once
        # in a 10-iteration run.
        print(itn, losses)