Concatenation des vecteurs issues de TfidfVectorizer+SelectFromModel(LinearSVC) avec ceux issus d'un embedding (multilingual).  

Entrainement avec des modèles machine learning sur **40000** échantillons:
  * LinearSVC(l2) est moins bon (0.71) qu'avec TfidfVectorizer seul (0.82) et semble diverger (?)
  * LightGBM a des résultats corrects (0.8176) dans un temps raisonnable (687 sec)
  * Le petit réseau de neurones est à 0.79


In [1]:
import os
import re
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt

import spacy
import nltk
from nltk.corpus import stopwords

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

OUTDIR = "out"
if not os.path.isdir(OUTDIR):
    os.mkdir(OUTDIR)


def f1_score_w(y_true, y_pred, **kwargs):
    """
    Score utilisé par Rakuten
    """
    return round(f1_score(y_true, y_pred, average='weighted'),4)
    
# A virer

spacynlp = None
spacyre = re.compile(r"(<.*?>|&#\d+;|\'|\:|\.|\-|\+)")
french_stop_words = set(stopwords.words('french'))
def tokenize_spacy(sentence):
    """
    Tokenizer basé sur spacy
    """
    global spacynlp
    if spacynlp is None:
        spacynlp = spacy.load("fr_core_news_sm")
        spacynlp.disable_pipes ('tagger', 'parser', 'ner')
    s = sentence
    t = [x.lemma_ for x in spacynlp(s)]
    #t = [x.text for x in spacynlp(s) if not x.text in french_stop_words]
    return t

def get_text(df):
    lst = []
    for desc, desi in zip(df.designation, df.description):
        desistr = desi if type(desi) == str else ''
        descstr = desc if type(desc) == str else ''
        sep = ' ' if type(desc) == str and type(desi) == str else ''
        s = desistr + sep + descstr
        s = s.lower() + ' ' + re.sub(r"([^A-Z0-9°\+\*\=]+)","",s).lower()
        lst.append(s)
    return lst

def get_clean_df(test_data=False):
    """
    Lecture d'un fichier de données, avec petit nettoyage et sauvegarde
    Utilisation de la sauvegarde si elle existe

    """
    if test_data == True:
        xfile = "X_test_update.csv"
        imdir = os.path.join("images", "image_test")
        savef = os.path.join(OUTDIR, "clean_test_df.pkl")
    else:
        xfile = "X_train_update.csv"
        imdir = os.path.join("images", "image_train")
        savef = os.path.join(OUTDIR, "clean_train_df.pkl")
    #
    if os.path.isfile(savef):
        print(f"Lecture de {savef}")
        return pd.read_pickle(savef)
    #
    print(f"Construction de {savef}")
    df = pd.read_csv(xfile).drop("Unnamed: 0", axis=1)
    df['imgpath'] = df.apply(lambda x: os.path.join(imdir,
                                                   "image_%d_product_%d.jpg" % (x.imageid,
                                                                                x.productid)),
                                                   axis=1)
    df.designation = df.designation.astype('string')
    df.description = df.description.astype('string')
    if not test_data:
        y = pd.read_csv("Y_train_CVw08PX.csv").drop("Unnamed: 0", axis=1)
        df['class'] = y.astype(str) # Needed by generator
    df.to_pickle(savef)
    return df


In [2]:
df = get_clean_df()[:40000]
X_train, X_test, y_train, y_test = train_test_split(get_text(df), df['class'],
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=51)
X_train, y_train = shuffle(X_train, y_train, random_state=52)


Lecture de out\clean_train_df.pkl


In [3]:

y_i2l = list(df['class'].value_counts().index)
y_l2i = { y_i2l[i]:i for i in range(len(y_i2l))}

y_train_i = np.array(y_train.apply(lambda x: y_l2i[x]))
y_test_i = np.array(y_test.apply(lambda x: y_l2i[x]))


print(f"TfidfVectorizer +  SelectFromModel(LinearSVC) ...")
start = time()
tfidf = TfidfVectorizer(analyzer='word',
                  #strip_accents='ascii',
                  stop_words=french_stop_words,
                  max_df=0.8,
                  min_df=1,
                  ngram_range=(1,1),
                  use_idf=False,
                  smooth_idf=False,
                  sublinear_tf=False,
                  binary=False,
                  #max_features=10000,
                  )
selector = SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                     tol=1e-4, max_iter=5000))
#selector = SelectFromModel(LinearSVC(penalty="l2", dual=True, C=0.8,
#                                     tol=1e-5, max_iter=4000))
X_train_tfidf = selector.fit_transform(tfidf.fit_transform(X_train), y_train).toarray()
X_test_tfidf = selector.transform(tfidf.transform(X_test)).toarray()
print(f"Shape (X_train_tfidf) = {X_train_tfidf.shape} ({int(time()-start)} sec)")



TfidfVectorizer +  SelectFromModel(LinearSVC) ...
Shape (X_train_tfidf) = (32000, 10927) (18 sec)


In [4]:
    
print(f"Embedding  ...")
start = time()

#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
##embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-xling/en-fr/1")
embed = hub.load("tfhub/universal-sentence-encoder-multilingual-large-3")


X_train_embed = np.array([embed(x) for x in X_train]).reshape(len(X_train),-1)
X_test_embed = np.array([embed(x) for x in X_test]).reshape(len(X_test),-1)
print(f"Shape (X_train_embed) = {X_train_embed.shape} ({int(time()-start)} sec)")



Embedding  ...
Shape (X_train_embed) = (32000, 512) (4004 sec)


In [5]:
print(f"Concatenate + Standardize ...")
start = time()
X_train_concat = np.concatenate([X_train_embed, X_train_tfidf], axis = 1)
X_test_concat = np.concatenate([X_test_embed, X_test_tfidf], axis = 1)

std = StandardScaler()
X_train_concat = std.fit_transform(X_train_concat)
X_test_concat = std.transform(X_test_concat)
print(f"Shape (X_train_concat) = {X_train_concat.shape} ({int(time()-start)} sec)")

print(f"Sequential NN ...")

clf_rn = tf.keras.Sequential([
                tf.keras.layers.Dense(128, activation='relu', input_shape=X_train_concat.shape[1:]),
                tf.keras.layers.Dropout(0.3),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dense(27, activation='softmax')
])

clf_rn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
               loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                  patience=2, verbose=1,
                                                  restore_best_weights=True)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=2)

history = clf_rn.fit(x=X_train_concat, y=y_train_i, batch_size=32,
                     validation_split=0.2,
                     callbacks = [early_stopping, reduce_lr],
                     epochs=15)
y_test_pred_raw = clf_rn.predict(X_test_concat)
y_pred_i = np.argmax(y_test_pred_raw, axis=1)
print(f"Score rnn = {f1_score_w(y_test_i, y_pred_i)}")

print(f"Comparaisons de modèles machine learning...")

clfs = {#"RandomForestClassifier": RandomForestClassifier(),
        #"KNeighborsClassifier": KNeighborsClassifier(),
        #"LinearSVC": LinearSVC(),
        "LightGBM" : LGBMClassifier(n_estimators=150, learning_rate=0.07,
                     boosting_type='gbdt', class_weight='balanced'),
        "LinearSVC_custom": LinearSVC(penalty="l2", dual=True, C=0.8, tol=1e-5, max_iter=4000),
       }
for clfname in clfs:
    clf = clfs[clfname]
    t0 = time()
    clf.fit(X_train_concat, y_train)
    tfit = int(time() - t0)
    y_pred = clf.predict(X_test_concat)
    score = f1_score_w(y_test, y_pred)
    print(f"{clfname} w-f1-score = {score} ({tfit} sec)")

# 20000
#   LightGBM w-f1-score = 0.7904 (507 sec)  

Concatenate + Standardize ...
Shape (X_train_concat) = (32000, 11439) (19 sec)
Sequential NN ...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Restoring model weights from the end of the best epoch.
Epoch 00007: early stopping
Score rnn = 0.7903
Comparaisons de modèles machine learning...
LightGBM w-f1-score = 0.8176 (687 sec)




LinearSVC_custom w-f1-score = 0.7163 (31263 sec)
