# Text Treatment

L'objectif de ce notebook est de génerer plusieurs X_train avec des traitement de texte differents. Ces textes vont être utilisés afin de savoir quel est le meilleur traitement.

In [171]:
import pandas as pd
from datetime import datetime
import unidecode
import re   

#Création fonctions de netoyage de texte
import spacy #https://spacy.io/usage  ----> POUR INSTALLER
#https://www.stat4decision.com/fr/traitement-langage-naturel-francais-tal-nlp/

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize


In [83]:
#importer les données texte

pd.set_option('display.max_rows', 500)
df_X = pd.read_csv(r'C:\Users\Edgar\Documents\Rakuten\X_train_update.csv',index_col =0)
df_X.head(20)

Unnamed: 0,designation,description,productid,imageid
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786
5,Afrique Contemporaine N° 212 Hiver 2004 - Doss...,,5862738,393356830
6,Christof E: Bildungsprozessen Auf Der Spur,,91920807,907794536
7,Conquérant Sept Cahier Couverture Polypro 240 ...,CONQUERANT CLASSIQUE Cahier 240 x 320 mm seyès...,344240059,999581347
8,Puzzle Scooby-Doo Avec Poster 2x35 Pieces,,4239126071,1325918866
9,Tente Pliante V3s5-Pro Pvc Blanc - 3 X 4m50 - ...,Tente pliante V3S5 Pro PVC 500 gr/m² - 3 x 4m5...,3793572222,1245644185


On va mettre designation et description dans une même string:

In [76]:
#Merger les colones texte en une seule
X = df_X.designation.astype(str) + ' ' + df_X.description.astype(str)

#Numero de lignes du texte
size_X = X.size
print('Num lignes totale:', size_X)

X.head(20)

Num lignes totale: 84916


0     Olivia: Personalisiertes Notizbuch / 150 Seite...
1     Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2     Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3     Peluche Donald - Europe - Disneyland 2000 (Mar...
4     La Guerre Des Tuques Luc a des id&eacute;es de...
5     Afrique Contemporaine N° 212 Hiver 2004 - Doss...
6        Christof E: Bildungsprozessen Auf Der Spur nan
7     Conquérant Sept Cahier Couverture Polypro 240 ...
8         Puzzle Scooby-Doo Avec Poster 2x35 Pieces nan
9     Tente Pliante V3s5-Pro Pvc Blanc - 3 X 4m50 - ...
10    Eames Inspired Sxw Chair - Pink - Black The ti...
11    Fauteuil Chesterfield Brenton 100% Cuir De Buf...
12    Peaceable Kingdom Wheres Bear? The Hide And Fi...
13    Paire De Voilages Imprimés Fantaisie Paire de ...
14    Matelas Mémoire De Forme 180x200 X 20 Cm Très ...
15    Zenith Pince Agrafeuse 591 N°10 Coloris Noir P...
16    Walter Scott Oeuvres Complètes Tomes 3456 10 E...
17              Mod Podge Dishwasher Safe Gloss 

Definition des differentes fonctions à utiliser pour le traitement de texte:

In [253]:
def tokeniser_spacy(sentence,language): #Tokenisation par phrases
    
    langue = {'french':'fr_core_news_sm',
          'german':'de_core_news_sm',
          'english':'en_core_web_sm'}
    
    #nlp = spacy.load(langue[language])
    # Tokeniser la phrase
    tokens = nlp(sentence)
    # Retourner le texte coupé par mots(token)
    return [token for token in tokens]

def tokeniser_nltk(sentence,language): #Tokenisation par phrases
    
    tokens = word_tokenize(text, language=language)

    # Retourner le texte coupé par mots(token)
    return [token for token in tokens]

def language_(language):
    langue = {'fr':'french',
             'de':'german',
             'en':'english'}
    return langue[language]

def stop_words(sentence,language): #Enleve stop words per phrase
    clean_words = ''
    stop_words =stopwords.words(language)
    tokens = tokeniser_spacy(sentence,language)
    for token in tokens:
        if token.text not in stop_words:
            clean_words += token.text + ' '
    return clean_words

def number_unique_words(text_series): #count unique words in Series
    words = []
    for sentence in text_series.str.split():
        for word in sentence:
            words.append(word)
            
    return len(list(set(words)))

def remove_accents(sentece,language): #removes accents
    #language is not used in this function, but needs to be added on the definition as the rest
    try:
        text = unidecode.unidecode(sentece)
    except:
        pass
    return text

def steem(sentence,language): #Steeming
    steemed = ''
    tokens = tokeniser_spacy(sentence,language)
    stemmer = SnowballStemmer(language=language)
    
    for token in tokens:
        steemed += stemmer.stem(token.text) +' '
    return steemed

def lemma(sentence,language): #Lemmatizer
    lemma = ''
    tokens = tokeniser_spacy(sentence,language)
    
    for token in tokens:
        lemma += token.lemma_ + ' '
    return lemma

def no_num(sentence,language): #delete numbers
    #language is not used in this function, but needs to be added on the definition as the rest
    return ''.join([word for word in sentence if not word.isdigit()])

def no_special(sentence,language): #delete special characters
    #language is not used in this function, but needs to be added on the definition as the rest
    return re.sub(r"[^a-zA-Z0-9]+", ' ', sentence)

Fonction qui va génerer les differents combinations de traitement de texte:

In [257]:
# Ce code divise le dataframe en batchs afin de voir l'avancement du traitement à chaque fin de batch

def treat_text(text_series,text_treat_funct,language,batch,old_treatments=''):
    
    treatments = '_' + text_treat_funct.__name__ + '-' + language.upper()
    language = language_(language)
    
    langue = {'french':'fr_core_news_sm',
              'german':'de_core_news_sm',
              'english':'en_core_web_sm'}
    
    global nlp
    nlp = spacy.load(langue[language])
    
    print('************************************************')
    print('')
    print(text_treat_funct.__name__.upper(),language.upper())
    print('')
    print("Treatment Start Time =", datetime.now().strftime("%H:%M:%S"))
    print("Number of unique words at start =", number_unique_words(text_series))
    #print('Type of function:', text_treat_funct.__name__)
    #print('Treatment language choosen:', language)
    print('Number of batch:', batch)
    
    size = text_series.size
    treated_text = pd.Series()
    text = text_series.str.lower()
    
    #treat batches
    for i in range(batch):
        treated_batch = text[int(size/batch) * i:int(size/batch) * (i + 1)].apply(lambda cell: text_treat_funct(cell,language))
        treated_text = pd.concat([treated_text,treated_batch])
        
        treated_text.to_csv(r'C:\Users\Edgar\Documents\Rakuten\batches\X_train' + old_treatments + treatments + '_batch-' + str(i) + '.csv') 
        
        print('Treatment Batch', i, 'from', batch ,'; Time =', datetime.now().strftime("%H:%M:%S"))
    
    #tret last batch
    treated_batch = text[int(size/batch) * (i + 1):size].apply(lambda cell: text_treat_funct(cell,language))
    treated_text = pd.concat([treated_text,treated_batch])
    
    treated_text.to_csv('X_train' + old_treatments + treatments + '.csv') 
    
    print("Number of unique words at end =", number_unique_words(treated_text))
    print('')
    
    return treated_text, old_treatments + treatments

#X_test = X[:23]
#treat_text(X_test,no_special,'french',7)

Cette suite de lignes génere les differentes combinations de textes traités:

In [258]:
batch = 5

X_treated, treatments = treat_text(X,lemma,'fr',batch)
X_treated, treatments = treat_text(X_treated,stop_words,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,no_num,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,remove_accents,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,no_special,'fr',batch,treatments)

X_treated, treatments = treat_text(X_treated,lemma,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,stop_words,'en',batch,treatments)

X_treated, treatments = treat_text(X_treated,stop_words,'de',batch,treatments)
X_treated, treatments = treat_text(X_treated,lemma,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,steem,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,steem,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,steem,'de',batch,treatments)

************************************************

LEMMA FRENCH

Treatment Start Time = 18:03:19
Number of unique words at start = 407222
Number of batch: 5


  treated_text = pd.Series()


Treatment Batch 0 from 5 ; Time = 18:10:11
Treatment Batch 1 from 5 ; Time = 18:16:23
Treatment Batch 2 from 5 ; Time = 18:22:26
Treatment Batch 3 from 5 ; Time = 18:28:25
Treatment Batch 4 from 5 ; Time = 18:34:24
Number of unique words at end = 250363

************************************************

STOP_WORDS FRENCH

Treatment Start Time = 18:34:29
Number of unique words at start = 250363
Number of batch: 5
Treatment Batch 0 from 5 ; Time = 18:41:33
Treatment Batch 1 from 5 ; Time = 18:47:56
Treatment Batch 2 from 5 ; Time = 18:54:05
Treatment Batch 3 from 5 ; Time = 19:00:13
Treatment Batch 4 from 5 ; Time = 19:06:18
Number of unique words at end = 248672

************************************************

NO_NUM FRENCH

Treatment Start Time = 19:06:22
Number of unique words at start = 248672
Number of batch: 5
Treatment Batch 0 from 5 ; Time = 19:06:26
Treatment Batch 1 from 5 ; Time = 19:06:27
Treatment Batch 2 from 5 ; Time = 19:06:29
Treatment Batch 3 from 5 ; Time = 19:06:30


Encore d'autres combinations:

In [259]:
batch = 5

X_treated, treatments = treat_text(X,steem,'fr',batch)
X_treated, treatments = treat_text(X_treated,steem,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,steem,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,stop_words,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,stop_words,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,stop_words,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,no_num,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,remove_accents,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,no_special,'fr',batch,treatments)

X_treated, treatments = treat_text(X_treated,lemma,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,lemma,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,lemma,'de',batch,treatments)


************************************************

STEEM FRENCH

Treatment Start Time = 21:39:00
Number of unique words at start = 407222
Number of batch: 5


  treated_text = pd.Series()


Treatment Batch 0 from 5 ; Time = 21:46:49
Treatment Batch 1 from 5 ; Time = 21:53:57
Treatment Batch 2 from 5 ; Time = 22:00:42
Treatment Batch 3 from 5 ; Time = 22:07:22
Treatment Batch 4 from 5 ; Time = 22:14:04
Number of unique words at end = 222055

************************************************

STEEM ENGLISH

Treatment Start Time = 22:14:08
Number of unique words at start = 222055
Number of batch: 5
Treatment Batch 0 from 5 ; Time = 22:20:04
Treatment Batch 1 from 5 ; Time = 22:25:57
Treatment Batch 2 from 5 ; Time = 22:31:43
Treatment Batch 3 from 5 ; Time = 22:37:32
Treatment Batch 4 from 5 ; Time = 22:43:20
Number of unique words at end = 211088

************************************************

STEEM GERMAN

Treatment Start Time = 22:43:25
Number of unique words at start = 211088
Number of batch: 5
Treatment Batch 0 from 5 ; Time = 22:49:09
Treatment Batch 1 from 5 ; Time = 22:54:49
Treatment Batch 2 from 5 ; Time = 23:00:24
Treatment Batch 3 from 5 ; Time = 23:06:05
Treat

Et encore:

In [260]:
batch = 5

X_treated, treatments = treat_text(X,steem,'fr',batch)
X_treated, treatments = treat_text(X_treated,steem,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,steem,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,lemma,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,lemma,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,lemma,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,stop_words,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,stop_words,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,stop_words,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,no_num,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,remove_accents,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,no_special,'fr',batch,treatments)


************************************************

STEEM FRENCH

Treatment Start Time = 01:41:38
Number of unique words at start = 407222
Number of batch: 5


  treated_text = pd.Series()


Treatment Batch 0 from 5 ; Time = 01:50:07
Treatment Batch 1 from 5 ; Time = 01:57:17
Treatment Batch 2 from 5 ; Time = 02:04:07
Treatment Batch 3 from 5 ; Time = 02:10:56
Treatment Batch 4 from 5 ; Time = 02:17:42
Number of unique words at end = 222055

************************************************

STEEM ENGLISH

Treatment Start Time = 02:17:47
Number of unique words at start = 222055
Number of batch: 5
Treatment Batch 0 from 5 ; Time = 02:23:48
Treatment Batch 1 from 5 ; Time = 02:29:44
Treatment Batch 2 from 5 ; Time = 02:35:35
Treatment Batch 3 from 5 ; Time = 02:41:27
Treatment Batch 4 from 5 ; Time = 02:47:17
Number of unique words at end = 211088

************************************************

STEEM GERMAN

Treatment Start Time = 02:47:22
Number of unique words at start = 211088
Number of batch: 5
Treatment Batch 0 from 5 ; Time = 02:53:12
Treatment Batch 1 from 5 ; Time = 02:59:01
Treatment Batch 2 from 5 ; Time = 03:04:42
Treatment Batch 3 from 5 ; Time = 03:10:25
Treat

...Et encore

In [262]:
batch = 5

X_treated, treatments = treat_text(X,lemma,'fr',batch)
X_treated, treatments = treat_text(X_treated,lemma,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,lemma,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,steem,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,steem,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,steem,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,stop_words,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,stop_words,'en',batch,treatments)
X_treated, treatments = treat_text(X_treated,stop_words,'de',batch,treatments)

X_treated, treatments = treat_text(X_treated,no_num,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,remove_accents,'fr',batch,treatments)
X_treated, treatments = treat_text(X_treated,no_special,'fr',batch,treatments)

************************************************

LEMMA FRENCH

Treatment Start Time = 07:30:56
Number of unique words at start = 407222
Number of batch: 5


  treated_text = pd.Series()


Treatment Batch 0 from 5 ; Time = 07:38:09
Treatment Batch 1 from 5 ; Time = 07:44:33
Treatment Batch 2 from 5 ; Time = 07:50:41
Treatment Batch 3 from 5 ; Time = 07:56:41
Treatment Batch 4 from 5 ; Time = 08:02:41
Number of unique words at end = 250363

************************************************

LEMMA ENGLISH

Treatment Start Time = 08:02:46
Number of unique words at start = 250363
Number of batch: 5
Treatment Batch 0 from 5 ; Time = 08:08:24
Treatment Batch 1 from 5 ; Time = 08:13:53
Treatment Batch 2 from 5 ; Time = 08:19:17
Treatment Batch 3 from 5 ; Time = 08:24:42
Treatment Batch 4 from 5 ; Time = 08:30:10
Number of unique words at end = 242904

************************************************

LEMMA GERMAN

Treatment Start Time = 08:30:15
Number of unique words at start = 242904
Number of batch: 5
Treatment Batch 0 from 5 ; Time = 08:35:44
Treatment Batch 1 from 5 ; Time = 08:41:11
Treatment Batch 2 from 5 ; Time = 08:46:31
Treatment Batch 3 from 5 ; Time = 08:51:53
Treat