# Importing libraries

### Importing all libraries

In [27]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import cv2

# sns.set()

from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer

from langid.langid import LanguageIdentifier, model
from langid.langid import set_languages
import time

from nltk.stem import WordNetLemmatizer

# Defining functions

### Feature engineering: title_descr
[ ] does the function need to have a return df?

In [20]:
def concatenate_variables(df, col1, col2, nans_to, separator, concat_col_name, to_drop = None):
    '''
    Replace NaNs in col1 and col2 with string nans_to
    Concatenate col1 and col2 using a separator string separator
    Drop columns if specified by to drop (list of columns)
    save in a new variable named by concat_col_name
    '''
    
    ## Replace NaN's in description with empty string
    df[col1] = df[col1].fillna(nans_to)
    df[col2] = df[col2].fillna(nans_to)

    ## Concatenate col1 with col2
    df[concat_col_name] = df[col1] + separator + df[col2]
    
    ## drop columns
    if to_drop is not None:
        for col in to_drop:
            df.drop(col, axis = 1, inplace = True)


### HTML parsing 

In [21]:
def html_parsing(df, col_to_parse, verbose = False):
    '''
    HTML parse and lower case text content in col_to_parse
    '''
    t0 = time.time()

    df[col_to_parse] = [BeautifulSoup(text).get_text().lower() for text in df.loc[:,col_to_parse]]

    t1 = time.time()
    
    if verbose: 
        print(f"Column '{col_to_parse}' has been successfully HTML parsed")
        print("HTML parsing takes %0.2f seconds" %(t1-t0))

### Feature engineering: Tokenization

In [22]:

def get_lemmatized_tokens(df, col_to_tokenize, tokenizer, tokenized_col, lemmatizer, uniques = False, verbose = True):
    '''
    For each row creates a list of tokens obtained from 'col_to_tokenize' column by tokenizing the text.
    Then lemmatize each word in the list, for each row.
    If unique = True, remove duplicated from each list of lemmas using set(). Keep the order of the words in list.
    Store list of lemmas in a new variable 'tokenized_col'
    '''    
    
    t0 = time.time()
    
    all_token_list = [tokenizer.tokenize(text) for text in df.loc[:,col_to_tokenize]]
    all_lemmatized_list = [ [lemmatizer.lemmatize(t) for t in token_list] for token_list in all_token_list ]

    if uniques :    
        df[tokenized_col] = [sorted( set(lemma_list), key=lemma_list.index ) for lemma_list in all_lemmatized_list ]
    else:    
        #df[tokenized_col] = [tokenizer.tokenize(text) for text in df.loc[:,col_to_tokenize]]
        df[tokenized_col] = all_lemmatized_list

    t1 = time.time()
    
    if verbose:
        print(f"Column '{col_to_tokenize}' has been successfully tokenized")
        print("Tokenization + Lemmatization takes %0.2f seconds" %(t1-t0))
    

### Get Language

In [73]:
def get_language(df, text_col, correct = False, verbose = False):
    
    ## Main language identfication
    ## instantiate identifier to get probabilities
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    
    ## identify language
    time_0 = time.time()

    languages_probas = [identifier.classify(text) for text in df[text_col]]
    
    time_1 = time.time()
        
    if correct:
        ## restricted identifier
        identifier_2 = LanguageIdentifier.from_modelstring(model, norm_probs=True)
        identifier_2.set_languages(langs=['fr','en'])
        
        for i in range(len(languages_probas)):    
            if languages_probas[i][1] < 0.9999:
                languages_probas[i] = identifier_2.classify(df.loc[i,'title_descr'])
                
    time_2 = time.time()
    
    if verbose:
        print("Main language detection takes %0.2f minutes" %((time_1 - time_0)/60) )
        if correct:
            print("Language detection correction takes %0.2f seconds" %(time_2 - time_1) )

    
    language_dict = {'language' : list(np.array(languages_probas)[:,0]),
            'lan_prob' : [float(p) for p in np.array(languages_probas)[:,1]]}
    
    df_languages = pd.DataFrame(language_dict)
    
    return df_languages
    

In [52]:
df2 = df.loc[:10,:]
df2.head()

Unnamed: 0,prdtypecode,title,description,productid,imageid,title_descr,tokens
0,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,olivia: personalisiertes notizbuch / 150 seite...,"[olivia, personalisiertes, notizbuch, 150, sei..."
1,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,journal des arts (le) n° 133 du 28/09/2001 - l...,"[journal, de, art, le, n, 133, du, 28, 09, 200..."
2,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,grand stylet ergonomique bleu gamepad nintendo...,"[grand, stylet, ergonomique, bleu, gamepad, ni..."
3,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,peluche donald - europe - disneyland 2000 (mar...,"[peluche, donald, europe, disneyland, 2000, ma..."
4,2705,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,la guerre des tuques\nluc a des idées de grand...,"[la, guerre, de, tuques, luc, a, idées, grande..."


In [53]:
df_languages = get_language(df2, 'title_descr', correct = True, verbose = True)

Main language detection takes 0.00 minutes
Language detection correction takes 1.52 seconds


In [56]:
df_languages.loc[0,'lan_prob']

0.9999999879664759

In [24]:

def detect_language(df, text_col, correct = False, verbose = False):
    
    ## Main language identfication
    ## instantiate identifier to get probabilities
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    
    ## identify language
    time_0 = time.time()

    df[['language', 'lang_prob']] = [identifier.classify(text) for text in df[text_col]]

    
    time_1 = time.time()
        
    if correct:
        ## restricted identifier
        identifier_2 = LanguageIdentifier.from_modelstring(model, norm_probs=True)
        identifier_2.set_languages(langs=['fr','en'])
        
        for i in range(len(df)):    
            if df.loc[i,'lang_prob'] < 0.9999:
                df.loc[i,['language', 'lang_prob']] = identifier_2.classify(df.loc[i,'title_descr'])
                
    time_2 = time.time()
    
    if verbose:
        print("Main language detection takes %0.2f minutes" %((time_1 - time_0)/60) )
        if correct:
            print("Language detection correction takes %0.2f seconds" %(time_2 - time_1) )

    

### Remove stop words from tokens
[ ] before or after the token counts ???  

In function **import_stop_words**:  
[ ] Inlude other languages (if included in spacy)  

In function **remove_stop_words**:
* Option-1: Using list comprehension takes 76 seconds for the whole dataframe. DO NOT USE
* Option-2: Using a for loop to iterate the dataframe takes 39 seconds for the whole dataframe. 

[ ] Should we use set(tokens) at the end ?

In [8]:
def import_stop_words(language):
    '''
    Import list of stop words from the indicated language.
    If language is not in the list of the top 4 languages, use fr + en by default.
    '''
    
    # top 4 languages in dataset
    available = ['fr', 'en', 'de', 'it']
    
    if language == 'fr':
        from spacy.lang.fr.stop_words import STOP_WORDS as stop_fr
        return list(stop_fr)
        
    elif language == 'en':
        from spacy.lang.en.stop_words import STOP_WORDS as stop_en
        return list(stop_en)

    elif language == 'de':
        from spacy.lang.de.stop_words import STOP_WORDS as stop_de
        return list(stop_de)

    elif language == 'it':
        from spacy.lang.it.stop_words import STOP_WORDS as stop_it
        return list(stop_it)

    else:
        from spacy.lang.fr.stop_words import STOP_WORDS as stop_fr
        from spacy.lang.en.stop_words import STOP_WORDS as stop_en
        return list(stop_fr) + list(stop_en)
    

In [9]:
def remove_stop_words(df, col_to_clean, col_result, verbose = False):
    '''
    Remove the stop words from each token list in df[col_to_clean] according to the datected language df['language']
    Store the cleaned token list in a new variable df[col_result]
    If col_result result does not exist in dataframe, intialize with empty strings (object dtype)
    '''
    ## Option - 1: list comprehension 
#     t0 = time.time()
#     new_name = col_result + '_O1'
#     df[new_name] = [ [token for token in token_list if token not in import_stop_words(language)] for token_list, language in zip(df.tokens,df.language)]
#     t1 = time.time()


    ## Option - 2: iterate over the dataframe
    t2 = time.time()
    
    new_name = col_result #+ '_O2'
    if new_name not in df.columns:
        df[new_name] = ''  # initilize col as 'object' type
    
    for i, token_list, language in zip(df.index, df.tokens, df.language):
    
        stop_words = import_stop_words(language)        
        df.at[i, new_name] = [token for token in token_list if token not in stop_words]
    
    t3 = time.time()
        
    if verbose:
        #print("time taken with comprehension list %0.f seconds" %(t1-t0))
        print("time taken with for loop list %0.f seconds" %(t3-t2))

### Get token length

In [87]:
def get_token_length(df, col_with_tokens, col_with_length, verbose = False):
    '''
    Creates a new variable measuring the number of tokens in column col_with_tokens
    '''
    t0 = time.time()
    
    df[col_with_length] = [len(token_list) for token_list in df[col_with_tokens] ]
    
    t1 = time.time()
    
    if verbose:
        print("token count takes %0.2f seconds" %(t1-t0))

# Data Processing

### importing all data
[ ] Do we merge df_X or df_y or not ?  
[ ] Drop column 'target' in product_class if problematic.  

In [59]:
## features and target
df_X = pd.read_csv('./datasets/X_train_update.csv', index_col = 0)
df_y = pd.read_csv('./datasets/Y_train_CVw08PX.csv', index_col = 0)

## merge feature and target ? ---> it maybe better for preprocessing, then separate for model training
df = pd.concat([df_y,df_X], axis = 1)

## class labels
product_class = pd.read_csv('./datasets/product_class.csv', sep = ';')


### Rename variables

In [60]:
## Rename th# Importing data and librariese variable 'designation' by 'title' which is more convenient
df.rename({'designation':'title'}, axis = 1, inplace = True)
df.head()

Unnamed: 0,prdtypecode,title,description,productid,imageid
0,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,2705,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786


### Feature engineering: title_descr

In [61]:
concatenate_variables(df, 'title', 'description', nans_to = '', separator ='\n', concat_col_name = 'title_descr', \
                      to_drop = None)
df.head()

Unnamed: 0,prdtypecode,title,description,productid,imageid,title_descr
0,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,Olivia: Personalisiertes Notizbuch / 150 Seite...
1,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,Peluche Donald - Europe - Disneyland 2000 (Mar...
4,2705,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,La Guerre Des Tuques\nLuc a des id&eacute;es d...


### HTML parse & lower case

In [62]:
html_parsing(df, 'title_descr', verbose = True)

  df[col_to_parse] = [BeautifulSoup(text).get_text().lower() for text in df.loc[:,col_to_parse]]


Column 'title_descr' has been successfully HTML parsed
HTML parsing takes 11.52 seconds


In [63]:
df.head()

Unnamed: 0,prdtypecode,title,description,productid,imageid,title_descr
0,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,olivia: personalisiertes notizbuch / 150 seite...
1,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,journal des arts (le) n° 133 du 28/09/2001 - l...
2,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,grand stylet ergonomique bleu gamepad nintendo...
3,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,peluche donald - europe - disneyland 2000 (mar...
4,2705,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,la guerre des tuques\nluc a des idées de grand...


### Lemmatize & Tokenize text
* if uniques = True, get the unique tokens (no duplicates). Keeps order.

In [64]:
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

get_lemmatized_tokens(df, 'title_descr', tokenizer, 'tokens', lemmatizer, uniques = True)

Column 'title_descr' has been successfully tokenized
Tokenization + Lemmatization takes 39.23 seconds


In [66]:
df.head()

Unnamed: 0,prdtypecode,title,description,productid,imageid,title_descr,tokens
0,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,olivia: personalisiertes notizbuch / 150 seite...,"[olivia, personalisiertes, notizbuch, 150, sei..."
1,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,journal des arts (le) n° 133 du 28/09/2001 - l...,"[journal, de, art, le, n, 133, du, 28, 09, 200..."
2,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,grand stylet ergonomique bleu gamepad nintendo...,"[grand, stylet, ergonomique, bleu, gamepad, ni..."
3,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,peluche donald - europe - disneyland 2000 (mar...,"[peluche, donald, europe, disneyland, 2000, ma..."
4,2705,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,la guerre des tuques\nluc a des idées de grand...,"[la, guerre, de, tuques, luc, a, idées, grande..."


### Feature engineering: get language
* Language detection takes about 5 mins.  DO NOT RE DO IT IF NOT NEEDED.


In [74]:

language_df = get_language(df, 'title_descr', correct = True, verbose = True)


Main language detection takes 4.69 minutes
Language detection correction takes 3.36 seconds


In [77]:
write = False
if write:
    filename = 'df_languages_probas' + '.csv'
    language_df.to_csv(filename, header=True, index=False)

In [81]:
read = True
if read:
    df3 = pd.read_csv(filename)
    display(df3.head())

Unnamed: 0,language,lan_prob
0,de,1.0
1,fr,1.0
2,fr,1.0
3,fr,1.0
4,fr,1.0


In [82]:
## concatenate to main dataframe
df = pd.concat([df, language_df], axis = 1)
df.head()

Unnamed: 0,prdtypecode,title,description,productid,imageid,title_descr,tokens,language,lan_prob
0,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,olivia: personalisiertes notizbuch / 150 seite...,"[olivia, personalisiertes, notizbuch, 150, sei...",de,1.0
1,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,journal des arts (le) n° 133 du 28/09/2001 - l...,"[journal, de, art, le, n, 133, du, 28, 09, 200...",fr,1.0
2,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,grand stylet ergonomique bleu gamepad nintendo...,"[grand, stylet, ergonomique, bleu, gamepad, ni...",fr,1.0
3,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,peluche donald - europe - disneyland 2000 (mar...,"[peluche, donald, europe, disneyland, 2000, ma...",fr,1.0
4,2705,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,la guerre des tuques\nluc a des idées de grand...,"[la, guerre, de, tuques, luc, a, idées, grande...",fr,1.0


### Remove stop words

In [89]:
remove_stop_words(df, 'tokens', 'tokens', verbose = True)

time taken with for loop list 26 seconds


In [90]:
df.head()

Unnamed: 0,prdtypecode,title,description,productid,imageid,title_descr,tokens,language,lan_prob,tokens_st,text_token_len
0,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,olivia: personalisiertes notizbuch / 150 seite...,"[olivia, personalisiertes, notizbuch, 150, sei...",de,1.0,"[olivia, personalisiertes, notizbuch, 150, sei...",11
1,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,journal des arts (le) n° 133 du 28/09/2001 - l...,"[journal, art, n, 133, 28, 09, 2001, l, marche...",fr,1.0,"[journal, art, n, 133, 28, 09, 2001, l, marche...",34
2,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,grand stylet ergonomique bleu gamepad nintendo...,"[grand, stylet, ergonomique, bleu, gamepad, ni...",fr,1.0,"[grand, stylet, ergonomique, bleu, gamepad, ni...",79
3,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,peluche donald - europe - disneyland 2000 (mar...,"[peluche, donald, europe, disneyland, 2000, ma...",fr,1.0,"[peluche, donald, europe, disneyland, 2000, ma...",8
4,2705,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,la guerre des tuques\nluc a des idées de grand...,"[guerre, tuques, luc, idées, grandeur, veut, o...",fr,1.0,"[guerre, tuques, luc, idées, grandeur, veut, o...",30


### Feature engineering: token_length
[ ] before or after removing stop words ?  
[ ] before or after removing short tokens ?

In [91]:
get_token_length(df, 'tokens', 'text_token_len', verbose = True)
df.head()

token count takes 0.04 seconds


Unnamed: 0,prdtypecode,title,description,productid,imageid,title_descr,tokens,language,lan_prob,tokens_st,text_token_len
0,10,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,olivia: personalisiertes notizbuch / 150 seite...,"[olivia, personalisiertes, notizbuch, 150, sei...",de,1.0,"[olivia, personalisiertes, notizbuch, 150, sei...",11
1,2280,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,journal des arts (le) n° 133 du 28/09/2001 - l...,"[journal, art, n, 133, 28, 09, 2001, l, marche...",fr,1.0,"[journal, art, n, 133, 28, 09, 2001, l, marche...",25
2,50,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,grand stylet ergonomique bleu gamepad nintendo...,"[grand, stylet, ergonomique, bleu, gamepad, ni...",fr,1.0,"[grand, stylet, ergonomique, bleu, gamepad, ni...",55
3,1280,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,peluche donald - europe - disneyland 2000 (mar...,"[peluche, donald, europe, disneyland, 2000, ma...",fr,1.0,"[peluche, donald, europe, disneyland, 2000, ma...",7
4,2705,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,la guerre des tuques\nluc a des idées de grand...,"[guerre, tuques, luc, idées, grandeur, veut, o...",fr,1.0,"[guerre, tuques, luc, idées, grandeur, veut, o...",18


# Exporting preprocessed text

In [92]:
write = True
if write:
    filename = 'df_text_preprocessed' + '.csv'
    df.to_csv(filename, header=True, index=False)

### Most Common words per category 
[ ] how to include this information ? in the dataframe or a model ?

### Feature engineering: Statistical indicators per category
These are the mean, min, max, etc of text_token_len per category.

[ ] How to include them in the dataframe ???