In [1]:
import pandas as pd
import re

In [2]:
x_path = "X_train_update.csv"
y_path = "Y_train_CVw08PX.csv"


x_df = pd.read_csv(x_path)
y_df = pd.read_csv(y_path)

In [3]:
print(x_df.shape)
x_df.head(5)

(84916, 5)


Unnamed: 0.1,Unnamed: 0,designation,description,productid,imageid
0,0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786


In [4]:
x_df.columns

Index(['Unnamed: 0', 'designation', 'description', 'productid', 'imageid'], dtype='object')

In [5]:
# drop useless columns
x_df.drop(columns=['Unnamed: 0', 'description', 'productid', 'imageid'], axis=0, inplace=True)
x_df.head(5)

Unnamed: 0,designation
0,Olivia: Personalisiertes Notizbuch / 150 Seite...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...
4,La Guerre Des Tuques


### Tokenization
At first we clean the accent using the function normalize accent

In [19]:
def normalize_accent(string):
    string = string.replace('á', 'a')
    string = string.replace('â', 'a')

    string = string.replace('é', 'e')
    string = string.replace('è', 'e')
    string = string.replace('ê', 'e')
    string = string.replace('ë', 'e')

    string = string.replace('î', 'i')
    string = string.replace('ï', 'i')

    string = string.replace('ö', 'o')
    string = string.replace('ô', 'o')
    string = string.replace('ò', 'o')
    string = string.replace('ó', 'o')

    string = string.replace('ù', 'u')
    string = string.replace('û', 'u')
    string = string.replace('ü', 'u')

    string = string.replace('ç', 'c')
    string = re.sub(r'°|#|/|-|%|_|.|[0-9]+', '', string)
    
    return string

In [20]:
normalize_accent('journal arts n° 133 28/09/2001 art marche salo...')

'journal arts n   art marche salo...'

In [7]:
import spacy

# Load spaCy for french
spacy_nlp = spacy.load("fr")

In [8]:
def raw_to_tokens(raw_string, spacy_nlp):
    # Write code for lower-casing
    string = raw_string.lower().rstrip()
    
    # Write code to normalize the accents
    string = normalize_accent(string)
        
    # Write code to tokenize
    spacy_tokens = spacy_nlp(string)
        
    # Write code to remove punctuation tokens and create string tokens
    string_tokens = [token.orth_ for token in spacy_tokens if not token.is_punct if not token.is_stop]
    
    # Write code to join the tokens back into a single string
    clean_string = " ".join(string_tokens)
    
    return clean_string

In [21]:
x_df['tokens'] = x_df['designation'].apply(lambda s : raw_to_tokens(s,spacy_nlp))

In [22]:
x_df.head()

Unnamed: 0,designation,tokens
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,olivia personalisiertes notizbuch seiten ...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,journal arts n art marche salon art asiat...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,grand stylet ergonomique bleu gamepad nintendo...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,peluche donald europe disneyland marionn...
4,La Guerre Des Tuques,guerre tuques


In [23]:
x_df.to_csv('./tokens.csv')

In [19]:
x_df = pd.read_csv('tokens.csv')
y_df = pd.read_csv('Y_train_CVw08PX.csv')
x_df.tail()

Unnamed: 0.1,Unnamed: 0,designation,tokens
84911,84911,The Sims [ Import Anglais ],the sims import anglais
84912,84912,Kit piscine acier NEVADA déco pierre Ø 3.50m x...,kit piscine acier nevada deco pierre ø .m x .m
84913,84913,Journal Officiel De La Republique Francaise N°...,journal officiel republique francaise n c...
84914,84914,Table Basse Bois De Récupération Massif Base B...,table basse bois recuperation massif base blan...
84915,84915,Gomme De Collection 2 Gommes Pinguin Glace Ver...,gomme collection gommes pinguin glace vert o...


In [26]:
y_df.prdtypecode.value_counts()

2583    10209
1560     5073
1300     5045
2060     4993
2522     4989
1280     4870
2403     4774
2280     4760
1920     4303
1160     3953
1320     3241
10       3116
2705     2761
1140     2671
2582     2589
40       2508
2585     2496
1302     2491
1281     2070
50       1681
2462     1421
2905      872
60        832
2220      824
1301      807
1940      803
1180      764
Name: prdtypecode, dtype: int64

In [33]:
# drop na
x_df, y_df = x_df[ ~ x_df.tokens.isna()], y_df[~ x_df.tokens.isna()]
print(x_df.shape, y_df.shape)

(84905, 3) (84905, 2)


In [34]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(x_df['tokens'])

In [35]:

print("Shape of the TF-IDF Matrix:")
print(X.shape)
print(tfidf.get_feature_names()[5000:5050])

Shape of the TF-IDF Matrix:
(84905, 63795)
['barenreiter', 'baretto', 'barf', 'bargantua', 'barge', 'barges', 'bari', 'baril', 'barilla', 'barille', 'bariolee', 'barista', 'barjavel', 'barjot', 'barker', 'barking', 'barkley', 'barletta', 'barley', 'barlow', 'barma', 'barmherzigkeit', 'barn', 'barna', 'barnabas', 'barnabedauvister', 'barnaby', 'barnathrum', 'barnes', 'barney', 'barnowsky', 'barnum', 'barnyard', 'barocco', 'barocklutherischer', 'baroin', 'barometre', 'baron', 'baronies', 'baronne', 'baronnie', 'barons', 'baroque', 'barotrauma', 'baroudeurs', 'barque', 'barquette', 'barr', 'barra', 'barrabas']


In [37]:
from sklearn.ensemble import AdaBoostClassifier


model = AdaBoostClassifier()

In [39]:
res = model.fit(X, y_df.prdtypecode)

In [41]:

model.score(X, y_df.prdtypecode)

0.16597373535127496