In [36]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('german'))



In [42]:
df = pd.read_csv("data/rawdata2.csv", encoding='latin-1')

In [44]:
df.head()

Unnamed: 0,Identifier,Item_Name,Category
0,B019YGSAPW,Ravensburger Spiele 23409 - Schwarzer Peter Mitbringspiel,0300 Games & Puzzles
1,B000QGRCN6,"Henrys A01002-S01 - Yo-Yo Ersatzschnüre 6 Stück, weiß",0900 Outdoor & Sports Toys
2,B00KB45ULE,"London Teddy Bears 4X-CLQL-QFJN Sorry, es TUT Mir leid, Braun",0700 Plush
3,B07NDXZ8ZH,"KiddyMoon 90X30cm/300 Bälle ? 7Cm Bällebad Baby Spielbad Mit Bunten Bällen Rund Made In EU, Hellgrau:Perle-Blau-Baby Blau-Transparent-Silbern",0900 Outdoor & Sports Toys
4,B079QGYCS7,"Eichhorn 100003408 - Musik Set enthält Trommel, Triangel, Maraca-Ei, aus Eichenholz",1300 All Other


In [46]:
df.Item_Name.value_counts()

Folienballon im Zahlen-Design, gro&szlig;, 86,4&nbsp;cm                                                    21
Elfique Tipi INDIANERZELT DOPPELT Gepolsterter Decke und Drei Kissen                                       12
Papier Partytüten                                                                                          10
Elfique New Tipi INDIANERZELT SPIELZELT DOPPELT Gepolsterter Decke (Zelt mit Decke)                        10
Widmann - Kinderkostüm Polizistin                                                                           7
                                                                                                           ..
Bestway Crocodile Rider 1.68 m x89 cm, Schwimmtier                                                          1
FALLER 120471 - Auf- und Abfahrt komplett, Radius 2                                                         1
Simba 6315875007 - Disney Winnie The Puuh Plüsch Tigger 35 cm                                               1
Dickie Toy

In [49]:
df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('german'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    text = BAD_SYMBOLS_RE.sub('', text) 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
df['Item_Name'] = df['Item_Name'].apply(clean_text)

In [50]:
df

Unnamed: 0,Identifier,Item_Name,Category
0,B019YGSAPW,ravensburger spiele 23409 schwarzer peter mitbringspiel,0300 Games & Puzzles
1,B000QGRCN6,henrys a01002s01 yoyo ersatzschnre 6 stck wei,0900 Outdoor & Sports Toys
2,B00KB45ULE,london teddy bears 4clqlqfjn sorry tut leid braun,0700 Plush
3,B07NDXZ8ZH,kiddymoon 9030cm 300 blle 7cm bllebad baby spielbad bunten bllen rund made eu hellgrauperleblaubaby blautransparentsilbern,0900 Outdoor & Sports Toys
4,B079QGYCS7,eichhorn 100003408 musik set enthlt trommel triangel maracaei eichenholz,1300 All Other
...,...,...,...
39495,B07SBHBR4G,animagic einhorn destiny elektronisches haustier,0700 Plush
39496,B077MYDXF1,heye 29842 oktoberfest triangular 1500 teile christoph schne green,0300 Games & Puzzles
39497,B07K8SKX73,monsterjam monsterdirtstarterset 226g monsterdirt eklusivem monster jam truck mastab 164 sortierung verschiedenen designs,0400 Vehicles
39498,B019YY2E2Y,kinetic sand 6029058 sand bo set blau,0200 Arts & Crafts


In [52]:

MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Item_Name'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 69748 unique tokens.


In [53]:
X = tokenizer.texts_to_sequences(df['Item_Name'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (39500, 250)


In [54]:
Y = pd.get_dummies(df['Category']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (39500, 17)


In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(35550, 250) (35550, 17)
(3950, 250) (3950, 17)


In [58]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(17, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 17)                1717      
Total params: 5,082,117
Trainable params: 5,082,117
Non-trainable params: 0
_________________________________________________________________
None


In [59]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 31995 samples, validate on 3555 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [60]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.911
  Accuracy: 0.771


In [272]:
items = []
items.append("Audi Jamara RMS silber")
items.append("Ravensburger Machine Learning Puzzle Lukas Huber Edition")
print(items)


['Audi Jamara RMS silber', 'Ravensburger Machine Learning Puzzle Lukas Huber Edition']


In [277]:
labels =  ['0100', '0200', '0201', '0202', '0206', '0300', '0400', '0402', '0600', '0625', '0700', '0800', '0900', '1300', '1800', '1802', '1900']
for item in items: 
    seq = tokenizer.texts_to_sequences(item)
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    pri
    #pred = model.predict(padded)
    #print(pred, labels[np.argmax(pred)])
    
    

[[   0    0    0 ...    0    0  431]
 [   0    0    0 ...    0    0 1988]
 [   0    0    0 ...    0    0 1413]
 ...
 [   0    0    0 ...    0    0  667]
 [   0    0    0 ...    0    0  474]
 [   0    0    0 ...    0    0  361]]
[[   0    0    0 ...    0    0  361]
 [   0    0    0 ...    0    0  431]
 [   0    0    0 ...    0    0 1018]
 ...
 [   0    0    0 ...    0    0  415]
 [   0    0    0 ...    0    0 1971]
 [   0    0    0 ...    0    0  585]]


In [241]:
new_item = ["Jamara Audi RMS Silber"]
seq = tokenizer.texts_to_sequences(new_item)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels =  ['0100', '0200', '0201', '0202', '0206', '0300', '0400', '0402', '0600', '0625', '0700', '0800', '0900', '1300', '1800', '1802', '1900']
print(pred, labels[np.argmax(pred)])

[[2.4584613e-03 1.2535288e-03 1.6505759e-04 1.5129631e-04 4.2167789e-04
  3.2281402e-02 5.4336196e-01 1.1335850e-04 2.4810652e-03 3.1009337e-04
  3.3286505e-04 1.6271465e-02 8.0786094e-02 3.1230792e-01 5.8438643e-03
  2.0422442e-04 1.2556626e-03]] 0400
