# Natural Language Processing with Embeddig Layer

In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, BatchNormalization, Dense, Embedding, Flatten, LSTM
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Esempio classificazione testi

In [3]:
# define documents
docs = ['Well done!','Good work','Great effort','nice work','Excellent!','Weak','Poor effort!','not good','poor work','Could have done better.']
# define class labels
labels = np.array([1,1,1,1,1,0,0,0,0,0])

In [4]:
for x,y in zip(docs, labels):
    print(f"Frase: {x}  ->  Classe: {y}")

Frase: Well done!  ->  Classe: 1
Frase: Good work  ->  Classe: 1
Frase: Great effort  ->  Classe: 1
Frase: nice work  ->  Classe: 1
Frase: Excellent!  ->  Classe: 1
Frase: Weak  ->  Classe: 0
Frase: Poor effort!  ->  Classe: 0
Frase: not good  ->  Classe: 0
Frase: poor work  ->  Classe: 0
Frase: Could have done better.  ->  Classe: 0


### Preprocessing dei dati

Le parole verranno trasformare in uno scalare, non più in un vettore di "uno" e "zeri" il cui indice rappresenta la parola.

In [5]:
vocab_size = 14
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

[[5, 8], [8, 4], [5, 2], [7, 4], [2], [11], [12, 2], [7, 8], [12, 4], [2, 13, 8, 3]]


I dati, come per un normale dataset, devono avere lo stesso numero di features. Per far ciò si procede con il padding, ovvero aggiungendo "zeri" fino a raggiungere la lunghezza necessaria.

In [6]:
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 5  8  0  0]
 [ 8  4  0  0]
 [ 5  2  0  0]
 [ 7  4  0  0]
 [ 2  0  0  0]
 [11  0  0  0]
 [12  2  0  0]
 [ 7  8  0  0]
 [12  4  0  0]
 [ 2 13  8  3]]


## Embedding Layer

A questo punto è possibile costruire il modello.

In [7]:
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
####################
model.add(Flatten())
####################
model.add(Dense(1, activation='sigmoid'))

In [8]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 8)              112       
_________________________________________________________________
flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 145
Trainable params: 145
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
model.fit(padded_docs, labels, epochs=100, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7feea0187dc0>

In [11]:
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %0.2f' % (accuracy*100))

Accuracy: 90.00


In [12]:
padded_docs.shape

(10, 4)

In [15]:
labels.shape

(10,)

In [16]:
model = Sequential()
model.add(Embedding(vocab_size, 7, input_length=4))
###################
model.add(LSTM(32))
###################
model.add(Dense(1, activation='sigmoid'))

In [17]:
model.compile(optimizer='RMSprop', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
model.fit(padded_docs, labels, epochs=50, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7fee8c6c3b50>

In [19]:
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %0.2f' % (accuracy*100))

Accuracy: 100.00


## Introduzione a GloVe