In [1]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
import pandas as pd

# define the corpus
corpus = ['This is good pizza',
        'I love Italian pizza',
        'The best pizza',
        'nice pizza',
        'Excellent pizza',
        'I love pizza',
        'The pizza was alright',
        'disgusting pineapple pizza',
        'not good pizza',
        'bad pizza',
        'very bad pizza',
        'I had better pizza']


# creating class labels for our 
labels = array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])

output_dim = 8
pd.DataFrame({'text': corpus, 'sentiment':labels})

Using TensorFlow backend.


Unnamed: 0,text,sentiment
0,This is good pizza,1
1,I love Italian pizza,1
2,The best pizza,1
3,nice pizza,1
4,Excellent pizza,1
5,I love pizza,1
6,The pizza was alright,0
7,disgusting pineapple pizza,0
8,not good pizza,0
9,bad pizza,0


In [2]:
# we extract the vocabulary from our corpus
sentences = [voc.split() for voc in corpus]
vocabulary = set([word for sentence in sentences for word in sentence])

vocab_size = len(vocabulary)
encoded_corpus = [one_hot(d, vocab_size) for d in corpus]
encoded_corpus


[[7, 17, 17, 15],
 [6, 2, 7, 15],
 [3, 5, 15],
 [2, 15],
 [16, 15],
 [6, 2, 15],
 [3, 15, 18, 14],
 [14, 10, 15],
 [18, 17, 15],
 [12, 15],
 [15, 12, 15],
 [6, 9, 10, 15]]

In [3]:
# we now pad the documents to  
# the max length of the longest sentences
# to have an uniform length
max_length = 5
padded_docs = pad_sequences(encoded_corpus, maxlen=max_length, padding='post')
print(padded_docs)


[[ 7 17 17 15  0]
 [ 6  2  7 15  0]
 [ 3  5 15  0  0]
 [ 2 15  0  0  0]
 [16 15  0  0  0]
 [ 6  2 15  0  0]
 [ 3 15 18 14  0]
 [14 10 15  0  0]
 [18 17 15  0  0]
 [12 15  0  0  0]
 [15 12 15  0  0]
 [ 6  9 10 15  0]]


In [4]:
# model definition
model = Sequential()
model.add(Embedding(vocab_size, output_dim, input_length=max_length, name='embedding'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model

loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy * 100))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 5, 8)              160       
_________________________________________________________________
flatten_1 (Flatten)          (None, 40)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 201
Trainable params: 201
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 91.666669


In [15]:
type(model)

keras.engine.sequential.Sequential

In [14]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']

# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])


vocabulary = set(docs)

# integer encode the documents
vocab_size = len(set(docs))

encoded_corpus = [one_hot(d, vocab_size) for d in docs]
print(encoded_corpus)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_corpus, maxlen=max_length, padding='post')
print(padded_docs)

# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy * 100))

[[5, 7], [5, 1], [9, 1], [2, 1], [4], [2], [1, 1], [4, 5], [1, 1], [2, 5, 7, 3]]
[[5 7 0 0]
 [5 1 0 0]
 [9 1 0 0]
 [2 1 0 0]
 [4 0 0 0]
 [2 0 0 0]
 [1 1 0 0]
 [4 5 0 0]
 [1 1 0 0]
 [2 5 7 3]]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 8)              80        
_________________________________________________________________
flatten_2 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 113
Trainable params: 113
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 60.000002


In [16]:
import nltk
nltk.download('punkt')

tokens = nltk.word_tokenize('This is a beautiful sentence')

print(tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marinamattos/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['This', 'is', 'a', 'beautiful', 'sentence']


In [17]:
pos_tagget_tokens = nltk.pos_tag(tokens)

print(pos_tagget_tokens)

[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('beautiful', 'JJ'), ('sentence', 'NN')]
