<a href="https://colab.research.google.com/github/ProfAI/tf00/blob/master/9%20-%20Word%20Embedding/trained_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Dropout

In [46]:
MAX_WORDS = 10000
NUM_EMBEDDING = 50
SEQ_MAX_LENGTH = 50

In [47]:
import os
from sklearn.utils import shuffle
import subprocess


def load_imdb(files_path, labels=["pos", "neg"]):
    
    if(not os.path.isfile("aclImdb_v1.tar.gz")):
      os.system("wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz")
      os.system("tar -xf aclImdb_v1.tar.gz")
    
    label_map = {labels[0]:1, labels[1]:0}
    
    reviews = []
    y = []
    
    for label in labels:
      path = files_path+label
      for file in os.listdir(path):
        review_file = open(path+"/"+file)
        review = review_file.read()    
        
        reviews.append(review)
        y.append(label_map[label])
        
    # la funzione shuffle di sklearn ci permette di
    # mescolare più array allo stesso modo
    
    reviews, y = shuffle(reviews,y)
    
    return(reviews,y)

In [48]:
reviews_train, y_train = load_imdb("aclImdb/train/")
reviews_test, y_test = load_imdb("aclImdb/test/")

print("Prima recensione del set di test")
print(reviews_test[0])
print("Sentiment: %d" % y_test[0])

Prima recensione del set di test
This film as it is now is far shorter than it was when released in 1918. In fact, it is now more available with two other medium sized silent Chaplin features (A DOG'S LIFE, and THE PILGRIM) that Chaplin re-released in the 1950s. In it's day SHOULDER ARMS was a big hit because of it's humor in uniform approach. It still is very funny (Chaplin in disguise as a tree, spying on the Germans, is so ridiculous it's hysterical), but it suffers from being set in it's own age. Charlie's dealing with World War I, a hideous conflict that killed 20 million people, but not the worst war (horrible to say) of the 20th Century. Chaplin would live to see that war too, and would spoof it's main architects in THE GREAT DICTATOR. But the latter is more accessible to modern audiences because that movie is a talking picture. Also, Hitler as a target seems more important to audiences in 2008 than Kaiser Wilhelm II and his general staff.<br /><br />SHOULDER ARMS was to take us

In [49]:
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(reviews_train)

X_train = tokenizer.texts_to_sequences(reviews_train)
X_test = tokenizer.texts_to_sequences(reviews_test)

X_test[0]

[11,
 19,
 14,
 9,
 6,
 147,
 6,
 227,
 5850,
 71,
 9,
 13,
 51,
 622,
 8,
 8,
 189,
 9,
 6,
 147,
 50,
 1434,
 16,
 104,
 82,
 3461,
 6388,
 1290,
 3499,
 941,
 3,
 110,
 2,
 1,
 12,
 3499,
 792,
 622,
 8,
 1,
 3064,
 8,
 42,
 248,
 5400,
 2795,
 13,
 3,
 191,
 566,
 85,
 4,
 42,
 483,
 8,
 6352,
 1480,
 9,
 128,
 6,
 52,
 160,
 3499,
 8,
 5628,
 14,
 3,
 2841,
 20,
 1,
 4606,
 6,
 35,
 645,
 42,
 3772,
 18,
 9,
 2476,
 36,
 109,
 267,
 8,
 42,
 202,
 555,
 7307,
 1950,
 16,
 179,
 322,
 10,
 3,
 4238,
 1942,
 12,
 554,
 888,
 1428,
 81,
 18,
 21,
 1,
 246,
 322,
 524,
 5,
 132,
 4,
 1,
 3648,
 1115,
 3499,
 59,
 409,
 5,
 64,
 12,
 322,
 96,
 2,
 59,
 2833,
 42,
 290,
 8,
 1,
 84,
 8461,
 18,
 1,
 1563,
 6,
 50,
 6338,
 5,
 679,
 1218,
 85,
 12,
 17,
 6,
 3,
 660,
 428,
 79,
 2143,
 14,
 3,
 2391,
 183,
 50,
 671,
 5,
 1218,
 8,
 5047,
 71,
 1532,
 2,
 24,
 828,
 3992,
 7,
 7,
 5400,
 2795,
 13,
 5,
 190,
 175,
 140,
 1,
 4,
 1,
 8332,
 24,
 2330,
 24,
 394,
 358,
 5,
 2,
 24,
 776,


In [50]:
longest_review = max(X_train,key=len)
shortest_review = min(X_train,key=len)

print("La review più lunga ha %d parole" % len(longest_review))
print("La review più corta ha %d parole" % len(shortest_review))

La review più lunga ha 2193 parole
La review più corta ha 9 parole


In [51]:
X_train = pad_sequences(X_train, maxlen = SEQ_MAX_LENGTH)
X_test = pad_sequences(X_test, maxlen = SEQ_MAX_LENGTH)

X_train.shape

(25000, 50)

In [52]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [55]:
model = Sequential()

model.add(Embedding(MAX_WORDS, NUM_EMBEDDING, input_length=SEQ_MAX_LENGTH))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='relu'))

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 50)            500000    
_________________________________________________________________
dropout_12 (Dropout)         (None, 50, 50)            0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 2500)              0         
_________________________________________________________________
dense_26 (Dense)             (None, 128)               320128    
_________________________________________________________________
dropout_13 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_27 (Dense)             (None, 32)                4128      
_________________________________________________________________
dropout_14 (Dropout)         (None, 32)               

In [56]:
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=512, validation_split=0.2, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f764502fb38>

In [57]:
model.evaluate(X_test, y_test)



[0.6025338768959045, 0.8011599779129028]

## Testiamo la Rete

In [62]:
reviews = ["This movie sucks, I just wasted two hours of my life", "Best movie I have ever seen, the ending was so touching and I made me crying so much.", "Not a bad movie"]

reviews = tokenizer.texts_to_sequences(reviews)
X = pad_sequences(reviews, maxlen = SEQ_MAX_LENGTH)

y = model.predict(X)
print(y)

[[0.1286717 ]
 [0.62813914]
 [0.20023899]]
