In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

max_features = 20_000  # Vocabulary cap handed to the tokenizer (num_words) and used to size the embedding
maxlen = 200  # Fixed number of tokens every review is padded/truncated to


In [2]:
# Bajamos los datos para hacer la clasificación
!curl -O http://srodriguez.me/Datasets/imdb.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 25.7M  100 25.7M    0     0  7590k      0  0:00:03  0:00:03 --:--:-- 7590k


In [3]:
#Descomprimimos
!unzip imdb.zip

Archive:  imdb.zip
  inflating: IMDB Dataset.csv        


In [4]:
# Load the extracted CSV into a DataFrame and print its column/dtype summary.
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
# Preview the first 5 rows of the dataset.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# Work with a 10,000-review sample instead of the full 50,000 reviews.
# random_state pins the draw so a fresh kernel run selects the same rows
# (same seed value as the train/test splits below); reset_index(drop=True)
# renumbers the sampled rows from 0, which is convenient for training.
df = df.sample(10000, random_state=42).reset_index(drop=True)

In [7]:
# Encode the text labels as integers so the network can learn from them.
sentiment_encoded = df['sentiment'].map({'positive': 1, 'negative': 0})
# .map() yields NaN for any value missing from the dict -- including the
# already-encoded 0/1 labels if this cell is executed a second time. Fail
# loudly before overwriting the column rather than training on NaN labels.
assert sentiment_encoded.notna().all(), "unexpected or already-encoded values in df['sentiment']"
df['sentiment'] = sentiment_encoded.astype('int64')

In [8]:
# Display the review stored under index label 0 (the index was reset after sampling).
df.loc[0, 'review']

"Usually when a movie receives a vote of one it is because someone simply dislikes it and is annoyed it doesn't have a lower rating, and so decides to drag it down as much as they can instead of just giving it a low rating. This is not the case here.<br /><br />Bonesetter is a perfect example of a 0/10 film. It does nothing right and it doesn't have the chance to because it doesn't really attempt to do anything. There are strands of a bad D&D novel kind of plot which doesn't hold together and a complete lack of any kind of acting throughout. It is clear that nobody involved in this project gave it any kind of serious effort, because even a completely patently untalented persons' hard work would amount to more. A truly awful film."

In [9]:
#Utilizamos esto para poder separar nuestro df muestreado, en conjuntos de entrenamiento, validacion y testing
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the sampled reviews as the test set ...
temp_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
# ... then carve 10% of the remainder out as the validation set.
train_df, val_df = train_test_split(temp_df, test_size=0.1, random_state=42)

# Renumber each split from 0 so positional and label-based access agree.
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

In [11]:
# Create the tokenizer that will build the vocabulary; '<unk>' is registered
# as the out-of-vocabulary token and num_words caps the vocabulary size.
tokenizer = keras.preprocessing.text.Tokenizer(
    oov_token='<unk>',
    num_words=max_features,
)

In [12]:
# Build the vocabulary from the training reviews only, so validation and
# test text do not influence the word index.
tokenizer.fit_on_texts(texts=train_df['review'])

In [14]:
# Look up the word stored under index 2 of the tokenizer's index -> word mapping.
tokenizer.index_word[2]

'the'

In [15]:
# Reverse lookup: fetch the index assigned to the word "the".
tokenizer.word_index['the']

2

In [16]:
# Convert each training review into its sequence of integer word indices.
secuencias = tokenizer.texts_to_sequences(texts=train_df['review'])


In [17]:
# Bring every training sequence to the same fixed length; shorter ones are
# filled with index 0, which acts as the '<pad>' token.
# The padding/truncating values below are pad_sequences' defaults, spelled out.
# NOTE(review): with truncating="pre", reviews longer than maxlen keep their
# LAST maxlen tokens, not the first ones -- confirm that this is intended.
secuencias_padded = keras.preprocessing.sequence.pad_sequences(
    secuencias,
    maxlen=maxlen,
    padding="pre",
    truncating="pre",
)
secuencias_padded

array([[  517,   123,    70, ...,     1,     7,   780],
       [ 2250,     5,    22, ...,   475,    67,   133],
       [ 4433,    90,   200, ...,     1,    18,    23],
       ...,
       [   58,    16,   148, ...,  1284,    45,     5],
       [  180,  1303,    19, ...,   318, 12823,  9333],
       [   13,     7,     2, ...,   140,  8548,   157]], dtype=int32)

In [18]:
# Sanity check: show the fixed sequence length the network input will use.
maxlen

200

In [19]:
# Run the validation and test reviews through the same tokenize-then-pad
# steps as the training data, reusing the already-fitted tokenizer.
val_seq = tokenizer.texts_to_sequences(texts=val_df['review'])
test_seq = tokenizer.texts_to_sequences(texts=test_df['review'])

# padding/truncating are pad_sequences' defaults, spelled out for clarity.
val_seq_padded = keras.preprocessing.sequence.pad_sequences(
    val_seq, maxlen=maxlen, padding="pre", truncating="pre"
)
test_seq_padded = keras.preprocessing.sequence.pad_sequences(
    test_seq, maxlen=maxlen, padding="pre", truncating="pre"
)

In [20]:
# Pair each split's padded sequences (features) with its sentiment labels (targets).
X_train, y_train = secuencias_padded, train_df['sentiment']
X_val, y_val = val_seq_padded, val_df['sentiment']
X_test, y_test = test_seq_padded, test_df['sentiment']

In [22]:
from tensorflow.keras import layers
from tensorflow import keras
embedding_dim = 128  # Embedding width; also reused as the Conv1D filter count and the Dense layer size

# One integer token index per position, maxlen positions per review.
inputs = layers.Input(shape=(maxlen,), dtype="int64")

# Word embeddings are learned by the network from scratch: each token index
# is mapped to a vector of size embedding_dim.
x = layers.Embedding(input_dim=max_features + 1, output_dim=embedding_dim)(inputs)
x = layers.Dropout(rate=0.5)(x)

# Two strided Conv1D layers (each halves the sequence length: 200 -> 100 -> 50)
# followed by global max pooling over the remaining positions.
x = layers.Conv1D(
    filters=embedding_dim, kernel_size=2, strides=2, padding="valid", activation="relu"
)(x)
x = layers.Conv1D(
    filters=embedding_dim, kernel_size=2, strides=2, padding="valid", activation="relu"
)(x)
x = layers.GlobalMaxPooling1D()(x)

# Hidden dense layer that processes the pooled convolutional features.
x = layers.Dense(units=embedding_dim, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)

# Single output neuron; sigmoid activation because this is binary classification.
predictions = layers.Dense(units=1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs=inputs, outputs=predictions)

# Compile the model: loss function, optimizer and the metric to report.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 200, 128)          2560128   
                                                                 
 dropout_2 (Dropout)         (None, 200, 128)          0         
                                                                 
 conv1d_2 (Conv1D)           (None, 100, 128)          32896     
                                                                 
 conv1d_3 (Conv1D)           (None, 50, 128)           32896     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                           

# Entrenamiento

In [23]:
epochs = 5  # Number of full passes over the training set

# Fit on the training split; the validation split is evaluated at the end of
# every epoch to monitor how well the model generalizes.
model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    batch_size=32,
    validation_data=(X_val, y_val),
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3d429b0390>

In [24]:
# Compute the loss and the accuracy metric on the held-out test split.
model.evaluate(x=X_test, y=y_test)



[0.5107738971710205, 0.8336666822433472]

In [25]:
# Predict on the test split, threshold the sigmoid probabilities at 0.5
# (probability >= 0.5 -> True), cast the booleans to integers and flatten
# the result into a 1-D vector of 0/1 labels.
y_pred = (
    (model.predict(X_test, batch_size=16, verbose=1) >= 0.5)
    .astype(np.int8)
    .reshape(-1)
)



In [26]:
from sklearn import metrics

In [27]:
# Report overall accuracy followed by the per-class precision/recall/F1 table.
print(
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
    metrics.classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

0.8336666666666667
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      1461
           1       0.86      0.80      0.83      1539

    accuracy                           0.83      3000
   macro avg       0.83      0.83      0.83      3000
weighted avg       0.84      0.83      0.83      3000

