# Ejercicio 14

### Objetivo: Utilizando LSTM - Predecir el Rating de Peliculas a partir de resumen 


In [43]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
#from livelossplot import PlotLossesKeras
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salacaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\salacaro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
dataTraining = pd.read_csv('https://github.com/albahnsen/AdvancedMethodsDataAnalysisClass/raw/master/datasets/dataTraining.zip', encoding='UTF-8', index_col=0)
#.reset_index(drop=True)

In [16]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"in sweden , a female blackmailer with a disfi...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"in a friday afternoon in new york , the presi...",['Drama'],7.4
2582,1990,Narrow Margin,"in los angeles , the editor of a publishing h...","['Action', 'Crime', 'Thriller']",6.6


In [17]:
plots = dataTraining['plot']
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

In [18]:
y

3107    1
900     0
6724    1
4704    1
2582    1
       ..
8417    0
1592    0
1723    0
7605    1
215     1
Name: rating, Length: 7895, dtype: int32

In [19]:
plots

3107    most is the story of a single father who takes...
900     a serial killer decides to teach the secrets o...
6724    in sweden ,  a female blackmailer with a disfi...
4704    in a friday afternoon in new york ,  the presi...
2582    in los angeles ,  the editor of a publishing h...
                              ...                        
8417    " our marriage ,  their wedding .  "  it ' s l...
1592    the wandering barbarian ,  conan ,  alongside ...
1723    like a tale spun by scheherazade ,  kismet fol...
7605    mrs .  brisby ,  a widowed mouse ,  lives in a...
215     tinker bell journey far north of never land to...
Name: plot, Length: 7895, dtype: object

# Ejercicio 14.1

- Cambiar a minusculas (Limpieza de Datos)
- Separar el texto en palabras
- Eliminar StopWords 
- pad_sequences

#### Descripción de los Datos

In [20]:
dataTraining.shape

(7895, 5)

In [21]:
X=dataTraining['plot']
X.shape

(7895,)

#### Limpieza de Datos

Cambiar a minusculas, eliminar caractéres especiales, eliminar números y espacios.

In [22]:
import nltk
import re
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
#stopword list to use
eng_stopwords = stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
#english stemmer
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salacaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\salacaro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
def clean_text(text):
    text = re.sub("\'", "", text)  
    text = re.sub("[^a-zA-Z]"," ",text) 
    text = text.lower()    
    return text
dataTraining['plot'] = dataTraining['plot'].apply(lambda x: clean_text(x))

In [24]:
from string import digits

def eliminate_numbers(texto):
    remove_digits = str.maketrans('', '', digits)
    texto = texto.translate(remove_digits)

    return texto

dataTraining['plot'] = dataTraining['plot'].apply(lambda x: eliminate_numbers(x))

In [25]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,most is the story of a single father who takes...,"['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,a serial killer decides to teach the secrets o...,"['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,in sweden a female blackmailer with a disfi...,"['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,in a friday afternoon in new york the presi...,['Drama'],7.4
2582,1990,Narrow Margin,in los angeles the editor of a publishing h...,"['Action', 'Crime', 'Thriller']",6.6


#### Tokenización: Separar el texto en palabras

In [26]:
def tokenization(text):
    text = re.split('\W+', text)
    return text

dataTraining['plot'] = dataTraining['plot'].apply(lambda x: tokenization(x.lower()))

In [27]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,"[most, is, the, story, of, a, single, father, ...","['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,"[a, serial, killer, decides, to, teach, the, s...","['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"[in, sweden, a, female, blackmailer, with, a, ...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"[in, a, friday, afternoon, in, new, york, the,...",['Drama'],7.4
2582,1990,Narrow Margin,"[in, los, angeles, the, editor, of, a, publish...","['Action', 'Crime', 'Thriller']",6.6


#### Quitar Stop Words

In [28]:
def remove_stopwords(text):
    text = [word for word in text if word not in eng_stopwords]
    return text
    
dataTraining['plot'] = dataTraining['plot'].apply(lambda x: remove_stopwords(x))

#### Stemming
Extraer el stem/raiz de las palabras para reducir el número de features y mejorar el poder de predicción

In [29]:
def stemming(text):
    text = [stemmer.stem(word) for word in text]
    return text

dataTraining['plot'] = dataTraining['plot'].apply(lambda x: stemming(x))

In [30]:
dataTraining.head()

Unnamed: 0,year,title,plot,genres,rating
3107,2003,Most,"[stori, singl, father, take, eight, year, old,...","['Short', 'Drama']",8.0
900,2008,How to Be a Serial Killer,"[serial, killer, decid, teach, secret, satisfi...","['Comedy', 'Crime', 'Horror']",5.6
6724,1941,A Woman's Face,"[sweden, femal, blackmail, disfigur, facial, s...","['Drama', 'Film-Noir', 'Thriller']",7.2
4704,1954,Executive Suite,"[friday, afternoon, new, york, presid, tredway...",['Drama'],7.4
2582,1990,Narrow Margin,"[los, angel, editor, publish, hous, carol, hun...","['Action', 'Crime', 'Thriller']",6.6


#### Pad Sequences
Utilizando Keras, para generar vectores de la misma longitud.

In [31]:
from keras.preprocessing.sequence import pad_sequences

In [32]:
datat_string=[" ".join(plot) for plot in dataTraining['plot'].values]

In [44]:
#datat_string

In [34]:
voc = set(''.join(datat_string))
vocabulary = {x: idx + 1 for idx, x in enumerate(set(voc))}

In [35]:
vocabulary

{'o': 1,
 'g': 2,
 'q': 3,
 'l': 4,
 'p': 5,
 'r': 6,
 'a': 7,
 'c': 8,
 'w': 9,
 'm': 10,
 's': 11,
 'x': 12,
 'e': 13,
 ' ': 14,
 'y': 15,
 'b': 16,
 'z': 17,
 'n': 18,
 'i': 19,
 'k': 20,
 'j': 21,
 't': 22,
 'd': 23,
 'h': 24,
 'u': 25,
 'f': 26,
 'v': 27}

In [36]:
max_len = 1100
X1 = [x[:max_len] for x in datat_string]

In [37]:
# Convert characters to int and pad
X1 = [[vocabulary[x1] for x1 in x if x1 in vocabulary.keys()] for x in X1]

In [38]:
plot_seq = pad_sequences(X1, dtype=object)

In [39]:
training_pad = pd. DataFrame(plot_seq)
training_pad

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099
0,0,0,0,0,0,0,0,0,0,0,...,25,4,22,25,14,26,19,4,10,14
1,0,0,0,0,0,0,0,0,0,0,...,1,6,13,14,8,4,13,6,20,14
2,0,0,0,0,0,0,0,0,0,0,...,4,23,14,18,13,5,24,13,9,14
3,26,6,19,23,7,15,14,7,26,22,...,14,24,13,19,6,14,22,6,13,23
4,0,0,0,0,0,0,0,0,0,0,...,6,7,15,14,1,26,26,19,8,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7890,0,0,0,0,0,0,0,0,0,0,...,19,6,14,26,7,10,19,4,19,14
7891,0,0,0,0,0,0,0,0,0,0,...,1,23,14,23,7,2,1,22,24,14
7892,0,0,0,0,0,0,0,0,0,0,...,22,7,20,13,14,24,7,18,23,14
7893,0,0,0,0,0,0,0,0,0,0,...,13,10,25,11,14,10,1,27,13,14


# Ejercicio 14.2

Crear una red neural LSTM para predecir el rating de la pelicula

Calcular el accuracy en el testing set.

In [40]:
y = (dataTraining['rating'] >= dataTraining['rating'].mean()).astype(int)

In [42]:
vect = CountVectorizer(stop_words='english',ngram_range=(1, 5),min_df=2,max_features=1500 )
X_dtm = vect.fit_transform(datat_string)
X_dtm.shape

(7895, 1500)

In [252]:
X_train, X_test, y_train, y_test = train_test_split(X_dtm, y, test_size=0.30, random_state=42)

In [253]:
dims = X_train.shape[1]

In [274]:
len_vocabulary=(np.amax(X_dtm))

In [275]:
y_train

10685    1
10623    1
2259     1
4153     0
1867     1
        ..
2897     1
9978     1
6489     0
6534     0
9702     1
Name: rating, Length: 5526, dtype: int32

In [255]:
dims

1500

In [276]:
model = Sequential()
model.add(Embedding(len_vocabulary+1, 100, input_length=dims))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

model.summary()  

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 1500, 100)         2900      
_________________________________________________________________
lstm_11 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dropout_11 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 101       
Total params: 83,401
Trainable params: 83,401
Non-trainable params: 0
_________________________________________________________________


In [277]:
X_train

<5526x1500 sparse matrix of type '<class 'numpy.int64'>'
	with 163041 stored elements in Compressed Sparse Row format>

In [278]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

Train on 5526 samples, validate on 2369 samples
Epoch 1/3



Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1a19c73a370>

In [279]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 53.14%


##### Conclusiones

-En una red neuronal la selección de parámetros y el número de capas tienen un efecto representativo sobre el accuracy del modelo. En un escenario de mayor rigor es necesario iterar calibrar estos parámetros para encontrar aquellos que optimicen el modelo.

-El accuracy obtenido mediante con el vector generado por CountVectorizer es 53.14%. Sospechabamos que era por el método de obtención del vector pero probamos con TF-IDF y el resulado fue muy similar. Esto nos muestra la necesidad de explotar nuevas alternativas para generar el vector de embeddings, por ejemplo algunos explicados en la literatura como Word2vec, GloVe, ELMo y BERT.
Adicionalmente, nuevamente evidenciamos la importancia de la calibración del modelo.