# Practica 4, LSTM-GRU: CONTINUA

# Sentiment Analysis (LSTM vs. GRU)

## Preparación de ambiente

### Carga de módulos

In [1]:
# Data Wrnagling
import re
import numpy as np
import pandas as pd

# Data visualization
from PIL import Image
import cufflinks as cf
import matplotlib.pyplot as plt
from wordcloud import ImageColorGenerator
from wordcloud import WordCloud, STOPWORDS

# Modeling
from keras import metrics
from keras.layers import GRU
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import SpatialDropout1D
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Preprocessing
import unicodedata
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Model performance
#from sklearn.metrics import accuracy_score, confusion_matrix
from keras.metrics import RootMeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError

from sklearn.model_selection import train_test_split, cross_val_score

cf.go_offline()

### Funciones relevantes

In [2]:
def freq_discrete(df, feat_disc=[]):
    for var in feat_disc:
        print(f"\nFeature: {var}\n")
        aux = df[var].fillna("Missing").value_counts().to_frame()
        aux[f"{var}_cumsum"] = aux[var].cumsum()
        aux["perc"] = aux[var] / len(df)
        aux["cumsum"] = aux["perc"].cumsum()
        aux[var] = aux[var].apply(lambda x: "{:,.0f}".format(x))
        aux[f"{var}_cumsum"] = aux[f"{var}_cumsum"].apply(
            lambda x: "{:,.0f}".format(x))
        aux["perc"] = aux["perc"].apply(lambda x: "{:,.2%}".format(x))
        aux["cumsum"] = aux["cumsum"].apply(lambda x: "{:,.2%}".format(x))
        aux.columns = ["Frequency", "Accumulated frequency",
                       "Percentage", "Accumulated percentage"]
        display(aux)

In [3]:
def clean_text(text, pattern="[^a-zA-Z0-9 ]"):
    cleaned_text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    cleaned_text = re.sub(pattern, " ", cleaned_text.decode("utf-8"), flags=re.UNICODE)
    cleaned_text = u' '.join(cleaned_text.lower().split())
    return cleaned_text

In [4]:
def get_wordcloud(text, mask_path, font_path):
    mask = np.array(Image.open(mask_path))
    mask_colors = ImageColorGenerator(mask)
    wc = WordCloud(stopwords=STOPWORDS, font_path=font_path, 
                   background_color="white", max_words=2000,
                   max_font_size=100, random_state=42,
                   width=mask.shape[1], height=mask.shape[0], color_func=mask_colors)
    wc.generate(text)
    plt.figure(figsize=(50, 10))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')
    plt.show()

## Data Wrangling

### Carga de datos

In [5]:
df = pd.read_csv("amazon_train.csv", error_bad_lines=False)

In [6]:
df.head()

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes,review_id
0,B07Q6ZNJNT,Robert Baratheon,5,"May 30, 2019",True,Hermoso diseño y muy rapido,Me llego hoy y es justo lonque esperaba de ver...,18.0,14945
1,B07C6FCC8G,McMan,5,"July 15, 2018",False,Prefer it to my iPhone x..,Best phone I've ever had. My iPhone x is sitti...,,8989
2,B07HD3QC65,JASMINE WIN,3,"February 21, 2019",True,just ok,I dont like this phone. this phone is not too ...,,50888
3,B07P8MQHSH,Hermione,1,"December 13, 2019",True,Perfect to BRICK in 2 days flat - light usage,"I bought this phone NEW from Amazon, set it up...",2.0,12734
4,B014GCG150,Monique,3,"December 20, 2016",True,SIM CARD TROUBLE,THE SIM CARD I AM USING IS TOO BIG GOING TO HA...,,20622


### EDA

In [7]:
df = df[['review_id','body','rating']]

In [8]:
df.head()

Unnamed: 0,review_id,body,rating
0,14945,Me llego hoy y es justo lonque esperaba de ver...,5
1,8989,Best phone I've ever had. My iPhone x is sitti...,5
2,50888,I dont like this phone. this phone is not too ...,3
3,12734,"I bought this phone NEW from Amazon, set it up...",1
4,20622,THE SIM CARD I AM USING IS TOO BIG GOING TO HA...,3


In [9]:
df['rating'].value_counts(1)

5    0.557434
1    0.187354
4    0.127577
3    0.070035
2    0.057601
Name: rating, dtype: float64

In [10]:
freq_discrete(df, ["rating"])


Feature: rating



Unnamed: 0,Frequency,Accumulated frequency,Percentage,Accumulated percentage
5,28423,28423,55.74%,55.74%
1,9553,37976,18.74%,74.48%
4,6505,44481,12.76%,87.24%
3,3571,48052,7.00%,94.24%
2,2937,50989,5.76%,100.00%


In [11]:
df["rating"].value_counts(dropna=False).iplot(kind="bar")

### Análisis de texto

In [12]:
df['body'] = df['body'].astype(str)

In [14]:
get_wordcloud(text = " ".join(df["body"].sample(frac = 0.1)), mask_path="movie-icon-png-2.jpg", font_path="Roboto-Light.ttf")

TypeError: get_wordcloud() missing 2 required positional arguments: 'mask_path' and 'font_path'

In [15]:
index = np.random.choice(df.index)

In [16]:
index

37957

In [17]:
df.loc[index, "body"]

"So I was about to buy the Pixel 4 but the specs on this phone match and the battery is actually bigger. I'm apparently one of the few people who still prefer a smaller phone so I didn't want the S10 or S10+ so this e model was perfect. I'm also really glad I found the Amazon refurbished model as it saved an extra couple hundred bucks. Also, I am nerding out on all the hole punch wallpapers that are out there, they are pretty sweet!"

### Limpieza de texto

In [18]:
bs = BeautifulSoup(df.loc[index, "body"])

In [19]:
bs.prettify()

"<html>\n <body>\n  <p>\n   So I was about to buy the Pixel 4 but the specs on this phone match and the battery is actually bigger. I'm apparently one of the few people who still prefer a smaller phone so I didn't want the S10 or S10+ so this e model was perfect. I'm also really glad I found the Amazon refurbished model as it saved an extra couple hundred bucks. Also, I am nerding out on all the hole punch wallpapers that are out there, they are pretty sweet!\n  </p>\n </body>\n</html>"

In [20]:
BeautifulSoup(bs.prettify()).text

"\n\n\n   So I was about to buy the Pixel 4 but the specs on this phone match and the battery is actually bigger. I'm apparently one of the few people who still prefer a smaller phone so I didn't want the S10 or S10+ so this e model was perfect. I'm also really glad I found the Amazon refurbished model as it saved an extra couple hundred bucks. Also, I am nerding out on all the hole punch wallpapers that are out there, they are pretty sweet!\n  \n\n"

In [21]:
df["body"] = df["body"].map(lambda x: BeautifulSoup(BeautifulSoup(x).prettify()).text)


"https://www.amazon.com/gp/product/B01M6749UP/ref=oh_aui_detailpage_o01_s00?ie=UTF8&psc=1" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


"https://www.amazon.com/dp/B01CJU9BBM/ref=cm_cr_ryp_prd_ttl_sol_0" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.



In [22]:
df["body"] = df["body"].map(clean_text)

In [23]:
get_wordcloud(text = " ".join(df["body"].sample(frac = 0.1)), mask_path="./movie-icon-png-2.jpg", font_path="./Roboto-Light.ttf")

FileNotFoundError: [Errno 2] No such file or directory: './movie-icon-png-2.jpg'

In [25]:
aux = pd.DataFrame(df['body'].value_counts())

In [26]:
aux.head(50)

Unnamed: 0,body
good,329
great phone,287
love it,194
great,162
excellent,161
excelente,158
very good,142
perfect,108
good phone,88
works great,83


### Limpieza de texto

In [27]:
stop_words = [clean_text(x) for x in STOPWORDS.union("will")]

In [28]:
df["body"] = df["body"].map(lambda sentence: " ".join([word for word in sentence.split() if word not in stop_words]))

In [29]:
df["body"].str.split().map(lambda lista: [x for x in lista if x not in stop_words]).str.join(" ")

0        llego hoy y es justo lonque esperaba de verdad...
1        best phone ve iphone x sitting drawer everythi...
2        dont phone phone havey screen big enough batte...
3        bought phone new amazon set used 2 days charge...
4        sim card using big going go phone dealer try r...
                               ...                        
50984    started motog4 lasted year needed new phone go...
50985               phone listed stolen turn return seller
50986    exactly advertised s obvious s refurbed great ...
50987                                     works advertised
50988    ordered use jamaica works great everything pac...
Name: body, Length: 50989, dtype: object

In [30]:
get_wordcloud(text = " ".join(df["body"].sample(frac = 0.1)), mask_path="./movie-icon-png-2.jpg", font_path="./Roboto-Light.ttf")

FileNotFoundError: [Errno 2] No such file or directory: './movie-icon-png-2.jpg'

### Vectorización

In [31]:
df["body"].str.split().str.len().iplot(kind="hist")

In [32]:
df["body"].str.split().str.len().describe([x/10 for x in range(10)])

count    50989.000000
mean        29.425562
std         63.625710
min          0.000000
0%           0.000000
10%          2.000000
20%          4.000000
30%          6.000000
40%          9.000000
50%         12.000000
60%         17.000000
70%         24.000000
80%         37.000000
90%         65.000000
max       3161.000000
Name: body, dtype: float64

In [33]:
MAX_NB_WORDS = 50989
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 100

In [34]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['body'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 30782 unique tokens.


In [35]:
X = tokenizer.texts_to_sequences(df['body'].values)

### Padding

In [36]:
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (50989, 200)


In [37]:
X[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [38]:
X[-1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

### Preparación de la target

In [39]:
y = df["rating"].astype(float)

In [40]:
y

0        5.0
1        5.0
2        3.0
3        1.0
4        3.0
        ... 
50984    1.0
50985    1.0
50986    5.0
50987    4.0
50988    5.0
Name: rating, Length: 50989, dtype: float64

### Train-test split

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=0.3, shuffle=True,stratify = y)

## Modelado

### LSTM

#### Arquitectura del modelo

In [86]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, activation="tanh"))
model.add(Dense(100, activation="relu"))
model.add(Dense(1, activation='linear'))

## METRICAS

In [87]:
kmetrics = [
    RootMeanSquaredError(name='rms'),
    MeanAbsoluteError(name='mae'),
    MeanAbsolutePercentageError(name='mape'),
] 

In [88]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=kmetrics)

In [89]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 100)          5098900   
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 200, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_6 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 5,189,501
Trainable params: 5,189,501
Non-trainable params: 0
_________________________________________________________________


#### Callbacks

In [90]:
early_stopping = EarlyStopping(monitor='loss', patience=15, min_delta=0.0001)

In [91]:
checkpoint = ModelCheckpoint('models/LSTM_regre/model_{mape:.3f}.h5',
                             save_best_only=True,
                             save_weights_only=False,
                             monitor='mape')

#### Entrenamiento

In [92]:
history = model.fit(X_train, y_train, epochs=10, batch_size=1024, callbacks=[early_stopping, checkpoint], validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [93]:
loss, rms, mae, mape = model.evaluate(X_test, y_test)



In [94]:
dc_history = history.history
dc_history.keys()

dict_keys(['loss', 'rms', 'mae', 'mape', 'val_loss', 'val_rms', 'val_mae', 'val_mape'])

In [95]:
results = pd.DataFrame(data = zip(history.history["loss"], history.history["val_loss"], 
                                  history.history["mape"], history.history["val_mape"], 
                                  history.history["rms"], history.history["val_rms"], 
                                  history.history["mae"], history.history["val_mae"]), 
                       columns=["loss", "val_loss", "mape", "val_mape",'rms', 'rms_val','mae', 'val_mae',])

In [96]:
results.iplot()

### GRU

In [97]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(GRU(100, dropout=0.2, recurrent_dropout=0.2, activation="tanh"))
model.add(Dense(100, activation="relu"))
model.add(Dense(1, activation='linear'))

In [98]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=kmetrics)

In [99]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 100)          5098900   
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 200, 100)          0         
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               60600     
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 5,169,701
Trainable params: 5,169,701
Non-trainable params: 0
_________________________________________________________________


In [100]:
checkpoint = ModelCheckpoint('models/GRU_regre/model_{mape:.3f}.h5',
                             save_best_only=True,
                             save_weights_only=False,
                             monitor='mape')

In [101]:
history = model.fit(X_train, y_train, epochs=10, batch_size=1024, callbacks=[early_stopping, checkpoint], validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [102]:
loss, rms, mae, mape = model.evaluate(X_test, y_test)



In [103]:
results = pd.DataFrame(data = zip(history.history["loss"], history.history["val_loss"], 
                                  history.history["mape"], history.history["val_mape"], 
                                  history.history["rms"], history.history["val_rms"], 
                                  history.history["mae"], history.history["val_mae"]), 
                       columns=["loss", "val_loss", "mape", "val_mape",'rms', 'rms_val','mae', 'val_mae',])


In [104]:
results.iplot()