In [16]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Dropout, Dense, Input
from keras.layers.embeddings import Embedding
from keras.models import Model
import string
import re
import tensorflow as tf 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import contractions
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# In this notebook we show how to use GloVe embeddings to increase the performance of the model

In [17]:
df = pd.read_csv('IMDBDataset.csv')

In [18]:
# We make only two preprocess. Remove tags and punctuations

def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result
    
df['clean_review']= df['review'].apply(lambda cw : remove_tags(cw))
df['clean_review'] = df['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')

  df['clean_review'] = df['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [19]:
review = df.review

sentiment = df['sentiment']

# Replcae "positive" with 1 and "negative" with 0
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, sentiment)))

# Split into train and test
X_train, X_test,Y_train, Y_test = train_test_split(review, y, test_size=0.2, random_state = 45)

In [20]:
# Tokenize the text and create a word index so we can the merge it with the embedding

tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)

words_to_index = tokenizer.word_index

In [21]:
# Create a function to read the glove vector
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

    return word_to_vec_map

In [22]:
word_to_vec_map = read_glove_vector('glove.6B.50d.txt')

maxLen = 150

In [23]:
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

# Create a mtrix where we would have each word embedding. And if there is no vector for a given word, keep it with 0's
emb_matrix = np.zeros((vocab_len, embed_vector_len))

not_found = []
for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index, :] = embedding_vector
    else:
        not_found.append(word)

# Create the word embedding we are going to use in the model, replace the weights for those we create above
embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [24]:
# Words in the vocabulary that weren't found
len(not_found)

41650

In [25]:
# Create the LSTM model with embedding layer

def imdb_rating(input_shape):

    X_indices = Input(input_shape)

    embeddings = embedding_layer(X_indices)

    X = LSTM(128, return_sequences=True)(embeddings)

    X = Dropout(0.6)(X)

    X = LSTM(128, return_sequences=True)(X)

    X = Dropout(0.6)(X)

    X = LSTM(128)(X)

    X = Dense(1, activation='sigmoid')(X)

    model = Model(inputs=X_indices, outputs=X)

    return model

In [26]:
# Replace the text into chunks of indexes already set
X_train_indices = tokenizer.texts_to_sequences(X_train)

# Pad sequences to 150
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

In [27]:
model = imdb_rating((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 150)]             0         
                                                                 
 embedding (Embedding)       (None, 150, 50)           5612100   
                                                                 
 lstm (LSTM)                 (None, 150, 128)          91648     
                                                                 
 dropout (Dropout)           (None, 150, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 150, 128)          131584    
                                                                 
 dropout_1 (Dropout)         (None, 150, 128)          0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584

In [28]:
adam = tf.keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x22c8797aa60>

In [25]:
X_test_indices = tokenizer.texts_to_sequences(X_test)

X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

model.evaluate(X_test_indices, Y_test)

# Create the same process and model, but this time use the preprocess we use in the other notebook

In [117]:
df = pd.read_csv('IMDBDataset.csv')

wordnet_lemmatizer = WordNetLemmatizer()

wordnet_lemmatizer.lemmatize('reviewers')

#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text.split()]
    return ' '.join(lemm_text)

a= lemmatizer(df['review'][0])

def expand_contraction(text):
    # creating an empty list
    expanded_words = []    
    for word in text.split():
      # using contractions.fix to expand the shotened words
      expanded_words.append(contractions.fix(word))   

    return ' '.join(expanded_words)

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

df['clean_review'] = df['review'].apply(expand_contraction)

df['clean_review'] = df['review'].apply(decontracted)

df['clean_review'] = df['clean_review'].apply(strip_html)

df['clean_review'] = df['clean_review'].str.lower()

In [154]:
review = df.clean_review

sentiment = df['sentiment']

y = np.array(list(map(lambda x: 1 if x=="positive" else 0, sentiment)))

X_train, X_test,Y_train, Y_test = train_test_split(review, y, test_size=0.2, random_state = 45)

In [155]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)

words_to_index = tokenizer.word_index

In [156]:
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

not_found = []
for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index, :] = embedding_vector
    else:
        not_found.append(word)

In [157]:
# Doing this preprocess we have less words not found!

len(not_found)

34405

In [158]:
embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [160]:
X_train_indices = tokenizer.texts_to_sequences(X_train)

X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

In [161]:
model = imdb_rating((maxLen,))

In [162]:
adam = tf.keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x17ba611aa60>

In [163]:
X_test_indices = tokenizer.texts_to_sequences(X_test)

X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

model.evaluate(X_test_indices, Y_test)



[0.37362515926361084, 0.8327000141143799]

In [None]:
# Save embedding so we can use it in tensor board projector.

In [9]:
found = []
word_found = []
for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        found.append(embedding_vector)
        word_found.append(word)

import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

k = 0
for i in range(len(found)):
    if k != 0:
        out_m.write('\n')
        out_v.write('\n')
    out_m.write(word_found[i])
    out_v.write('\t'.join([str(x) for x in found[i]]))
    k += 1
out_v.close()
out_m.close() 