# Sentiment analysis of YELP reviews

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

## DATA PREPROCESSING

## Step 1: Stemming 

In [2]:
df = pd.read_csv('train.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.head()

Unnamed: 0,label,text
0,1,Great mobile app with nice reward program. Mak...
1,2,Really fast and polite. Definitely recommend. ...
2,2,"This place is always amazing, friendly staff a..."
3,1,We did a Wine 101 class on a Friday night. Coo...
4,1,I am rounding up because I think this place ma...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400001 entries, 0 to 400000
Data columns (total 2 columns):
label    400001 non-null object
text     400000 non-null object
dtypes: object(2)
memory usage: 6.1+ MB


### Imports

In [15]:
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from nltk.corpus import stopwords
from unidecode import unidecode
from langdetect import detect
from nltk.stem import SnowballStemmer
from googletrans import Translator

In [16]:
stopwords = set(stopwords.words("english"))
exclude = set(string.punctuation)
translate = Translator()
sno = SnowballStemmer("english")

In [17]:
def get_words(sent):
    sent = sent.lower()
    temp = ''
    for ch in sent:
        if ch not in exclude:
            temp +=ch
        else:
            temp+=''
    arr = word_tokenize(temp)
    sent = ""
    for i in arr:
        if i!='' and i!=' ' and (i not in stopwords):
            sent +=i+' '
    return sent

In [18]:
inp = df.text.values
inp_processed = []
err = 0

In [None]:
for i in tqdm(range(len(inp))):
    #sent = unidecode(inp[i].decode('utf-8'))
    sent = inp[i]
    if detect(sent) != 'en':
        try:
            sent = translate.translate(sent).text
        except:
            err +=1
    sent = get_words(sent)
    stemmed = ''
    for j in sent.split(" "):
        stemmed+= sno.stem(j)+" "
    df.loc[i,'text'] = stemmed.encode('utf-8')

In [None]:
df.to_csv("stemmed_data.csv")

## Step 2: Tokenization

In [5]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import multiprocessing
cores = multiprocessing.cpu_count()
from sklearn import utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [6]:
df = pd.read_csv("stemmed_data.csv")

In [7]:
df.head()

Unnamed: 0,label,text
0,1,great mobil app nice reward program make reser...
1,2,realli fast polit definit recommend also clean...
2,2,place alway amaz friend staff great deal produ...
3,1,wine 101 class friday night cool spot downstai...
4,1,round think place may potenti coupl thing tri ...


In [8]:
x = df.text
y = df.label

In [36]:
from sklearn.cross_validation import train_test_split
SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [37]:
print("Total Negative% Moderate% Positive%")
print(len(x_train),(len(x_train[y_train == 0]) / (len(x_train)))*100,(len(x_train[y_train == 1]) / (len(x_train)))*100,(len(x_train[y_train == 2]) / (len(x_train)))*100)
print(len(x_validation),(len(x_validation[y_validation == 0]) / (len(x_validation)))*100,(len(x_validation[y_validation == 1]) / (len(x_validation)))*100,(len(x_validation[y_validation == 2]) / (len(x_validation)*1.))*100)
print(len(x_test),(len(x_test[y_test == 0]) / (len(x_test)))*100,(len(x_test[y_test == 1]) / (len(x_test)))*100,(len(x_test[y_test == 2]) / (len(x_test))*100))

Total Negative% Moderate% Positive%
385613 20.435773690202353 16.961046437749765 62.60317987204789
3935 21.296060991105463 17.941550190597205 60.76238881829733
3935 21.092757306226176 16.16264294790343 62.74459974587039


In [None]:
x_train[0]

In [12]:
y_train[0]

array([1., 0., 0.], dtype=float32)

In [38]:
def labelize(text,label):
    result = []
    prefix = label
    for i, t in zip(text.index, text):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [39]:
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v = labelize(all_x, 'all')

In [67]:
model = Word2Vec(sg=1, size=300, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model.build_vocab([x.words for x in tqdm(all_x_w2v)])


  0%|          | 0/393483 [00:00<?, ?it/s][A
 67%|██████▋   | 262284/393483 [00:00<00:00, 2604474.63it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 2560979.40it/s][A

In [68]:
%%time
for epoch in range(10):
    model.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha


  0%|          | 0/393483 [00:00<?, ?it/s][A
 68%|██████▊   | 266290/393483 [00:00<00:00, 2648524.24it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 2633641.62it/s][A
  0%|          | 0/393483 [00:00<?, ?it/s][A
 68%|██████▊   | 267420/393483 [00:00<00:00, 2657815.76it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 2634486.63it/s][A
  0%|          | 0/393483 [00:00<?, ?it/s][A
 55%|█████▍    | 214727/393483 [00:00<00:00, 2135267.75it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 2067232.31it/s][A
  0%|          | 0/393483 [00:00<?, ?it/s][A
 30%|███       | 119836/393483 [00:00<00:00, 1188778.47it/s][A
 61%|██████    | 240091/393483 [00:00<00:00, 1193975.72it/s][A
 94%|█████████▍| 370581/393483 [00:00<00:00, 1232167.97it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 1242826.11it/s][A
  0%|          | 0/393483 [00:00<?, ?it/s][A
 35%|███▍      | 135776/393483 [00:00<00:00, 1347343.74it/s][A
 72%|███████▏  | 281627/393483 [00:00<00:00, 1402442.05it/s][A
1

CPU times: user 1h 9min 33s, sys: 5.59 s, total: 1h 9min 38s
Wall time: 9min 42s


In [69]:
model.save('w2v_model_sg.word2vec')

In [78]:
model = Word2Vec(sg=0, size=300, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model.build_vocab([x.words for x in tqdm(all_x_w2v)])


  0%|          | 0/393483 [00:00<?, ?it/s][A
 48%|████▊     | 187649/393483 [00:00<00:00, 1866448.22it/s][A
 89%|████████▉ | 350564/393483 [00:00<00:00, 1747185.59it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 1796826.92it/s][A

In [79]:
%%time
for epoch in range(10):
    model.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha


  0%|          | 0/393483 [00:00<?, ?it/s][A
 44%|████▎     | 172023/393483 [00:00<00:00, 1710160.60it/s][A
 91%|█████████ | 357596/393483 [00:00<00:00, 1782567.24it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 1822275.26it/s][A
  0%|          | 0/393483 [00:00<?, ?it/s][A
 32%|███▏      | 124338/393483 [00:00<00:00, 1230356.88it/s][A
 62%|██████▏   | 243338/393483 [00:00<00:00, 1211500.62it/s][A
 94%|█████████▍| 369796/393483 [00:00<00:00, 1228003.23it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 1217941.40it/s][A
  0%|          | 0/393483 [00:00<?, ?it/s][A
 30%|███       | 119772/393483 [00:00<00:00, 1187452.70it/s][A
 62%|██████▏   | 245873/393483 [00:00<00:00, 1224835.42it/s][A
 99%|█████████▉| 389763/393483 [00:00<00:00, 1295976.17it/s][A
100%|██████████| 393483/393483 [00:00<00:00, 1290395.39it/s][A
  0%|          | 0/393483 [00:00<?, ?it/s][A
 34%|███▎      | 132239/393483 [00:00<00:00, 1311785.76it/s][A
 63%|██████▎   | 249792/393483 [00:00<00:00, 12

CPU times: user 27min 18s, sys: 5.03 s, total: 27min 23s
Wall time: 4min 52s


In [80]:
model.save('w2v_model_cbow.word2vec') 

CBOW means continuous bag of words. SG means Skip Gram. 
With a corpus, CBOW model predicts the current word from a window of surrounding context words, while Skip-gram model predicts surrounding context words given the current word.


For example, let's say we have the following sentence: "I love dogs". CBOW model tries to predict the word "love" when given "I", "dogs" as inputs, on the other hand, Skip-gram model tries to predict "I", "dogs" when given the word "love" as input.

In [13]:
from  gensim.models import KeyedVectors
model_sg = KeyedVectors.load('w2v_model_sg.word2vec')
model_cbow = KeyedVectors.load('w2v_model_cbow.word2vec')

In [14]:
len(model_sg.wv.vocab.keys())

57291

In [15]:
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(x_train)

Tokenizing Train data

In [18]:
sequences = tokenizer.texts_to_sequences(x_train)
x_train_seq = pad_sequences(sequences, maxlen=80)
print('Shape of data tensor:', x_train_seq.shape)
partial_data = pd.DataFrame({"text_vectors":np.ndarray.tolist(x_train_seq),'label':y_train},dtype = 'object')
partial_data.to_pickle("train_tokanized.pkl")

Shape of data tensor: (385613, 80)


Tokenizing Validation data

In [19]:
sequences = tokenizer.texts_to_sequences(x_validation)
x_validation_seq = pad_sequences(sequences, maxlen=80)
print('Shape of data tensor:', x_validation_seq.shape)
partial_data = pd.DataFrame({"text_vectors":np.ndarray.tolist(x_validation_seq),'label':y_validation},dtype = 'object')
partial_data.to_pickle("validation_tokanized.pkl")

Shape of data tensor: (3935, 80)


Tokenizing Test data

In [20]:
sequences = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(sequences, maxlen=80)
print('Shape of data tensor:', x_test_seq.shape)
partial_data = pd.DataFrame({"text_vectors":np.ndarray.tolist(x_test_seq),'label':y_test},dtype = 'object')
partial_data.to_pickle("test_tokanized.pkl")

Shape of data tensor: (3935, 80)


In [21]:
embeddings_index = {}
for w in model_cbow.wv.vocab.keys():
    embeddings_index[w] = np.append(model_cbow.wv[w],model_sg.wv[w])
print('Found %s word vectors.' % len(embeddings_index))
num_words  = 30000
embedding_matrix = np.zeros((num_words, 600))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Found 57291 word vectors.


# Keras Model of Deep Neural Network

In [40]:
from keras.utils import to_categorical
y_train = to_categorical(y_train,num_classes = 3)
y_validation = to_categorical(y_validation,num_classes = 3)
y_test = to_categorical(y_test,num_classes = 3)

In [38]:
from sklearn.metrics import confusion_matrix,classification_report
seed = 7
import keras
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Flatten , Conv2D, MaxPooling2D, Reshape, ConvLSTM2D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [24]:
keras.backend.clear_session()
model = Sequential()
e = Embedding(30000, 600,  input_length=80, trainable=True) #weights=[embedding_matrix]
model.add(e)
model.add(Reshape((1,80,600),input_shape=(100000,)))
model.add(Conv2D(filters=16, kernel_size=3, padding='same', activation='relu',))
model.add(MaxPooling2D(pool_size=(3, 3), padding='same',strides = 2))
model.add(Dropout(0.3))
model.add(Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling2D(pool_size=(3, 3), padding='same',strides = 2))              
model.add(Dropout(0.3))
model.add(Flatten())
#model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 80, 600)           18000000  
_________________________________________________________________
reshape_1 (Reshape)          (None, 1, 80, 600)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 80, 16)         86416     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 40, 16)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 40, 16)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 1, 40, 16)         2320      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 1, 20, 16)         0         
__________

In [26]:
from keras.callbacks import ModelCheckpoint
filepath="CNN_weights.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [27]:
model.fit(x_train_seq, y_train, validation_data=(x_validation_seq, y_validation), epochs=3, batch_size=8, callbacks = [checkpoint])

Train on 385613 samples, validate on 3935 samples
Epoch 1/3

Epoch 00001: val_acc improved from -inf to 0.85947, saving model to CNN_weights.h5
Epoch 2/3

Epoch 00002: val_acc improved from 0.85947 to 0.87598, saving model to CNN_weights.h5
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.87598


<keras.callbacks.History at 0x7f98c65e8ba8>

# Testing

In [28]:
from keras.models import load_model
loaded_CNN_model = load_model('CNN_weights.h5')

In [29]:
loaded_CNN_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 80, 600)           18000000  
_________________________________________________________________
reshape_1 (Reshape)          (None, 1, 80, 600)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 80, 16)         86416     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 40, 16)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 40, 16)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 1, 40, 16)         2320      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 1, 20, 16)         0         
__________

In [31]:
#To plot model
from keras.utils.vis_utils import plot_model
plot_model(loaded_CNN_model, to_file='CNN_plot.png', show_shapes=True, show_layer_names=True)

In [32]:
loaded_CNN_model.evaluate(x=x_test_seq, y=y_test)



[0.3144913269346521, 0.8815756033608996]

In [33]:
y_predict = loaded_CNN_model.predict(x=x_test_seq)

In [42]:
confusion_matrix(np.argmax(y_test,axis = 1),np.argmax(y_predict,axis = 1))

array([[ 763,   37,   30],
       [ 107,  373,  156],
       [  47,   89, 2333]])

In [43]:
classification_report(np.argmax(y_test,axis = 1),np.argmax(y_predict,axis = 1))

'             precision    recall  f1-score   support\n\n          0       0.83      0.92      0.87       830\n          1       0.75      0.59      0.66       636\n          2       0.93      0.94      0.94      2469\n\navg / total       0.88      0.88      0.88      3935\n'