In [0]:
import spacy

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE


Using TensorFlow backend.


In [0]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [0]:
import pandas as pd
import json
from tqdm import tqdm
path = "/content/drive/My Drive/Sample_trial/"
line_count = len(open(path + "review.json").readlines())
user_ids, business_ids, stars, dates, text = [], [], [], [], []
with open(path + "review.json", encoding="utf8") as f:
  for line in tqdm(f, total=line_count):
    blob = json.loads(line)
    user_ids += [blob["user_id"]]
    business_ids += [blob["business_id"]]
    stars += [blob["stars"]]
    dates += [blob["date"]]
    text += [blob["text"]]
ratings = pd.DataFrame({"user_id": user_ids, "business_id": business_ids, "rating": stars, "date": dates, "text" : text})
user_counts = ratings["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

active_users[:5]

100%|██████████| 6685900/6685900 [03:07<00:00, 35676.86it/s]


['CxDOIDnH8gp9KXzpBHJYXw',
 'bLbSNkLggFnqwNNzzq-Ijw',
 'PKEzKWv_FktMm2mGPjwd0Q',
 'ELcQDlf69kb-ihJfxZyL0A',
 'DK57YibC5ShBmqQl97CKog']

In [0]:
ratings_active_users = ratings[ratings["user_id"].isin(active_users)]

In [0]:
from sklearn.model_selection import train_test_split
ratings_train, ratings_test = train_test_split(ratings_active_users, test_size=0.60, random_state=42)

In [0]:
ratings_train

Unnamed: 0,user_id,business_id,rating,date,text
2372436,PlPqz0Ve742oKSWAwj2uYw,UNI1agsPX2k3eJSJVB91nw,5.0,2017-04-01 02:29:17,My son and I decided to try this place for the...
4557308,5HmK6riLkciM0Xxo1ycNTg,fzxvmA3Ygu3yrQQ6-cEpTw,3.0,2017-04-29 20:14:13,"Overall, last night's dinner was not as good a..."
3178155,z6gseuVl0cR7tRLQa_DXuQ,TU95jEn8aGitY8hZowXaBg,5.0,2016-09-14 18:39:37,I was craving congee this evening so I stumble...
5118366,KvVV6iPB6I2aOiaXYKTtFg,gBfPyzPRmeOaj3SdcIj0Rw,4.0,2012-03-20 18:21:44,"Ok, when i heard this was the dinner venue for..."
1261400,uch01q0I89VjLtfHXR5r-A,b207RNqIAhTd4JsRsnXB1Q,5.0,2014-03-29 22:19:41,One of my favorite restaurants. We've been g...
...,...,...,...,...,...
1665285,Tr5iqhD4unIDpnPdugkdQQ,gZvwCOaMhxFXXNvy1q9_LA,4.0,2016-11-16 16:49:37,"toute l'intimité d'un japonais de qualité, dan..."
2304204,V8asoNKiCy23Px6d1idkLQ,C-u6Ywuq192icOblKm6oKg,4.0,2013-03-03 06:52:57,I am a pretty harsh critic when it comes to fo...
3289160,ri-yo9C3wVcnflXPfWsFjg,eaNenRk_liZBERFFLCXqqQ,5.0,2016-04-25 06:24:14,Great service!!! Our waitress NaNa is awesome!...
6337676,8cOea6vDRyKI73yF6ypSUA,7MNBIoGznDHhC1AfxGWOFw,1.0,2018-05-05 21:58:42,I have been trying to get internet set up at m...


In [0]:
type(ratings_active_users)

pandas.core.frame.DataFrame

In [0]:
ratings_train[["rating","text"]]

Unnamed: 0,rating,text
2372436,5.0,My son and I decided to try this place for the...
4557308,3.0,"Overall, last night's dinner was not as good a..."
3178155,5.0,I was craving congee this evening so I stumble...
5118366,4.0,"Ok, when i heard this was the dinner venue for..."
1261400,5.0,One of my favorite restaurants. We've been g...
...,...,...
1665285,4.0,"toute l'intimité d'un japonais de qualité, dan..."
2304204,4.0,I am a pretty harsh critic when it comes to fo...
3289160,5.0,Great service!!! Our waitress NaNa is awesome!...
6337676,1.0,I have been trying to get internet set up at m...


In [0]:
ratings_train.describe()

Unnamed: 0,rating
count,1815308.0
mean,3.748768
std,1.362879
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [0]:
labels = ratings_train['rating'].map(lambda x : 1 if int(x) > 3 else 0)

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text

In [0]:
ratings_train['text'] = ratings_train['text'].map(lambda x: clean_text(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(ratings_train['text'])

sequences = tokenizer.texts_to_sequences(ratings_train['text'])
data = pad_sequences(sequences, maxlen=50)

In [0]:
print(data.shape)

(1815308, 50)


In [0]:
model_lstm = Sequential()
model_lstm.add(Embedding(20000, 100, input_length=50))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
model_lstm.fit(data, np.array(labels), validation_split=0.4, epochs=3)




Train on 1089184 samples, validate on 726124 samples
Epoch 1/3





Epoch 2/3

In [0]:
def create_conv_model():

  model_conv = Sequential()
  model_conv.add(Embedding(vocabulary_size, 100, input_length=50))
  model_conv.add(Dropout(0.2))
  model_conv.add(Conv1D(64, 5, activation='relu'))
  model_conv.add(MaxPooling1D(pool_size=4))
  model_conv.add(LSTM(100))
  model_conv.add(Dense(1, activation='sigmoid'))
  model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model_conv

In [0]:
model_conv = create_conv_model()
model_conv.fit(data, np.array(labels), validation_split=0.4, epochs = 3)

Train on 1089184 samples, validate on 726124 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f82237aa438>

In [0]:
df_save = pd.DataFrame(data)
df_label = pd.DataFrame(np.array(labels))

In [0]:
result = pd.concat([df_save, df_label], axis = 1)

In [0]:
result.to_csv('train_dense_word_vectors.csv', index=False)

In [0]:
model_conv.save_weights('model_conv.h5')

# Save the model architecture
with open('model_conv_a.json', 'w') as f:
    f.write(model_conv.to_json())

In [0]:
model_lstm.save_weights('model_lstm.h5')

# Save the model architecture
with open('model_lstm_a.json', 'w') as f:
    f.write(model_lstm.to_json())

In [0]:
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 340021 word vectors.


In [0]:
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [0]:
model_glove = Sequential()
model_glove.add(Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
model_glove.fit(data, np.array(labels), validation_split=0.4, epochs = 3)



Train on 1089184 samples, validate on 726124 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fdbded56860>

In [0]:
model_glove.save_weights('model_glove.h5')

# Save the model architecture
with open('model_glove_a.json', 'w') as f:
    f.write(model_glove.to_json())

In [0]:
lstm_embds = model_lstm.layers[0].get_weights()[0]
conv_embds = model_conv.layers[0].get_weights()[0]
glove_emds = model_glove.layers[0].get_weights()[0]