In [1]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
from keras.models import model_from_json
import numpy

Using TensorFlow backend.


# read dataset

In [2]:
data = pd.read_csv('dataset_food_online.txt' ,encoding="ISO-8859-1")
tok = WordPunctTokenizer()
def cleaner(text):
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()
data['text']=data['text'].apply(lambda x: cleaner(x))
data = data.drop('date', 1)
data = data.drop('type', 1)
#data = data.drop('review_id', 1)
#data = data.drop('business_id', 1)
data.head(1)

Unnamed: 0,business_id,review_id,stars,text,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,fWKvX83p0-ka4JS3dc6E5A,5,my wife took me here on my birthday for breakf...,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0


# load LSTM model and embedding matrix

In [4]:
# Model reconstruction from JSON file
with open('model_architecture.json', 'r') as f:
    model = model_from_json(f.read())

# Load weights into the new model
model.load_weights('model_weights.h5')
print("LSTM loaded")

embeddings = numpy.load("embeddings.dat")
print(embeddings.shape)


LSTM loaded
(28626, 200)


# user input reveiw

In [5]:
rev=input("input a text for recommender:")
#-----------------------------------------
#nltk.download('punkt') # if necessary...
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]
rank_new1=[]
for index, row in data.iterrows():
    rank_new1.append(cosine_sim(row['text'], rev))

data['sim']=rank_new1
final_df = data.sort_values(by=['sim'], ascending=False)
final_df.head()

input a text for recommender:falafel


Unnamed: 0,business_id,review_id,stars,text,user_id,cool,useful,funny,sim
3155,doEZAj-NSnoEMOJl5yXcNw,C4lduID0NZMqzjQuIWlnbw,5,falafel is outstanding here dark exterior with...,joIzw_aUiNvBTuGoytrH7g,1,1,1,0.207513
9445,n_YHTTG0QIjZ3055wReefQ,mj1KE6poy83L-rxaWMGS7w,4,we stumbled upon this little place in a worn s...,N7lSh49zQ13sG3Jpa9E6cw,3,3,2,0.202823
2455,6oRAC4uyJCsJl1X0WZpVSA,XABPPSphqJ4PVrZbZ_AjtA,4,haji baba is unique by being both a restaurant...,kq5Pdsy8Znyh9KEkxWT_QA,0,0,0,0.199213
1684,Aicnm12Zped8nQFXyRRFvw,OHkVZtMNwwUoSThKHBzb3A,4,byblos has some of the best middle eastern foo...,nWouNfZD3Pw08RYizxkqcA,1,0,1,0.146753
5084,-ftQeUsqwDkExRg6IYrubQ,fEVIIMjwaFJmjuC4BboVzg,5,i am from chicago and the italian beef here re...,MGYI7Ip_hs6TIt4goS_mbA,0,0,0,0.143728


In [6]:
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split as sk_split
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(final_df['text'])
sequences=tokenizer.fit_on_texts(final_df['text'].values)
X = tokenizer.texts_to_sequences(final_df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
X = pad_sequences(X,maxlen=200)



Found 28625 unique tokens.


In [7]:
len(X)

10000

In [8]:
pred=model.predict(X)

In [9]:
final_df['newranke']=pred
final_df.head(3)


Unnamed: 0,business_id,review_id,stars,text,user_id,cool,useful,funny,sim,newranke
3155,doEZAj-NSnoEMOJl5yXcNw,C4lduID0NZMqzjQuIWlnbw,5,falafel is outstanding here dark exterior with...,joIzw_aUiNvBTuGoytrH7g,1,1,1,0.207513,4.331992
9445,n_YHTTG0QIjZ3055wReefQ,mj1KE6poy83L-rxaWMGS7w,4,we stumbled upon this little place in a worn s...,N7lSh49zQ13sG3Jpa9E6cw,3,3,2,0.202823,4.030197
2455,6oRAC4uyJCsJl1X0WZpVSA,XABPPSphqJ4PVrZbZ_AjtA,4,haji baba is unique by being both a restaurant...,kq5Pdsy8Znyh9KEkxWT_QA,0,0,0,0.199213,4.090116


In [17]:
df=final_df[:20]
df.head(5)

Unnamed: 0,business_id,review_id,stars,text,user_id,cool,useful,funny,sim,newranke
3155,doEZAj-NSnoEMOJl5yXcNw,C4lduID0NZMqzjQuIWlnbw,5,falafel is outstanding here dark exterior with...,joIzw_aUiNvBTuGoytrH7g,1,1,1,0.207513,4.331992
9445,n_YHTTG0QIjZ3055wReefQ,mj1KE6poy83L-rxaWMGS7w,4,we stumbled upon this little place in a worn s...,N7lSh49zQ13sG3Jpa9E6cw,3,3,2,0.202823,4.030197
2455,6oRAC4uyJCsJl1X0WZpVSA,XABPPSphqJ4PVrZbZ_AjtA,4,haji baba is unique by being both a restaurant...,kq5Pdsy8Znyh9KEkxWT_QA,0,0,0,0.199213,4.090116
1684,Aicnm12Zped8nQFXyRRFvw,OHkVZtMNwwUoSThKHBzb3A,4,byblos has some of the best middle eastern foo...,nWouNfZD3Pw08RYizxkqcA,1,0,1,0.146753,4.602813
5084,-ftQeUsqwDkExRg6IYrubQ,fEVIIMjwaFJmjuC4BboVzg,5,i am from chicago and the italian beef here re...,MGYI7Ip_hs6TIt4goS_mbA,0,0,0,0.143728,4.525226


In [18]:
df = df.sort_values(by=['newranke'], ascending=False)
df.head(5)

Unnamed: 0,business_id,review_id,stars,text,user_id,cool,useful,funny,sim,newranke
1684,Aicnm12Zped8nQFXyRRFvw,OHkVZtMNwwUoSThKHBzb3A,4,byblos has some of the best middle eastern foo...,nWouNfZD3Pw08RYizxkqcA,1,0,1,0.146753,4.602813
4134,doEZAj-NSnoEMOJl5yXcNw,BX_iL7m8Y6_J3nNFCbp_dw,4,stopped in last night for a take out dinner i ...,TTP7l5g4QUVCpQZ9NTAWqw,1,5,0,0.124318,4.587102
5084,-ftQeUsqwDkExRg6IYrubQ,fEVIIMjwaFJmjuC4BboVzg,5,i am from chicago and the italian beef here re...,MGYI7Ip_hs6TIt4goS_mbA,0,0,0,0.143728,4.525226
2453,fb9eLHJ4S--TyXsarJJo-g,wiOFzFx_gL9yF7Qn_TAG8Q,5,this place is still open but the name has been...,qYOtHoVCMkdmh73wvUohIA,0,1,0,0.072055,4.488928
3255,w19cemjVR8u02PgjFpJ7Mw,gS_cpr4aTV5nE-3iyKtnVQ,5,so i have to admit that i once threw up in the...,fhV21-QB6n402-J9vR-7cQ,2,3,1,0.066784,4.457037


In [19]:

df[:20]['business_id']

1684    Aicnm12Zped8nQFXyRRFvw
4134    doEZAj-NSnoEMOJl5yXcNw
5084    -ftQeUsqwDkExRg6IYrubQ
2453    fb9eLHJ4S--TyXsarJJo-g
3255    w19cemjVR8u02PgjFpJ7Mw
2244    Gw8DsQQCO5m1hggXqwynFg
3591    tyETqrYijm3cY4noCwl9Ww
6006    nRO4tRwimU12hg7Cnz__iA
3155    doEZAj-NSnoEMOJl5yXcNw
908     yktWUtKBja_Lzk3wwR6RFA
2455    6oRAC4uyJCsJl1X0WZpVSA
9445    n_YHTTG0QIjZ3055wReefQ
3014    6oRAC4uyJCsJl1X0WZpVSA
2135    qXQ3ZBdwI3GlbR5-eYWqNA
6653    8m08a9xJKmANwmeuR-0bPA
1123    w19cemjVR8u02PgjFpJ7Mw
2736    zonV2F6YNVn2_sI5dK82eg
5511    dn9sB0Kok8cnkDhpcl7YCg
2868    cOUS79i4vltKIc_hy4OZBg
9272    VVPVg9aJzNczTgeM36TGJw
Name: business_id, dtype: object