<a href="https://colab.research.google.com/github/swypar/PROJECTS/blob/main/Restaurant_reviews_and_ratings_from_yelp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Scraping at least 400 full reviews and ratings from Yelp for a restaurant that has mixed reviews.
Cleaning and pre-processing the data.
Developing a Word2Vec model using the reviews.
Identifying words that are most similar to 3 items on the restaurant menu using the Word2Vec model.


In [None]:
#importing packages

import pandas as pd

In [None]:
#uploading restaurant file

from google.colab import files
import io

# File name the notebook expects the user to upload.
EXPECTED_CSV = 'the-plank-pizza-co-beer-parlor.csv'

# files.upload() opens the Colab upload widget and returns {file name: file bytes}.
uploaded = files.upload()

# Resolve which uploaded file to read instead of indexing a hard-coded key:
# the upload may be stored under a different name (e.g. a renamed re-upload),
# which would otherwise surface as a bare KeyError.
if EXPECTED_CSV in uploaded:
    csv_name = EXPECTED_CSV
elif len(uploaded) == 1:
    csv_name = next(iter(uploaded))
else:
    raise ValueError(
        'Expected an upload named %r, got: %r' % (EXPECTED_CSV, sorted(uploaded))
    )

# Parse the uploaded bytes straight from memory.
reviews_df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

Saving the-plank-pizza-co-beer-parlor.csv to the-plank-pizza-co-beer-parlor.csv


In [None]:
# Preview the first rows of the loaded reviews to check the columns and the raw text format.
reviews_df.head()

Unnamed: 0.1,Unnamed: 0,review_date,user_name,rating,review_text
0,0,12/3/2020,Angie Y.,,"['Great beer options, freshly made flatbread s..."
1,1,9/6/2020,Jason M.,,['This place is my favorite local spot! \xa0I ...
2,2,12/3/2020,Debra R.,,"[""Plank Pizza is serving take out meals in an ..."
3,3,3/2/2021,Arianny M.,,"['Excellent!!!!!', 'The food is amazing and th..."
4,4,3/3/2021,Ari L.,,"[""Me and gf came here this past Saturday night..."


In [None]:
# Count the non-null values in each column (reveals columns that are entirely empty).
reviews_df.count()

Unnamed: 0     479
review_date    479
user_name      479
rating           0
review_text    479
dtype: int64

In [None]:
# Drop the leftover index column written by an earlier CSV export.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
reviews_df = reviews_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
#stopwords

import nltk
# Download the NLTK resources used below: the 'punkt' tokenizer models
# and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop-word list; tokenize_clean_text filters tokens against it.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Download the WordNet corpus (presumably required by the WordNetLemmatizer used in tokenize_clean_text).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer

#Data preprocessing


In [None]:
def tokenize_clean_text(text):
  """Tokenize, normalise and lemmatize one review.

  Steps: split ``text`` with ``nltk.word_tokenize``, lower-case every token,
  drop English stop words (module-level ``stopwords``), drop tokens made up
  only of punctuation characters, lemmatize what is left with
  ``WordNetLemmatizer`` and join the result with single spaces.

  Args:
    text: the raw review text (a string).

  Returns:
    The cleaned review as one space-separated string ('' if nothing survives).
  """
  lemmatizer = nltk.stem.WordNetLemmatizer()

  # Set membership is O(1) per token; the stop-word list is scanned only once.
  stopword_set = set(stopwords)

  # Same characters the notebook listed originally, held as a set of single
  # characters. The backslash is escaped explicitly ("\:" is an invalid escape).
  punctuation_chars = set("`~!@#$%^&*()_-+={[}]|\\:;'<,>.?/")

  tokens = [token.lower() for token in nltk.word_tokenize(text)]
  tokens = [token for token in tokens if token not in stopword_set]

  # Drop tokens consisting solely of punctuation. The previous check
  # (`token not in "<symbols string>"`) was a substring test, so multi-character
  # punctuation tokens such as "''", "..." or "!!!!!" slipped through.
  tokens = [
      token for token in tokens
      if not all(char in punctuation_chars for char in token)
  ]

  tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return ' '.join(tokens)

# Clean every raw review and store the result in a new 'clean_review' column.
reviews_df['clean_review'] = [
    tokenize_clean_text(raw_review) for raw_review in reviews_df['review_text']
]

In [None]:
from nltk.tokenize import WordPunctTokenizer
# Shared tokenizer instance used by preprocess_text.
tokenizer = WordPunctTokenizer()

def preprocess_text(text):
    """Return the list of tokens the module-level ``tokenizer`` produces for ``text``."""
    return tokenizer.tokenize(text)

# Build the training corpus: one token list per cleaned review.
# The previous loop iterated over the DataFrame itself (which yields column
# names) and always tokenized reviews_df['clean_review'][0], so the corpus was
# just the first review repeated once per column.
tokenized_reviews = [
    preprocess_text(clean_review) for clean_review in reviews_df['clean_review']
]

In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized reviews: 100-dimensional vectors,
# context window of 5, min_count=1 (presumably keeps tokens seen only once), 4 workers.
# NOTE(review): `size` looks like the gensim 3.x keyword; gensim >= 4.0 is believed to
# call it `vector_size` — confirm against the installed gensim version.
w2v_model = Word2Vec(tokenized_reviews, size=100, window=5, min_count=1, workers=4)

In [None]:
# Inspect the vocabulary the model learned (token -> Vocab entry).
# NOTE(review): `wv.vocab` appears to be gensim 3.x only (4.0 exposes `wv.key_to_index`) — confirm installed version.
w2v_model.wv.vocab

{"'": <gensim.models.keyedvectors.Vocab at 0x7fcc9de3d0d0>,
 "''": <gensim.models.keyedvectors.Vocab at 0x7fcc9de3d410>,
 '3': <gensim.models.keyedvectors.Vocab at 0x7fcc9de3d910>,
 '4': <gensim.models.keyedvectors.Vocab at 0x7fcc9de42150>,
 '40': <gensim.models.keyedvectors.Vocab at 0x7fcc9de3ddd0>,
 "\\'": <gensim.models.keyedvectors.Vocab at 0x7fcc9de3d590>,
 'actually': <gensim.models.keyedvectors.Vocab at 0x7fcc9de3ded0>,
 'addition': <gensim.models.keyedvectors.Vocab at 0x7fcc9de42390>,
 'adhering': <gensim.models.keyedvectors.Vocab at 0x7fcc9de3d810>,
 'ale': <gensim.models.keyedvectors.Vocab at 0x7fcc9de42350>,
 'amicable': <gensim.models.keyedvectors.Vocab at 0x7fcc9de3db50>,
 'apart': <gensim.models.keyedvectors.Vocab at 0x7fcc9de42050>,
 'appetizer': <gensim.models.keyedvectors.Vocab at 0x7fcc9de42490>,
 'area': <gensim.models.keyedvectors.Vocab at 0x7fcc9de3da50>,
 'around': <gensim.models.keyedvectors.Vocab at 0x7fcc9de3dd90>,
 'artichoke': <gensim.models.keyedvectors.Voca

Comparing similarities: finding the words the Word2Vec model ranks as most similar to selected terms.

In [None]:
# Query through the model's KeyedVectors (`.wv`); calling most_similar on the
# model object itself is deprecated and triggers a warning.
w2v_model.wv.most_similar(positive=['style','menu'], topn=5)

  """Entry point for launching an IPython kernel.


[('eggrolls', 0.2519930303096771),
 ('ipas', 0.1793195754289627),
 ('piping', 0.17544490098953247),
 ('guideline', 0.17130887508392334),
 ('food', 0.16490887105464935)]

In [None]:
# Query through the model's KeyedVectors (`.wv`); calling most_similar on the
# model object itself is deprecated and triggers a warning.
w2v_model.wv.most_similar(positive=['appetizer', 'wonderful'], topn=5)

  """Entry point for launching an IPython kernel.


[('addition', 0.3175678849220276),
 ('given', 0.2955036461353302),
 ('outdoors', 0.20105016231536865),
 ('option', 0.19858446717262268),
 ('party', 0.19304102659225464)]

In [None]:
# Query through the model's KeyedVectors (`.wv`); calling most_similar on the
# model object itself is deprecated and triggers a warning.
w2v_model.wv.most_similar(positive=['pizza'], topn=5)

  """Entry point for launching an IPython kernel.


[('beer', 0.3155582547187805),
 ('hot', 0.30840301513671875),
 ('awesome', 0.24650268256664276),
 ('staff', 0.2219543308019638),
 ('we', 0.18778353929519653)]