In [2]:
!pip install transformers requests beautifulsoup4 pandas numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 28.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 49.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 10.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification      #Tokenizer- string into a sequence of number, architecture of the transformers that loads in our NLP model
import torch                                                                    #argmax function
import requests                                                                 # Web scraping
from bs4 import BeautifulSoup                                                   #extract data from the web page
import re                                                                       # extract specific commens that we want

#Model Instantiation
#1. loading a pre-trained NLP model (from hugging face)
#2. Sequence classification model 

In [18]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669M [00:00<?, ?B/s]

# See the tokenizer at work by writing any sentence and getting integers as the output. We can try this out with different sentences to understand the result and the sentiment analysis.

In [19]:
tokens = tokenizer.encode('I think the ending could have been better', return_tensors='pt')       #tensors are set to pytorch
tokens[0]

tensor([  101,   151, 21506, 10103, 25706, 12296, 10574, 10662, 16197,   102])

#Decoding can also be done for the integers.

In [20]:
tokenizer.decode(tokens[0])

'[CLS] i think the ending could have been better [SEP]'

# Spatial token in the beggining CLS and separator - spatial token to separate two sentences. Now, let's look at the results.

In [21]:
result = model(tokens)
result.logits
# tensor results

tensor([[-1.1397,  0.9120,  2.1368,  0.4123, -2.0143]],
       grad_fn=<AddmmBackward0>)

In [22]:
result

SequenceClassifierOutput(loss=None, logits=tensor([[-1.1397,  0.9120,  2.1368,  0.4123, -2.0143]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

#These values represent the one-hot encoded list of scores. The position with the highest score represents the sentiment rating of a ceratin word as shown by the logits above. We will use argmax to get the highest value results.

In [39]:
int(torch.argmax (result.logits))+1

3

#So, the reviews in any website are stored in a class ("Comments") which we wil extract those from their website using regex. I am using an Yelp review from one of my favorite restaurant in Vancouver. Lets get on with webscraping now. :)

In [24]:
r = requests.get("https://www.yelp.com/biz/madras-spice-vancouver?osq=indian+fraser+street")
soup = BeautifulSoup(r.text, 'html.parser')
regex = re.compile('.*comment.*')
results = soup.find_all('p', {'class':regex})
reviews = [result.text for result in results]

# WebScraping
##We grabbed the website from which we get the response code. Then we pass it through the soup and setting up the parser. Now, we select the specific component in the response code i.e. ("comment"). Passing that selection in soup which will find all the tags with the comment class. Results.text just shows the text and omits the html tags.

#Let's look at the text.

In [25]:
reviews

['The parathas are beyond description. Flaky & chewy & fresh, fresh, fresh off the pan. Samosas are flaky & delicious! The Lamb Curry & Madras Chicken were perfect. Not many places compare with our favorite place in the world - All India Cafe in Pasadena, California. This one does! Fresh & well priced!Oh. The waitress was so sweet in bantering with my Father in law who, in aging, has only stories of his 15 years in India. Again - oh! the parathas were divine!',
 'This is one of the best South Indian must try place in Vancouver. Their rava Dosa , Masala Dosa chicken 65 , Thali \xa0which I have tried \xa0are must try dishes. Chef raja is very nice humble . Prices are very reasonable, and service is fast .',
 "I've only really tried their vegetable samosas but they were so delicious I've gotta write about them. I walked in and ordered and they were ready within five minutes for takeout. They're three mini samosas with two sauces. It was really the crust and the samosa wrapping that blew m

# Load reviews in a DF and see the sentiment results.

In [41]:
import pandas as pd
import numpy as np
review_df = pd.DataFrame(np.array(reviews), columns =['review'])

In [42]:
review_df.head(10)

Unnamed: 0,review
0,The parathas are beyond description. Flaky & c...
1,This is one of the best South Indian must try ...
2,I've only really tried their vegetable samosas...
3,Ordered some take-out from here one evening af...
4,New Indian restaurant along the busy street of...
5,The best spot for authentic South Indian food ...
6,This is an amazing restaurant and the food was...
7,Been here a few times here but our go to is bu...
8,Ordered the Mutton biriyani and Madras spice c...
9,"Their dosas are pretty yummy, along with their..."


In [43]:
review_df['review'].iloc[2]               #One of the reviews

"I've only really tried their vegetable samosas but they were so delicious I've gotta write about them. I walked in and ordered and they were ready within five minutes for takeout. They're three mini samosas with two sauces. It was really the crust and the samosa wrapping that blew me away. Usually it's the samosa filling that is the star of the show, but the crust! It didn't blend in with the filling like crusts usually do. It was the perfect amount of thickness, it really felt like it was a pocket to hold the filling together. It was so delightful biting into it because the texture and chewiness was perfect. The filling was incredible as well. 5/5 samosa."

# Create a function to call the model.

In [44]:
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [45]:
sentiment_score(review_df['review'].iloc[1])

5

#Lambda goes through each of the review in the dataframe.

In [46]:
review_df['sentiment'] = review_df['review'].apply(lambda x: sentiment_score(x[:512]))                     #512 represents the NLP limit in this case

In [47]:
review_df

Unnamed: 0,review,sentiment
0,The parathas are beyond description. Flaky & c...,5
1,This is one of the best South Indian must try ...,5
2,I've only really tried their vegetable samosas...,5
3,Ordered some take-out from here one evening af...,2
4,New Indian restaurant along the busy street of...,4
5,The best spot for authentic South Indian food ...,5
6,This is an amazing restaurant and the food was...,5
7,Been here a few times here but our go to is bu...,5
8,Ordered the Mutton biriyani and Madras spice c...,3
9,"Their dosas are pretty yummy, along with their...",5


#So, the restaurant mostly has positive reviews. We can do this exercise for any business using webscraping.