Based on https://www.youtube.com/watch?v=szczpgOEdXs&t=111s

# 1. Install and Import Dependencies

In [None]:
!pip install torch==1.9.0+cu102 torchvision==0.10.0+cu102 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

In [1]:
!pip install transformers requests beautifulsoup4 pandas numpy

Collecting transformers
  Downloading transformers-4.7.0-py3-none-any.whl (2.5 MB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting sacremoses
  Using cached sacremoses-0.0.45-py3-none-any.whl (895 kB)
Collecting huggingface-hub==0.0.8
  Using cached huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.8 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.7.0


In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

# 2. Instantiate Model

https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

In [3]:
# Load the tokenizer and the sequence-classification model from the same
# checkpoint name so the two always stay in sync.
checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669M [00:00<?, ?B/s]

# 3. Encode and Calculate Sentiment

In [4]:
# Encode one sample sentence as a PyTorch tensor of token ids.
tokens = tokenizer.encode(
    'I hated this, absoluetly the worst',
    return_tensors='pt',
)

In [7]:
# Show the first row of the encoded tensor (the token ids of the sentence).
tokens[0]

tensor([  101,   151, 39487, 10163, 10372,   117, 63267, 49657, 10103, 43060,
          102])

In [8]:
# Decode the ids back into text to see what the tokenizer actually produced.
tokenizer.decode(tokens[0])

'[CLS] i hated this, absoluetly the worst [SEP]'

In [9]:
# Run the classifier on the encoded sentence.
result = model(tokens)

In [11]:
# Display the full model output object.
result

SequenceClassifierOutput(loss=None, logits=tensor([[ 4.4387,  2.0377, -0.5344, -2.7994, -2.4315]],
       grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [12]:
# Display only the logits (one row of class scores for the single input).
result.logits

tensor([[ 4.4387,  2.0377, -0.5344, -2.7994, -2.4315]],
       grad_fn=<AddmmBackward>)

In [13]:
# Position of the largest logit, shifted from a 0-based index to a 1-based
# value (presumably the model's rating scale -- confirm on the model card).
result.logits.argmax().item() + 1

1

# 4. Collect Reviews

In [34]:
# Download the business page; the timeout keeps the cell from hanging forever
# on an unresponsive server.
r = requests.get('https://www.yelp.com/biz/ramen-nagi-santa-clara', timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# Keep every <p> whose class attribute matches the "comment" pattern.
regex = re.compile('.*comment.*')
results = soup.find_all('p', {'class': regex})
reviews = [tag.text for tag in results]

In [35]:
# Peek at the first scraped text to check that the selector matched reviews.
reviews[0]

"Ramen Nagi, you son of a gun. Why the long lines man?! I'm tired of waiting 30 minutes to eat your deliciousness. Stop being so good at what you offer. Give me the Red King with the normal everything and extra garlic flavor. Give me the thin noodles firm because I like it like that. Toss in a soft boiled eggs and let me appreciate it because I tried last night and all but one came out not the way I see it here. Oh oh. Can't forget the karaage! Jeehs! Your appetizer is delicious too. Long line, I'm forced to wait to get my ramen fix taken care of. Miss you. Had you a day ago. Oh no. I became the crazy ex."

# 5. Load Reviews into DataFrame and Score

In [36]:
# Build the frame straight from the list of strings. Going through np.array
# first would copy every review into a fixed-width unicode array sized to the
# longest one, for no benefit.
df = pd.DataFrame({'review': reviews})

In [37]:
# Display the scraped reviews before scoring.
df

Unnamed: 0,review
0,"Ramen Nagi, you son of a gun. Why the long lin..."
1,Ramen Nagi wins on all levels for me. Living i...
2,"Went here on a weekday right when it opened, a..."
3,This spot was definitely worth the hype. I wai...
4,Just adding onto the plethora of reviews ravin...
5,THERE IS ALMOST ALWAYS A LINE AT THIS PLACE BU...
6,It was my pleasure trying Ramen Nagi's Red Kin...
7,This spot has all the umami flavors but it can...
8,Definitely one of the better ramen places in S...
9,Pros - Must try place for ramen in the Bay Are...


In [38]:
def sentiment_score(review, max_length=512):
    """Return the 1-based index of the highest-scoring class for ``review``.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    review : str
        Text to classify.
    max_length : int, optional
        Upper bound on the encoded sequence length. Inputs that encode to
        more tokens are truncated by the tokenizer instead of being passed
        to the model at full length. Defaults to 512.

    Returns
    -------
    int
        ``argmax`` over the model's logits, plus one.
    """
    tokens = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits)) + 1

In [39]:
# Quick check of the helper on the first review, passed in full (no slicing).
sentiment_score(df['review'].iloc[0])

2

In [40]:
# NOTE: x[:512] keeps the first 512 *characters* of each review, not 512
# tokens. It is only a rough guard for the model's 512-token input limit.
df['sentiment'] = df['review'].apply(lambda x: sentiment_score(x[:512]))  # character slice, not a token count

In [41]:
# Display the reviews together with their computed sentiment column.
df

Unnamed: 0,review,sentiment
0,"Ramen Nagi, you son of a gun. Why the long lin...",1
1,Ramen Nagi wins on all levels for me. Living i...,5
2,"Went here on a weekday right when it opened, a...",4
3,This spot was definitely worth the hype. I wai...,5
4,Just adding onto the plethora of reviews ravin...,5
5,THERE IS ALMOST ALWAYS A LINE AT THIS PLACE BU...,5
6,It was my pleasure trying Ramen Nagi's Red Kin...,4
7,This spot has all the umami flavors but it can...,4
8,Definitely one of the better ramen places in S...,5
9,Pros - Must try place for ramen in the Bay Are...,5


# 6. Filmweb Version (not working because the reviews are in Polish)

In [51]:
# Download the discussion page; the timeout keeps the cell from hanging
# forever on an unresponsive server.
r = requests.get('https://www.filmweb.pl/film/Cruella-2021-754536/discussion', timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# Keep every <div> whose class attribute matches the content-wrapper pattern.
regex = re.compile('.*forumSection__contentWrapper.*')
results = soup.find_all('div', {'class': regex})
reviews = [tag.text for tag in results]

In [52]:
# Peek at the first scraped forum post.
reviews[0]

'Dla mnie Cruella jest tylko jedna!'

In [50]:
# Inspect the raw tag behind the first match.
# NOTE(review): this cell's execution count (50) is lower than the scraping
# cell's (51), and the saved output is an <li> element although the current
# selector collects <div> tags -- the saved output looks stale; re-run it.
results[0]

<li class="forumSection__item filmCategory" id="topic_3221482"><div class="forumSection__topRow"><div class="forumSection__avatar"><div class="userAvatar" data-id="4832772" data-image="" data-nick="Mickey1993"><a class="userAvatar__body" href="/user/Mickey1993"> <span class="userAvatar__imageWrap"> <img alt="Mickey1993" class="userAvatar__image" src="https://fwcdn.pl/front/ogfx/t.gif"/> </span> </a></div></div><div class="forumSection__right"><h3 class="forumSection__topicTitle"><a class="forumSection__itemLink" href=" /film/Cruella-2021-754536/discussion/Glenn+Close,3221482"> Glenn Close </a></h3><a href="/user/Mickey1993"><span class="forumSection__authorName" data-uname="Mickey1993" rel="4832772">Mickey1993</span></a></div></div><div class="forumSection__contentWrapper"><a class="forumSection__itemLink" href=" /film/Cruella-2021-754536/discussion/Glenn+Close,3221482"></a><p class="forumSection__topicText">Dla mnie Cruella jest tylko jedna!</p></div><div class="forumSection__bottomRo

In [53]:
# Build the frame straight from the list of strings; the np.array round trip
# only padded every review into a fixed-width unicode array.
df2 = pd.DataFrame({'review': reviews})
# x[:512] keeps the first 512 *characters* (not tokens) as a rough guard for
# the model's 512-token input limit.
df2['sentiment'] = df2['review'].apply(lambda x: sentiment_score(x[:512]))
df2

Unnamed: 0,review,sentiment
0,Dla mnie Cruella jest tylko jedna!,5
1,"Bardzo przyjemny film, gra aktorska mi się bar...",4
2,Nie bardzo wiem do kogo jest skierowany ten fi...,2
3,Jeden jedyny konkretny powód - Emma!,1
4,Na zwiastunie za bardzo kojarzy mi się z Harle...,3
5,rozrywkowe!,5
6,"Widzę, że niektórzy z miejsca skreślają ten fi...",2
7,"Londyn, lata 70te, punk, kostiumy, utwory muzy...",2
8,"Widzę, że od jakiegoś czasu twórcy filmowi nur...",4
9,ale szału nie ma.,3


# 7. IMDb

In [54]:
# Download the reviews page; the timeout keeps the cell from hanging forever
# on an unresponsive server.
r = requests.get('https://www.imdb.com/title/tt3228774/reviews', timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# Keep every <div> whose class attribute matches the review-text pattern.
regex = re.compile('.*text show-more__control.*')
results = soup.find_all('div', {'class': regex})
reviews = [tag.text for tag in results]


In [55]:
# Build the frame straight from the list of strings; the np.array round trip
# only padded every review into a fixed-width unicode array.
df3 = pd.DataFrame({'review': reviews})
# x[:512] keeps the first 512 *characters* (not tokens) as a rough guard for
# the model's 512-token input limit.
df3['sentiment'] = df3['review'].apply(lambda x: sentiment_score(x[:512]))
df3

Unnamed: 0,review,sentiment
0,"To be really honest, I had very low positive t...",5
1,It is one of the best villain origin stories t...,5
2,..the best movie I've seen in a decade. And I ...,5
3,Can you make an entertaining film by blending ...,2
4,When I heard that they we're going to make a l...,4
5,My faith in Walt is restored...Disney that is....,5
6,"89/100Cruella is simply put, phenomenal. A bea...",5
7,Cruella is full of style and a pretty good tim...,4
8,"*spoiler alert*\nLoved the scene, when cruella...",5
9,I think there are a lot of bots trying ti unde...,3
