## Imports

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import math

## Sentiment Score

In [23]:
def sentiment_analysis(text):
    """Score the sentiment of ``text`` with the nlptown multilingual BERT model.

    The tokenizer and model are loaded on the first call only and cached on
    the function object, so repeated calls (one per review or per chunk) do
    not re-load the weights every time.

    Parameters
    ----------
    text : str
        The text to classify.

    Returns
    -------
    int
        Index of the highest-scoring class plus one.
        NOTE(review): presumably a 1-5 star rating per the model card -- confirm.
    """
    model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

    # Lazy one-time load; re-running the defining cell resets the cache.
    if not hasattr(sentiment_analysis, '_cache'):
        sentiment_analysis._cache = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSequenceClassification.from_pretrained(model_name),
        )
    tokenizer, model = sentiment_analysis._cache

    # Truncate so inputs longer than 512 tokens do not overflow the model's
    # position embeddings.
    tokens = tokenizer.encode(
        text, return_tensors='pt', truncation=True, max_length=512
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        result = model(tokens)

    score = int(torch.argmax(result.logits)) + 1

    return score

## Reviews

In [3]:
def get_reviews(url, timeout=10):
    """Download a page and return the text of every review block found on it.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        Text content of each ``<div class="text show-more__control">``
        element, in document order. Empty if the page has no such element.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently parsing an error
    # page and returning an empty list.
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    reviews = soup.find_all('div', class_='text show-more__control')

    # Collect the plain text of each review element.
    return [review.text for review in reviews]

## Fetching Link

In [33]:
# Page to scrape: the reviews listing for IMDb title tt1877830
link = 'https://www.imdb.com/title/tt1877830/reviews?ref_=tt_urv'

# Pull the review texts from that page
reviews = get_reviews(link)

# One-column dataframe holding the scraped review texts
df = pd.DataFrame(data=np.array(reviews), columns=['reviews'])
df


Unnamed: 0,reviews
0,THE BATMAN (2022) is a movie where my expectat...
1,STAR RATING: ***** Brilliant **** Very Good **...
2,Have just came back from a screening of this &...
3,I never cared much for The Riddler and I don't...
4,Everything about this movie is trying too hard...
5,Matt Reeves is not a good director at all-- -h...
6,Couldn't even finish this drivel... Firstly : ...
7,Robert P is a terrible Batman. Absolutely no B...
8,I went in with ultra-low expectations because ...
9,"I just don't enjoyed it at all. It's too long,..."


In [30]:
# Score every review in the 'reviews' column. Reviews longer than CHUNK_SIZE
# characters are split into chunks, each chunk is scored separately, and the
# chunk scores are averaged into one integer score for the review.
CHUNK_SIZE = 512  # characters per piece passed to sentiment_analysis

score_lst = []

for review in df['reviews']:
    if len(review) > CHUNK_SIZE:
        # Stride through the whole text so the final, shorter remainder is
        # scored too; flooring len(review) / CHUNK_SIZE dropped that tail.
        chunks = [
            review[start:start + CHUNK_SIZE]
            for start in range(0, len(review), CHUNK_SIZE)
        ]
        chunk_scores = [sentiment_analysis(chunk) for chunk in chunks]
        # Mean of the chunk scores, rounded to the nearest integer
        score = round(sum(chunk_scores) / len(chunk_scores))
        score_lst.append(score)
    else:
        # Short review: score it in a single call
        review_score = sentiment_analysis(review)
        score_lst.append(review_score)
        




## Output

In [34]:
# Turn the collected per-review scores into a numpy array
score_lst_array = np.asarray(score_lst)

# Attach the scores to the dataframe as the 'sentiment' column and display it
df['sentiment'] = score_lst_array
df

Unnamed: 0,reviews,sentiment
0,THE BATMAN (2022) is a movie where my expectat...,2
1,STAR RATING: ***** Brilliant **** Very Good **...,3
2,Have just came back from a screening of this &...,2
3,I never cared much for The Riddler and I don't...,2
4,Everything about this movie is trying too hard...,2
5,Matt Reeves is not a good director at all-- -h...,2
6,Couldn't even finish this drivel... Firstly : ...,1
7,Robert P is a terrible Batman. Absolutely no B...,1
8,I went in with ultra-low expectations because ...,2
9,"I just don't enjoyed it at all. It's too long,...",2
