In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from rake_nltk import Rake

  from .autonotebook import tqdm as notebook_tqdm


## EDA

In [3]:
# Load the ratings/comments table and show the non-null count of each column.
comments = pd.read_csv('data/RatingsComments.csv')
comments.count()

name      125767
rating    125767
review    125766
dtype: int64

In [4]:
# Keep only the first row for each distinct review text, then save the result
# without the index column.
# NOTE(review): the output path is the same file this notebook loads with
# pd.read_csv, so the source data is overwritten on every run — confirm that
# this is intended.
new_c = comments.drop_duplicates(subset='review')
new_c.to_csv('data/RatingsComments.csv', index=False)

In [5]:
# Non-null count per column after de-duplication, for comparison with the raw load.
new_c.count()

name      125767
rating    125767
review    125766
dtype: int64

In [6]:
# Preview the first rows of the loaded table.
comments.head()

Unnamed: 0,name,rating,review
0,Jalsa,4.0,A beautiful place to dine inThe interiors take...
1,Jalsa,4.0,I was here for dinner with my family on a week...
2,Jalsa,2.0,Its a restaurant near to Banashankari BDA Me a...
3,Jalsa,4.0,We went here on a weekend and one of us had th...
4,Jalsa,5.0,The best thing about the place is its ambiance...


In [7]:
# Select the reviews of a single restaurant. The de-duplicated frame `new_c` is
# used instead of the raw `comments`, so a repeated review is scored only once.
jalsa = new_c[new_c.name == 'Jalsa']
# Drop missing reviews: the `review` column is not fully populated, the
# tokenizer expects string input, and the scoring loop only catches
# RuntimeError, so a NaN entry would abort the run.
jalsa_comments = jalsa.review.dropna().values

## Sentiment analysis

In [8]:
# Hugging Face checkpoint identifier; the tokenizer and the classification
# model are both loaded from it.
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [9]:
def polarity_scores_roberta(comment, max_length=512):
    """Score a single comment with the RoBERTa sentiment model.

    Args:
        comment: Text to classify.
        max_length: Maximum number of tokens passed to the model. Longer
            inputs are truncated to this length instead of failing inside
            the forward pass.

    Returns:
        dict with the keys 'roberta_neg', 'roberta_neu' and 'roberta_pos',
        holding the softmax of the model's three logits in that order.
    """
    # Truncate explicitly: with no cap, a long review yields more tokens than
    # the model has position embeddings for and the forward pass raises.
    encoded_text = tokenizer(
        comment,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # output[0] is the logits tensor; the second [0] picks the only batch item.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }

In [10]:
# Score every comment; `res` maps the comment's position in `jalsa_comments`
# to its score dict. Comments whose scoring raises RuntimeError are reported
# and left out of `res`.
res = {}
progress = tqdm(jalsa_comments, total=len(jalsa_comments))
for i, comment in enumerate(progress):
    try:
        roberta_result = polarity_scores_roberta(comment)
    except RuntimeError:
        print(f'Broke for id {i}')
    else:
        res[i] = roberta_result

100%|██████████| 45/45 [00:04<00:00,  9.59it/s]


In [11]:
# `res` is keyed by comment position, so the frame is built with one column per
# comment and then flipped to one row per comment, one column per score.
scores_by_comment = pd.DataFrame(res)
results = scores_by_comment.transpose()
results

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
0,0.021544,0.071347,0.907109
1,0.001573,0.021282,0.977145
2,0.54648,0.361986,0.091534
3,0.00108,0.011017,0.987903
4,0.002381,0.018512,0.979107
5,0.003821,0.030121,0.966058
6,0.00118,0.019388,0.979432
7,0.066148,0.31912,0.614731
8,0.001463,0.013353,0.985184
9,0.02402,0.130974,0.845006


In [12]:
# Row positions of the ten highest positive and ten highest negative scores,
# highest first (ascending argsort, reversed, then cut to ten).
pos_order = np.argsort(results['roberta_pos'].to_numpy())
neg_order = np.argsort(results['roberta_neg'].to_numpy())
top_positive = pos_order[::-1][:10]
top_negative = neg_order[::-1][:10]

In [13]:
# Score rows at the positions held in `top_positive`.
results.iloc[top_positive]

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
3,0.00108,0.011017,0.987903
44,0.001816,0.010814,0.987369
28,0.001798,0.012388,0.985815
8,0.001463,0.013353,0.985184
15,0.001701,0.014223,0.984075
19,0.001602,0.014444,0.983955
14,0.00127,0.016438,0.982292
32,0.001475,0.01744,0.981084
6,0.00118,0.019388,0.979432
4,0.002381,0.018512,0.979107


In [14]:
# Text of the comment with the highest positive score.
jalsa_comments[top_positive[0]]

'We went here on a weekend and one of us had the buffet while two of us took Ala Carte Firstly the ambience and service of this place is great The buffet had a lot of items and the good was good We had a Pumpkin Halwa intm the dessert which was amazing Must try The kulchas are great here Cheers'

In [15]:
# Text of the comment with the highest negative score.
jalsa_comments[top_negative[0]]

'For team lunch this was the choice we made to try and unfortunately they disappointed most of us\n\nInitially they served us cold food and looked they were in rush to feed us and kick out lol but this was the feeling we all had\n\nWe raised concern to manager and he started sending hot starters on our table however they started ignoring and became very slow which was odd I wish they could understand that it takes time to eat what is already served\n\nAnyways not so great buffet menu and taste specially during lunch time It may be different at dinner time Well I am not ready to try'

## Keyphrases

In [16]:
def unite(comments: np.ndarray) -> str:
    """Concatenate comments into one string, each followed by a single space.

    Args:
        comments: Array (or any iterable) of comments; each element is
            rendered with f-string formatting, so non-string items are
            converted to text.

    Returns:
        The concatenated text. Every comment, including the last, is followed
        by one space; an empty input gives ''.
    """
    # str.join assembles the result in one pass instead of repeated `+=`.
    return ''.join(f'{comment} ' for comment in comments)

In [17]:
# Run RAKE over the text of the comments selected by `top_positive` and show
# the ten highest-ranked phrases.
rake = Rake()
positive_text = unite(jalsa_comments[top_positive])
rake.extract_keywords_from_text(positive_text)
rake.get_ranked_phrases()[:10]

['right food taste 45 ambience 455 quality 45 quantity 45 service 45 chances',
 'time line amazing interior good lighting stuff',
 'disappointed us test quality service staff',
 'must try item egg fried rice',
 'lal mirch ka paneer tikka',
 'us took ala carte firstly',
 'tasty food cheese chilli paratha',
 'asked nice food serve biryani',
 'lip smacking chicken platter',
 'bhutta palak methi curry']

In [18]:
# Run RAKE over the text of the comments selected by `top_negative` and show
# the ten highest-ranked phrases.
rake = Rake()
negative_text = unite(jalsa_comments[top_negative])
rake.extract_keywords_from_text(negative_text)
rake.get_ranked_phrases()[:10]

['try good things firstthe ambience 55 service decent staff attitude didnt turn',
 'ordered starters macchi ajwaini tikka cheese bombs paneer laziz murg achari tikka',
 'lip smacking delicious main course paneer lababdar gosht nihari',
 'karela salad 3 decor 4 restroom',
 'outer ring road marthahalli coming',
 'mughal empire proper sofa seating',
 'us food tasted average nothing exceptional',
 'macchi ajwaini tikka',
 'thus always order alacarte service',
 '5 main course place needs']