# Extracting the sentiment polarity from the tourist reviews in Vader & Roberta Model

In [1]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk

In [2]:
# Read in data
df = pd.read_csv('reviews1.csv')
print(df.shape)

(7271, 5)


**For the security purpose, drop the name column to maintain the privacy policy**

In [3]:
#dropping the one column
df.drop(['name'], axis = 1, inplace=True)

In [4]:
df.head()

Unnamed: 0,ID,location,total review,review
0,1,Syambhunath,46 reviews,It is at the top of valleys mountain. Best pl...
1,2,Syambhunath,132 reviews,This place has a significant importance in Bud...
2,3,Syambhunath,298 reviews,Visited this from the other side on a rainy ev...
3,4,Syambhunath,247 reviews,A beautiful temple situated in the capital wit...
4,5,Syambhunath,69 reviews,"great, beautiful, historic & religious place....."


In [5]:
#randomly check one example of review from the dataset
example = df['review'][560]
print(example)

Great place to visit. A UNESCO listed world heritage site. It Offers great birds eye view of KTM valley.
Climbing stairs will also make you helathy😂.
Deserves full 5 stars


In [6]:
#NLTK can tokenize the setence which splits all the words separately
tokens = nltk.word_tokenize(example)
tokens[:10]

['Great',
 'place',
 'to',
 'visit',
 '.',
 'A',
 'UNESCO',
 'listed',
 'world',
 'heritage']

In [7]:
#Part of speech tagging
tagged = nltk.pos_tag(tokens)
tagged[:10]

[('Great', 'JJ'),
 ('place', 'NN'),
 ('to', 'TO'),
 ('visit', 'VB'),
 ('.', '.'),
 ('A', 'DT'),
 ('UNESCO', 'NNP'),
 ('listed', 'VBN'),
 ('world', 'NN'),
 ('heritage', 'NN')]

In [8]:
# Drop rows with any empty cells
df.dropna(
    axis=0,
    how='any',
    thresh=None,
    subset=None,
    inplace=True
)

In [9]:
df

Unnamed: 0,ID,location,total review,review
0,1,Syambhunath,46 reviews,It is at the top of valleys mountain. Best pl...
1,2,Syambhunath,132 reviews,This place has a significant importance in Bud...
2,3,Syambhunath,298 reviews,Visited this from the other side on a rainy ev...
3,4,Syambhunath,247 reviews,A beautiful temple situated in the capital wit...
4,5,Syambhunath,69 reviews,"great, beautiful, historic & religious place....."
...,...,...,...,...
7266,7267,Pokhara,9 reviews,"It's a nice place to sit back, and enjoy. The ..."
7267,7268,Pokhara,3 reviews,"Excellent Place to visit, Lifetime memories"
7268,7269,Pokhara,79 reviews,It's very photogenic and relaxing when there a...
7269,7270,Pokhara,14 reviews,U can get real definition of nature's beauty a...


**All the rows which has any empty cells are removed now**

In [10]:
#maxent_ne_chunker contains two pre-trained English named entity chunkers trained on an ACE corpus
nltk.download('maxent_ne_chunker')


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\44758\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [11]:
nltk.download('words')
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

(S
  (GPE Great/JJ)
  place/NN
  to/TO
  visit/VB
  ./.
  A/DT
  (ORGANIZATION UNESCO/NNP)
  listed/VBN
  world/NN
  heritage/NN
  site/NN
  ./.
  It/PRP
  Offers/VBZ
  great/JJ
  birds/NNS
  eye/NN
  view/NN
  of/IN
  (ORGANIZATION KTM/NNP)
  valley/NN
  ./.
  Climbing/VBG
  stairs/NNS
  will/MD
  also/RB
  make/VB
  you/PRP
  helathy😂/VB
  ./.
  Deserves/VBZ
  full/JJ
  5/CD
  stars/NNS)


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\44758\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [12]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

sia = SentimentIntensityAnalyzer()

In [13]:
#show the polarity in terms of negative, neutral, positive and compound
sia.polarity_scores(example)


{'neg': 0.0, 'neu': 0.76, 'pos': 0.24, 'compound': 0.8481}

In [14]:
# Run the polarity score on the entire dataset and store the result in res
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['review']
    myid = row['ID'] 
    res[myid] = str(sia.polarity_scores(text)).encode('utf-8')


  0%|          | 0/7102 [00:00<?, ?it/s]

In [15]:
res

{1: b"{'neg': 0.0, 'neu': 0.633, 'pos': 0.367, 'compound': 0.9738}",
 2: b"{'neg': 0.0, 'neu': 0.635, 'pos': 0.365, 'compound': 0.9614}",
 3: b"{'neg': 0.023, 'neu': 0.69, 'pos': 0.287, 'compound': 0.9468}",
 4: b"{'neg': 0.0, 'neu': 0.67, 'pos': 0.33, 'compound': 0.9531}",
 5: b"{'neg': 0.0, 'neu': 0.577, 'pos': 0.423, 'compound': 0.9468}",
 6: b"{'neg': 0.0, 'neu': 0.682, 'pos': 0.318, 'compound': 0.9261}",
 7: b"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",
 8: b"{'neg': 0.0, 'neu': 0.785, 'pos': 0.215, 'compound': 0.91}",
 9: b"{'neg': 0.0, 'neu': 0.553, 'pos': 0.447, 'compound': 0.9527}",
 10: b"{'neg': 0.0, 'neu': 0.743, 'pos': 0.257, 'compound': 0.8555}",
 11: b"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'compound': 0.5423}",
 12: b"{'neg': 0.0, 'neu': 0.957, 'pos': 0.043, 'compound': 0.2023}",
 13: b"{'neg': 0.0, 'neu': 0.966, 'pos': 0.034, 'compound': 0.0772}",
 14: b"{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'compound': 0.7184}",
 15: b"{'neg': 0.0, 'neu': 0.782, 'pos'

In [16]:
#pd.DataFrame(res)
#pd.Series(res).to_frame()
res = pd.DataFrame(res, index=['polarity'])
res.T


Unnamed: 0,polarity
1,"b""{'neg': 0.0, 'neu': 0.633, 'pos': 0.367, 'co..."
2,"b""{'neg': 0.0, 'neu': 0.635, 'pos': 0.365, 'co..."
3,"b""{'neg': 0.023, 'neu': 0.69, 'pos': 0.287, 'c..."
4,"b""{'neg': 0.0, 'neu': 0.67, 'pos': 0.33, 'comp..."
5,"b""{'neg': 0.0, 'neu': 0.577, 'pos': 0.423, 'co..."
...,...
7267,"b""{'neg': 0.0, 'neu': 0.702, 'pos': 0.298, 'co..."
7268,"b""{'neg': 0.0, 'neu': 0.575, 'pos': 0.425, 'co..."
7269,"b""{'neg': 0.0, 'neu': 0.743, 'pos': 0.257, 'co..."
7270,"b""{'neg': 0.0, 'neu': 0.523, 'pos': 0.477, 'co..."


In [17]:
#Lets merge this into our original dataset and add a columns for polarity in left side with the name column vader
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index': 'ID'})
vaders = vaders.merge(df, how='left')

In [18]:
vaders

Unnamed: 0,ID,polarity,location,total review,review
0,1,"b""{'neg': 0.0, 'neu': 0.633, 'pos': 0.367, 'co...",Syambhunath,46 reviews,It is at the top of valleys mountain. Best pl...
1,2,"b""{'neg': 0.0, 'neu': 0.635, 'pos': 0.365, 'co...",Syambhunath,132 reviews,This place has a significant importance in Bud...
2,3,"b""{'neg': 0.023, 'neu': 0.69, 'pos': 0.287, 'c...",Syambhunath,298 reviews,Visited this from the other side on a rainy ev...
3,4,"b""{'neg': 0.0, 'neu': 0.67, 'pos': 0.33, 'comp...",Syambhunath,247 reviews,A beautiful temple situated in the capital wit...
4,5,"b""{'neg': 0.0, 'neu': 0.577, 'pos': 0.423, 'co...",Syambhunath,69 reviews,"great, beautiful, historic & religious place....."
...,...,...,...,...,...
7097,7267,"b""{'neg': 0.0, 'neu': 0.702, 'pos': 0.298, 'co...",Pokhara,9 reviews,"It's a nice place to sit back, and enjoy. The ..."
7098,7268,"b""{'neg': 0.0, 'neu': 0.575, 'pos': 0.425, 'co...",Pokhara,3 reviews,"Excellent Place to visit, Lifetime memories"
7099,7269,"b""{'neg': 0.0, 'neu': 0.743, 'pos': 0.257, 'co...",Pokhara,79 reviews,It's very photogenic and relaxing when there a...
7100,7270,"b""{'neg': 0.0, 'neu': 0.523, 'pos': 0.477, 'co...",Pokhara,14 reviews,U can get real definition of nature's beauty a...


# Roberta Model Sentiment Analysis

In [19]:
!pip install transformers



In [20]:
#import all the required libraries
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [21]:
# VADER results on example
print(example)
sia.polarity_scores(example)

Great place to visit. A UNESCO listed world heritage site. It Offers great birds eye view of KTM valley.
Climbing stairs will also make you helathy😂.
Deserves full 5 stars


{'neg': 0.0, 'neu': 0.76, 'pos': 0.24, 'compound': 0.8481}

In [22]:
!pip install torch



In [23]:
conda install pytorch torchvision -c pytorch

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.




  current version: 22.9.0
  latest version: 22.11.0

Please update conda by running

    $ conda update -n base -c defaults conda




In [24]:
#Pre trained model
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [25]:
# Run for Roberta Model
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores

array([0.00250564, 0.02825481, 0.9692397 ], dtype=float32)

In [26]:
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [27]:
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['review']
        myid = row['ID']
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both
    except RuntimeError:
        print(f'Broke for id {myid}')

  0%|          | 0/7102 [00:00<?, ?it/s]

In [28]:
 scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)

{'roberta_neg': 0.0025056351, 'roberta_neu': 0.028254809, 'roberta_pos': 0.9692397}


In [29]:
both

{'vader_neg': 0.0,
 'vader_neu': 0.161,
 'vader_pos': 0.839,
 'vader_compound': 0.886,
 'roberta_neg': 0.0043772506,
 'roberta_neu': 0.053909976,
 'roberta_pos': 0.94171274}

In [30]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'ID'})
results_df = results_df.merge(df, how='left')

In [31]:
results_df

Unnamed: 0,ID,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,location,total review,review
0,1,0.000,0.633,0.367,0.9738,0.002081,0.016537,0.981382,Syambhunath,46 reviews,It is at the top of valleys mountain. Best pl...
1,2,0.000,0.635,0.365,0.9614,0.001202,0.045450,0.953348,Syambhunath,132 reviews,This place has a significant importance in Bud...
2,3,0.023,0.690,0.287,0.9468,0.003178,0.033734,0.963088,Syambhunath,298 reviews,Visited this from the other side on a rainy ev...
3,4,0.000,0.670,0.330,0.9531,0.001387,0.020570,0.978043,Syambhunath,247 reviews,A beautiful temple situated in the capital wit...
4,5,0.000,0.577,0.423,0.9468,0.001907,0.031852,0.966241,Syambhunath,69 reviews,"great, beautiful, historic & religious place....."
...,...,...,...,...,...,...,...,...,...,...,...
7097,7267,0.000,0.702,0.298,0.9432,0.001783,0.022330,0.975886,Pokhara,9 reviews,"It's a nice place to sit back, and enjoy. The ..."
7098,7268,0.000,0.575,0.425,0.5719,0.002391,0.034836,0.962773,Pokhara,3 reviews,"Excellent Place to visit, Lifetime memories"
7099,7269,0.000,0.743,0.257,0.5367,0.003553,0.050240,0.946207,Pokhara,79 reviews,It's very photogenic and relaxing when there a...
7100,7270,0.000,0.523,0.477,0.8074,0.003837,0.157588,0.838574,Pokhara,14 reviews,U can get real definition of nature's beauty a...


**By now, we have got all the sentiment polarity of VADER and also the RoBERTa model in our dataset** 

In [32]:
#If-else condition to filter the sentiment based on roberta negative, neutral and positive

conditions = [
    (results_df['roberta_neg']> 0.40),
    (results_df['roberta_neu']>0.50 ),
    (results_df['roberta_pos']>0.60)]

values = ['-1', '0', '1']
results_df['sentiment']= np.select(conditions, values)

In [33]:
#Since the RoBERTa model has higher accuracy and precision, we take only the sentiment based pn roberts_neg, roberts_neu and roberta_pos
results_df

Unnamed: 0,ID,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,location,total review,review,sentiment
0,1,0.000,0.633,0.367,0.9738,0.002081,0.016537,0.981382,Syambhunath,46 reviews,It is at the top of valleys mountain. Best pl...,1
1,2,0.000,0.635,0.365,0.9614,0.001202,0.045450,0.953348,Syambhunath,132 reviews,This place has a significant importance in Bud...,1
2,3,0.023,0.690,0.287,0.9468,0.003178,0.033734,0.963088,Syambhunath,298 reviews,Visited this from the other side on a rainy ev...,1
3,4,0.000,0.670,0.330,0.9531,0.001387,0.020570,0.978043,Syambhunath,247 reviews,A beautiful temple situated in the capital wit...,1
4,5,0.000,0.577,0.423,0.9468,0.001907,0.031852,0.966241,Syambhunath,69 reviews,"great, beautiful, historic & religious place.....",1
...,...,...,...,...,...,...,...,...,...,...,...,...
7097,7267,0.000,0.702,0.298,0.9432,0.001783,0.022330,0.975886,Pokhara,9 reviews,"It's a nice place to sit back, and enjoy. The ...",1
7098,7268,0.000,0.575,0.425,0.5719,0.002391,0.034836,0.962773,Pokhara,3 reviews,"Excellent Place to visit, Lifetime memories",1
7099,7269,0.000,0.743,0.257,0.5367,0.003553,0.050240,0.946207,Pokhara,79 reviews,It's very photogenic and relaxing when there a...,1
7100,7270,0.000,0.523,0.477,0.8074,0.003837,0.157588,0.838574,Pokhara,14 reviews,U can get real definition of nature's beauty a...,1


In [34]:
results_df.drop(columns = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound', 'roberta_neg', 'roberta_neu', 'roberta_pos', 'total review'], axis=1, inplace=True)


In [35]:
#Let's check the 20 rows using the iloc function.
results_df.iloc[:20]

Unnamed: 0,ID,location,review,sentiment
0,1,Syambhunath,It is at the top of valleys mountain. Best pl...,1
1,2,Syambhunath,This place has a significant importance in Bud...,1
2,3,Syambhunath,Visited this from the other side on a rainy ev...,1
3,4,Syambhunath,A beautiful temple situated in the capital wit...,1
4,5,Syambhunath,"great, beautiful, historic & religious place.....",1
5,6,Syambhunath,The view is so pleased and pleasures that it t...,1
6,7,Syambhunath,It is a must visit heritage site in kathmandu....,1
7,8,Syambhunath,The Swayambhunath Stupa is one of the crowning...,1
8,9,Syambhunath,It’s up there but would definitely recommend d...,1
9,10,Syambhunath,One of the best stupa in Kathmandu. Its locate...,1


In [36]:
#Save the dataset with ID, review and sentiment column for the further sentiment analysis 
results_df.to_csv('final roberta sentiment.csv', index=False)
