In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from textblob import TextBlob
import torch
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon once per run; a second call in the same cell only
# re-checks a package that nltk already reports as up-to-date.
nltk.download('vader_lexicon')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\misha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\misha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### Create the sentiment labels

#### VADER

In [2]:
# Initialize the VADER sentiment analyzer once; the same `sia` instance is
# reused to score every review in the cell below.
sia = SentimentIntensityAnalyzer()

##### Apply VADER to the hotel reviews

In [3]:
# Load the cleaned review dataset.
df = pd.read_csv('../data/dataset_cleaned.csv')

# Score every review with VADER: keep the full polarity dict, then pull out
# its 'compound' entry into its own column.
df['vader_score'] = df['review_text'].map(lambda text: sia.polarity_scores(str(text)))
df['compound_score'] = df['vader_score'].map(lambda scores: scores['compound'])

# Bucket the compound score: <= -0.05 is Negative, >= 0.05 is Positive,
# anything strictly in between is Neutral.
df['vader_category'] = df['compound_score'].map(
    lambda score: 'Negative' if score <= -0.05 else ('Positive' if score >= 0.05 else 'Neutral')
)

#### Flair

In [4]:
from typing import Tuple
from flair.data import Sentence
from flair.nn import Classifier

def analyze_sentiment_apply(df, column_name):
    """Label every value of ``df[column_name]`` with Flair's 'sentiment' classifier.

    Args:
        df: DataFrame holding the text to classify. It is modified in place.
        column_name: Name of the column to classify; each value is converted
            with ``str()`` before being wrapped in a ``Sentence``.

    Returns:
        The same ``df`` object with two columns added (or overwritten):
        ``flair_sentiment`` (the first predicted label's value) and
        ``flair_confidence`` (that label's score). Rows for which the tagger
        attaches no label get ``None`` / ``NaN`` instead of raising
        ``IndexError``.
    """
    # Load the model once per call rather than once per row.
    tagger = Classifier.load('sentiment')

    def get_sentiment(text) -> tuple:
        """Return ``(label_value, score)`` for one piece of text."""
        sentence = Sentence(str(text))
        tagger.predict(sentence)
        labels = sentence.labels
        if not labels:
            # NOTE(review): presumably occurs for empty / whitespace-only text,
            # where there is nothing to classify -- confirm against the flair
            # version in use.
            return (None, float('nan'))
        top_label = labels[0]
        return (top_label.value, top_label.score)

    # Collect the predictions once and assign plain lists: this avoids
    # `.apply(pd.Series)`, which constructs a pandas Series for every row.
    predictions = [get_sentiment(text) for text in df[column_name]]
    df['flair_sentiment'] = [label for label, _ in predictions]
    df['flair_confidence'] = [float(score) for _, score in predictions]
    return df

In [5]:
# Adds the flair_sentiment and flair_confidence columns for each review_text;
# the function returns the frame it was given, so `df` is rebound to itself.
df = analyze_sentiment_apply(df, 'review_text')

#### TextBlob

In [6]:
# Store TextBlob's sentiment result for every review.
# str() mirrors the VADER and Flair cells, so a non-string value (e.g. a NaN
# from a missing review) is scored as text instead of being rejected by TextBlob.
df['textblob_sentiment'] = df['review_text'].apply(lambda x: TextBlob(str(x)).sentiment)

In [7]:
# Collapse each TextBlob sentiment to a label by the sign of its polarity:
# below zero -> negative, above zero -> positive, otherwise neutral.
df['textblob_sentiment'] = [
    'negative' if sentiment.polarity < 0
    else 'positive' if sentiment.polarity > 0
    else 'neutral'
    for sentiment in df['textblob_sentiment']
]

### Voting Scheme

#### Majority Vote

In [8]:
# Preview the first rows, including the VADER, Flair and TextBlob columns added above.
df.head()

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,...,whitespace_tokens,rule_based_tokens,spacy_tokens,wordpiece_tokens,vader_score,compound_score,vader_category,flair_sentiment,flair_confidence,textblob_sentiment
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,...,"['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","{'neg': 0.0, 'neu': 0.389, 'pos': 0.611, 'comp...",0.9868,Positive,POSITIVE,0.943697,positive
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,...,"['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","{'neg': 0.173, 'neu': 0.762, 'pos': 0.066, 'co...",-0.399,Negative,POSITIVE,0.590723,negative
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,...,"['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","{'neg': 0.079, 'neu': 0.334, 'pos': 0.587, 'co...",0.9796,Positive,POSITIVE,0.922921,positive
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,"['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","{'neg': 0.0, 'neu': 0.68, 'pos': 0.32, 'compou...",0.9468,Positive,POSITIVE,0.996439,positive
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,"['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","{'neg': 0.024, 'neu': 0.522, 'pos': 0.455, 'co...",0.9961,Positive,POSITIVE,0.991161,positive


In [40]:
from collections import Counter

def majority_vote(votes):
  """
  Return the label that holds a strict majority among ``votes``.

  Args:
    votes: Iterable of hashable sentiment labels (e.g. one per classifier).

  Returns:
    The most frequent label if it accounts for more than half of the votes;
    otherwise "neutral". An empty ``votes`` also yields "neutral", since no
    label can hold a majority.
  """
  # Materialize once so generators are accepted and len() is always valid.
  votes = list(votes)
  if not votes:
    return "neutral"

  winner, winner_count = Counter(votes).most_common(1)[0]

  # Strict majority: exactly half of the votes (a tie) is not enough.
  if winner_count > len(votes) / 2:
    return winner
  return "neutral"

In [51]:
# List every column currently in the frame.
df.columns

Index(['helpfulVotes', 'id', 'address', 'city', 'placeName', 'numberOfReviews',
       'placeRating', 'webUrl', 'publishedDate', 'userRating', 'roomTip',
       'review_text', 'review_title', 'travelDate', 'tripType', 'username',
       'word_tokens', 'bpe_tokens', 'bigram_tokens', 'trigram_tokens',
       'whitespace_tokens', 'rule_based_tokens', 'spacy_tokens',
       'wordpiece_tokens', 'vader_score', 'compound_score', 'vader_category',
       'flair_sentiment', 'flair_confidence', 'textblob_sentiment'],
      dtype='object')

In [52]:
# Lower-case the three label columns so every classifier uses the same
# spelling of its labels.
sentiment_cols = ['vader_category', 'flair_sentiment', 'textblob_sentiment']
df[sentiment_cols] = df[sentiment_cols].apply(lambda labels: labels.str.lower())

In [53]:
# Count how often each combination of the three classifiers' labels occurs.
df[['vader_category', 'flair_sentiment', 'textblob_sentiment']].value_counts()

vader_category  flair_sentiment  textblob_sentiment
positive        positive         positive              10204
                negative         positive                982
negative        negative         negative                311
                                 positive                123
positive        negative         negative                109
                positive         negative                 18
negative        positive         positive                 14
neutral         negative         positive                 11
                                 negative                 10
negative        positive         negative                  9
neutral         positive         positive                  6
negative        negative         neutral                   3
positive        negative         neutral                   3
                positive         neutral                   3
neutral         positive         negative                  1
                                 

In [None]:
# Derive one label per review by passing the three classifiers' labels to majority_vote.
df['ground_truth'] = [
    majority_vote([vader_label, flair_label, textblob_label])
    for vader_label, flair_label, textblob_label in zip(
        df['vader_category'], df['flair_sentiment'], df['textblob_sentiment']
    )
]

In [None]:
# Count each combination of the three classifier labels together with the voted ground_truth label.
df[['vader_category', 'flair_sentiment', 'textblob_sentiment', 'ground_truth']].value_counts()

vader_category  flair_sentiment  textblob_sentiment  majority_vote
positive        positive         positive            positive         10204
                negative         positive            positive           982
negative        negative         negative            negative           311
                                 positive            negative           123
positive        negative         negative            negative           109
                positive         negative            positive            18
negative        positive         positive            positive            14
neutral         negative         positive            neutral             11
                                 negative            negative            10
negative        positive         negative            negative             9
neutral         positive         positive            positive             6
negative        negative         neutral             negative             3
positive        negat

In [None]:
# Show how many reviews received each ground_truth label.
df['ground_truth'].value_counts()

majority_vote
positive    11227
negative      565
neutral        16
Name: count, dtype: int64

In [61]:
# Persist the labelled reviews. The output directory is created first because
# to_csv does not create missing parent directories.
from pathlib import Path

labelled_path = Path('../data/processed/labelled_data.csv')
labelled_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(labelled_path, index=False)