In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from textblob import TextBlob
import torch
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../data/dataset_cleaned.csv")
df.head()

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,...,tripType,username,word_tokens,bpe_tokens,bigram_tokens,trigram_tokens,whitespace_tokens,rule_based_tokens,spacy_tokens,wordpiece_tokens
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,...,SOLO,219nikal,"['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","[('good', 'service'), ('service', 'ayu'), ('ay...","[('good', 'service', 'ayu'), ('service', 'ayu'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'..."
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,...,BUSINESS,rajacool1984itz,"['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","[('change', 'r'), ('r', 'lady'), ('lady', 'man...","[('change', 'r', 'lady'), ('r', 'lady', 'manag...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel..."
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,...,FAMILY,857navidj,"['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","[('perfect', 'liked'), ('liked', 'everything')...","[('perfect', 'liked', 'everything'), ('liked',...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k..."
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,FAMILY,809mickaelt,"['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","[('stay', 'unforgettable'), ('unforgettable', ...","[('stay', 'unforgettable', 'hotel'), ('unforge...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'..."
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,FAMILY,M1879HRchloet,"['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","[('amazing', 'hotel'), ('hotel', 'well'), ('we...","[('amazing', 'hotel', 'well'), ('hotel', 'well...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo..."


In [3]:
df.columns.tolist()

['helpfulVotes',
 'id',
 'address',
 'city',
 'placeName',
 'numberOfReviews',
 'placeRating',
 'webUrl',
 'publishedDate',
 'userRating',
 'roomTip',
 'review_text',
 'review_title',
 'travelDate',
 'tripType',
 'username',
 'word_tokens',
 'bpe_tokens',
 'bigram_tokens',
 'trigram_tokens',
 'whitespace_tokens',
 'rule_based_tokens',
 'spacy_tokens',
 'wordpiece_tokens']

TextBlob Sentiment

In [4]:
df['textblob_sentiment'] = df['review_text'].apply(lambda x: TextBlob(x).sentiment)

In [5]:
df.head()

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,...,username,word_tokens,bpe_tokens,bigram_tokens,trigram_tokens,whitespace_tokens,rule_based_tokens,spacy_tokens,wordpiece_tokens,textblob_sentiment
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,...,219nikal,"['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","[('good', 'service'), ('service', 'ayu'), ('ay...","[('good', 'service', 'ayu'), ('service', 'ayu'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","(0.47045454545454546, 0.6909090909090909)"
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,...,rajacool1984itz,"['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","[('change', 'r'), ('r', 'lady'), ('lady', 'man...","[('change', 'r', 'lady'), ('r', 'lady', 'manag...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","(-0.23333333333333334, 0.3333333333333333)"
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,...,857navidj,"['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","[('perfect', 'liked'), ('liked', 'everything')...","[('perfect', 'liked', 'everything'), ('liked',...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","(0.5700000000000001, 0.7700000000000001)"
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,809mickaelt,"['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","[('stay', 'unforgettable'), ('unforgettable', ...","[('stay', 'unforgettable', 'hotel'), ('unforge...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","(0.31100000000000005, 0.5439999999999999)"
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,M1879HRchloet,"['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","[('amazing', 'hotel'), ('hotel', 'well'), ('we...","[('amazing', 'hotel', 'well'), ('hotel', 'well...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","(0.4038302277432713, 0.6514492753623188)"


In [6]:
# map textblob_sentiment column to postive, neutral or negative
df['textblob_sentiment'] = df['textblob_sentiment'].apply(lambda x: 'positive' if x.polarity > 0 else 'negative' if x.polarity < 0 else 'neutral')

In [7]:
df.head()

Unnamed: 0,helpfulVotes,id,address,city,placeName,numberOfReviews,placeRating,webUrl,publishedDate,userRating,...,username,word_tokens,bpe_tokens,bigram_tokens,trigram_tokens,whitespace_tokens,rule_based_tokens,spacy_tokens,wordpiece_tokens,textblob_sentiment
0,0.0,978474125.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-11-03,5.0,...,219nikal,"['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","[('good', 'service'), ('service', 'ayu'), ('ay...","[('good', 'service', 'ayu'), ('service', 'ayu'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...","['good', 'service', 'ayu', 'friendly', 'staff'...",positive
1,0.0,978053018.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-30,1.0,...,rajacool1984itz,"['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","[('change', 'r'), ('r', 'lady'), ('lady', 'man...","[('change', 'r', 'lady'), ('r', 'lady', 'manag...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...","['change', 'r', 'lady', 'manager', 'r', 'hotel...",negative
2,0.0,976992067.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-28,5.0,...,857navidj,"['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","[('perfect', 'liked'), ('liked', 'everything')...","[('perfect', 'liked', 'everything'), ('liked',...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...","['perfect', 'liked', 'everything', 'staff', 'k...",positive
3,0.0,976690540.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,809mickaelt,"['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","[('stay', 'unforgettable'), ('unforgettable', ...","[('stay', 'unforgettable', 'hotel'), ('unforge...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...","['stay', 'unforgettable', 'hotel', 'beautiful'...",positive
4,0.0,976664122.0,"24 Dharmapala Mawatha Dharmapala Mawatha, Colo...",Colombo,Nh Collection Colombo,2392,4.5,https://www.tripadvisor.com/Hotel_Review-g2939...,2024-10-26,5.0,...,M1879HRchloet,"['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","[('amazing', 'hotel'), ('hotel', 'well'), ('we...","[('amazing', 'hotel', 'well'), ('hotel', 'well...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...","['amazing', 'hotel', 'well', 'situated', 'colo...",positive


Flair Sentiment

In [8]:
# ! pip install flair

In [9]:
from typing import Tuple

def analyze_sentiment_apply(df, column_name):
    tagger = Classifier.load('sentiment')
    
    def get_sentiment(text) -> Tuple[str, float]:
        sentence = Sentence(str(text))
        tagger.predict(sentence)
        return (sentence.labels[0].value, sentence.labels[0].score)
    
    # Apply the function and create two new columns
    df[['flair_sentiment', 'flair_confidence']] = df[column_name].apply(get_sentiment).apply(pd.Series)
    return df

In [10]:
df = analyze_sentiment_apply(df, 'review_text')

NameError: name 'Classifier' is not defined

In [None]:
df.head()

In [None]:
df.textblob_sentiment.value_counts()

In [None]:
df.flair_sentiment.value_counts()