In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline
from textblob import TextBlob
import torch

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load Dataset

In [None]:
# Read the topic-modelling results workbook from the shared Google Drive folder
# (the path lives under the /content/drive mount point).
df_TM_reviews = pd.read_excel(
    '/content/drive/MyDrive/BAP Practice Module/BAP Final Report/'
    'Data/Processing/df_reviews_topicmodelling_results.xlsx'
)

# Alternative: read the same workbook from a local file upload instead.
# Keep the variable name `df_TM_reviews` so the cells below still work.
# df_TM_reviews = pd.read_excel('df_reviews_topicmodelling_results.xlsx')

In [None]:
# Print column names, dtypes and non-null counts of the loaded frame.
df_TM_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39282 entries, 0 to 39281
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   listing_id          39282 non-null  int64         
 1   id                  39282 non-null  int64         
 2   date                39282 non-null  datetime64[ns]
 3   reviewer_id         39282 non-null  int64         
 4   reviewer_name       39282 non-null  object        
 5   comments_tokens     39282 non-null  object        
 6   topic_distribution  39282 non-null  object        
 7   dominant_topic      39282 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 2.4+ MB


# TextBlob

In [None]:
# TextBlob-based sentiment scoring for a single piece of text
def get_sentiment(text):
    """Score one text with TextBlob and return its polarity and subjectivity.

    The input is coerced with ``str()`` before analysis, so non-string values
    are scored by their string representation.

    Returns:
        pandas.Series indexed by ``'polarity'`` and ``'subjectivity'``.
    """
    scores = TextBlob(str(text)).sentiment
    return pd.Series(
        [
            scores.polarity,       # [-1.0, 1.0]
            scores.subjectivity,   # [0.0, 1.0]
        ],
        index=['polarity', 'subjectivity'],
    )

# Score every review. get_sentiment returns a Series per row, so the result is
# a DataFrame with 'polarity' and 'subjectivity' columns on the same index.
sentiments = df_TM_reviews['comments_tokens'].apply(get_sentiment)

# Attach the scores by column name instead of pd.concat(..., axis=1).
# assign() overwrites columns that already exist, so re-running this cell does
# not leave duplicate 'polarity'/'subjectivity' columns (which would turn
# df_TM_reviews['polarity'] into a DataFrame and break the labelling below).
df_TM_reviews = df_TM_reviews.assign(
    polarity=sentiments['polarity'],
    subjectivity=sentiments['subjectivity'],
)

# Categorize the polarity into sentiment labels:
# > 0 -> positive, < 0 -> negative, anything else (exactly 0) -> neutral
df_TM_reviews['sentiment_label'] = df_TM_reviews['polarity'].apply(
    lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral'
)

# Save results to a new Excel file (relative path: written to the current
# working directory, not to the Drive folder the input was read from)
output_file = 'df_reviews_with_sentiment.xlsx'
df_TM_reviews.to_excel(output_file, index=False)

print(f"Sentiment analysis saved to {output_file}")

Sentiment analysis saved to df_reviews_with_sentiment.xlsx


# BERT (HuggingFace Transformers)

In [None]:
# Build the pre-trained BERT sentiment pipeline, on the first CUDA device when
# one is available and on CPU (-1) otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

bert_classifier = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=device,
)

def bert_sentiment(text):
    """Classify one text as 'negative', 'neutral' or 'positive' using `bert_classifier`.

    The input is coerced with ``str()`` (matching the TextBlob path), so
    non-string cells no longer raise ``TypeError``.

    The classifier's label is mapped as follows: a label containing '1' or '2'
    is 'negative', a label containing '3' is 'neutral', anything else is
    'positive'.

    Returns:
        str: one of 'negative', 'neutral', 'positive'.
    """
    # Let the tokenizer truncate to 512 tokens. Slicing the string
    # (text[:512]) cuts by characters, not tokens, so it neither matches the
    # stated limit nor guarantees the encoded input fits the model.
    result = bert_classifier(str(text), truncation=True, max_length=512)[0]
    label = result['label']
    if '1' in label or '2' in label:
        return 'negative'
    if '3' in label:
        return 'neutral'
    return 'positive'

# Label every review with the BERT classifier, one row per call
df_TM_reviews['sentiment_bert'] = df_TM_reviews['comments_tokens'].map(bert_sentiment)

# Write the frame, now including the BERT labels, to a second Excel file
output_file = 'df_reviews_with_sentiment2.xlsx'
df_TM_reviews.to_excel(excel_writer=output_file, index=False)

print(f"Sentiment analysis saved to {output_file}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Sentiment analysis saved to df_reviews_with_sentiment2.xlsx


# Comparing Results

In [None]:
# Compare sentiment predictions across methods.
# Take an explicit copy: adding a column to a plain column-slice of
# df_TM_reviews triggers pandas' SettingWithCopyWarning.
comparison_df = df_TM_reviews[['comments_tokens', 'sentiment_label', 'sentiment_bert']].copy()

# Summary stats: label counts per method, side by side.
# A label that one method never produced would show up as NaN; report it as 0.
summary = (
    comparison_df[['sentiment_label', 'sentiment_bert']]
    .apply(pd.Series.value_counts)
    .fillna(0)
    .astype(int)
)
print(summary)

# Agreement rate: share of rows where both methods give the same label
comparison_df['agreement'] = (
    comparison_df['sentiment_label'] == comparison_df['sentiment_bert']
)
print(f"Agreement across methods: {comparison_df['agreement'].mean() * 100:.2f}%")

          sentiment_label  sentiment_bert
positive            36215           36348
neutral              1795            1678
negative             1272            1256
Agreement across methods: 89.73%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comparison_df['agreement'] = (
