# Sentiment Processing

This notebook applies natural language processing techniques to customer
review text to extract sentiment information. Review-level sentiment
scores are aggregated to the hotel-month level and merged with the cleaned
operational hotel data for subsequent analysis.


In [1]:
# Load the customer review dataset to initiate sentiment analysis.
# The CSV path is relative, so it resolves against the kernel's working directory.

import pandas as pd
reviews = pd.read_csv("reviews.csv")
# Preview the first rows as the cell's displayed output.
reviews.head()


Unnamed: 0,ReviewID,HotelID,City,ReviewDate,Rating,ReviewText
0,1,14,Rome,15/09/2023,3,average experience
1,2,37,Berlin,22/10/2022,5,would definitely return
2,3,27,Madrid,20/04/2022,1,would not recommend
3,4,29,London,24/04/2022,4,amazing location
4,5,6,London,19/09/2023,5,would definitely return


In [2]:
# Inspect dataset structure and data types to verify successful ingestion:
# prints the per-column non-null counts and dtypes of `reviews`.
reviews.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2116 entries, 0 to 2115
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ReviewID    2116 non-null   int64 
 1   HotelID     2116 non-null   int64 
 2   City        2116 non-null   object
 3   ReviewDate  2116 non-null   object
 4   Rating      2116 non-null   int64 
 5   ReviewText  2116 non-null   object
dtypes: int64(3), object(3)
memory usage: 99.3+ KB


In [3]:
# Install and initialise the VADER sentiment model for polarity scoring
!pip install vaderSentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [4]:
# Score every review and store the result in a new SentimentScore column.
def get_sentiment(text):
    """Return the "compound" entry of the analyzer's polarity scores for `text`.

    The input is converted with str() before scoring, so non-string values
    are scored on their string representation.
    """
    scores = analyzer.polarity_scores(str(text))
    return scores["compound"]

reviews["SentimentScore"] = reviews["ReviewText"].map(get_sentiment)

# Show the text next to its score for a quick sanity check.
reviews[["ReviewText", "SentimentScore"]].head()


Unnamed: 0,ReviewText,SentimentScore
0,average experience,0.0
1,would definitely return,0.4019
2,would not recommend,-0.2755
3,amazing location,0.5859
4,would definitely return,0.4019


In [5]:
# Derive monthly periods from review dates to align sentiment with operational KPIs.
# ReviewDate is stored day-first (e.g. "15/09/2023"), so the format is stated
# explicitly. Without it pandas has to infer the layout, which emits a warning
# and risks reading ambiguous values such as "05/09/2023" month-first.
reviews["ReviewDate"] = pd.to_datetime(reviews["ReviewDate"], format="%d/%m/%Y")
# "YYYY-MM" strings, the same representation as the Month column of the
# operational dataset that this is later merged with.
reviews["Month"] = reviews["ReviewDate"].dt.to_period("M").astype(str)

reviews[["HotelID", "Month", "SentimentScore"]].head()


  reviews["ReviewDate"] = pd.to_datetime(reviews["ReviewDate"])


Unnamed: 0,HotelID,Month,SentimentScore
0,14,2023-09,0.0
1,37,2022-10,0.4019
2,27,2022-04,-0.2755
3,29,2022-04,0.5859
4,6,2023-09,0.4019


In [6]:
# Collapse review-level scores to one row per (HotelID, Month):
#   MeanSentiment - average SentimentScore of that hotel-month's reviews
#   ReviewCount   - number of non-null SentimentScore values in that hotel-month
scores_by_hotel_month = reviews.groupby(["HotelID", "Month"])["SentimentScore"]
monthly_sentiment = scores_by_hotel_month.agg(
    MeanSentiment="mean",
    ReviewCount="count",
).reset_index()
del scores_by_hotel_month  # keep the notebook namespace unchanged

monthly_sentiment.head()


Unnamed: 0,HotelID,Month,MeanSentiment,ReviewCount
0,1,2022-01,0.279233,3
1,1,2022-02,0.05652,5
2,1,2022-03,0.39,3
3,1,2022-04,0.4136,2
4,1,2022-05,-0.5095,2


In [7]:
# Load the cleaned operational dataset for integration with sentiment metrics.
# The path is relative, matching the other reads of this file in the notebook,
# so the cell does not depend on an absolute /content directory existing.
df = pd.read_csv("hotel_data_clean.csv")
df.head()


Unnamed: 0,HotelID,Month,OccupiedRoomNights,MonthStart,DaysInMonth,TotalRooms,OccupancyRate,ADR,CancellationRate
0,1,2022-01,21,2022-01-01,31,122,0.222105,243.704286,0.0
1,1,2022-02,18,2022-02-01,28,122,0.210773,215.478333,0.333333
2,1,2022-03,3,2022-03-01,31,122,0.031729,219.2325,0.5
3,1,2022-04,19,2022-04-01,30,122,0.20765,213.618571,0.428571
4,1,2022-05,9,2022-05-01,31,122,0.095188,213.473333,0.333333


In [8]:
# Generate descriptive statistics to validate aggregated sentiment behaviour:
# summary statistics for the numeric columns of the hotel-month sentiment table.
monthly_sentiment.describe()


Unnamed: 0,HotelID,MeanSentiment,ReviewCount
count,846.0,846.0,846.0
mean,20.466903,0.284004,2.501182
std,11.553968,0.275906,1.328566
min,1.0,-0.5423,1.0
25%,11.0,0.133012,2.0
50%,20.5,0.316038,2.0
75%,31.0,0.494075,3.0
max,40.0,0.762,8.0


In [9]:
# Reload operational dataset to ensure a clean merge baseline.
# `ops` is the left-hand table of the merge that follows.
ops = pd.read_csv("hotel_data_clean.csv")


In [10]:
# Left-join the monthly sentiment features onto the operational rows by
# HotelID and Month. Every operational row is kept; hotel-months without
# reviews get NaN in MeanSentiment and ReviewCount.
final_df = pd.merge(
    ops,
    monthly_sentiment,
    how="left",
    on=["HotelID", "Month"],
)


In [11]:
# Summarise the modelling features straight after the merge, before any
# imputation or row removal, so missing values show up in the counts.
final_df.loc[
    :,
    ["OccupancyRate", "ADR", "CancellationRate", "MeanSentiment", "ReviewCount"],
].describe()


Unnamed: 0,OccupancyRate,ADR,CancellationRate,MeanSentiment,ReviewCount
count,962.0,952.0,952.0,824.0,824.0
mean,0.19116,205.394496,0.219006,0.28279,2.51699
std,0.124924,40.806526,0.183896,0.275889,1.329981
min,0.005538,120.57,0.0,-0.5423,1.0
25%,0.103295,176.4195,0.0,0.133237,2.0
50%,0.165937,206.12925,0.2,0.315858,2.0
75%,0.249099,226.52,0.333333,0.4939,3.0
max,0.880696,297.69,1.0,0.762,8.0


In [12]:
# Re-read the operational dataset from the same file as the earlier `ops` load,
# so the merge below starts again from the on-disk data.
ops = pd.read_csv("hotel_data_clean.csv")


In [13]:
# Rebuild final_df with the same left join on HotelID and Month used earlier
# in the notebook, so the imputation that follows starts from a fresh merge.
final_df = ops.merge(
    right=monthly_sentiment,
    how="left",
    on=["HotelID", "Month"],
)


In [14]:
# Hotel-months with no matching reviews come out of the left merge as NaN;
# set both sentiment features to 0 for those rows so no row is lost.
# NOTE(review): a MeanSentiment of 0 for "no reviews" reads the same as a
# neutral score - confirm this is intended for modelling. ReviewCount == 0
# still identifies the imputed rows.
final_df[["MeanSentiment", "ReviewCount"]] = (
    final_df[["MeanSentiment", "ReviewCount"]].fillna(0)
)


In [15]:
# Summarise the same features again now that missing sentiment values have
# been filled, to see how the imputation shifted the distributions.
final_df.loc[
    :,
    ["OccupancyRate", "ADR", "CancellationRate", "MeanSentiment", "ReviewCount"],
].describe()


Unnamed: 0,OccupancyRate,ADR,CancellationRate,MeanSentiment,ReviewCount
count,962.0,952.0,952.0,962.0,962.0
mean,0.19116,205.394496,0.219006,0.242224,2.155925
std,0.124924,40.806526,0.183896,0.2739,1.514622
min,0.005538,120.57,0.0,-0.5423,0.0
25%,0.103295,176.4195,0.0,0.0,1.0
50%,0.165937,206.12925,0.2,0.27065,2.0
75%,0.249099,226.52,0.333333,0.448608,3.0
max,0.880696,297.69,1.0,0.762,8.0


In [16]:
# Keep only the rows whose ADR is present; rows with a missing ADR are
# dropped so that every remaining record has a usable ADR value.
final_df = final_df[final_df["ADR"].notna()]


In [17]:
# Confirm absence of remaining ADR null values: counts the rows of final_df
# whose ADR is still missing (expected to be 0 after the drop above).
final_df["ADR"].isna().sum()


np.int64(0)

In [18]:
# Verify final dataset dimensions as (rows, columns).
final_df.shape


(952, 11)

In [19]:
# Last look at the modelling features on the cleaned frame before export.
final_df.loc[
    :,
    ["OccupancyRate", "ADR", "CancellationRate", "MeanSentiment", "ReviewCount"],
].describe()


Unnamed: 0,OccupancyRate,ADR,CancellationRate,MeanSentiment,ReviewCount
count,952.0,952.0,952.0,952.0,952.0
mean,0.192864,205.394496,0.219006,0.243922,2.169118
std,0.124448,40.806526,0.183896,0.273278,1.514657
min,0.005538,120.57,0.0,-0.5423,0.0
25%,0.10528,176.4195,0.0,0.0,1.0
50%,0.166667,206.12925,0.2,0.270763,2.0
75%,0.251464,226.52,0.333333,0.449395,3.0
max,0.880696,297.69,1.0,0.762,8.0


In [20]:
# Export the modelling-ready dataset with engineered sentiment features.
# index=False leaves the DataFrame index out of the file, so only the data
# columns are written. The path is relative to the working directory.
final_df.to_csv("final_model_dataset.csv", index=False)
