### <font color="green">Building Aspect-Based Sentiment Analysis Models</font>

In [1]:
# Load the preprocessed VLSP 2018 (VABSA 2018) dataset; its per-aspect labels are
# what the models below are trained on before they label unseen reviews.
import pandas as pd
data = pd.read_excel("VLSP_Processed.xlsx")
data.info()  # column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5062 entries, 0 to 5061
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   reviews                5062 non-null   object
 1   HOTEL#GENERAL          5062 non-null   int64 
 2   LOCATION#GENERAL       5062 non-null   int64 
 3   SERVICE#GENERAL        5062 non-null   int64 
 4   ROOMS#DESIGN&FEATURES  5062 non-null   int64 
 5   ROOMS#CLEANLINESS      5062 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 237.4+ KB


`Define functions for text preprocessing`

In [2]:
import regex as re
from pyvi import ViTokenizer

# Compiled once instead of on every call (the function runs once per review).
# The previous character class listed many overlapping ranges, one of them twice;
# their union is exactly the four single code points below plus everything from
# U+24C2 upwards, so this pattern matches the same set of characters.
# NOTE: that set is far wider than "emoji" -- it also strips CJK, Hangul and any
# other character at or above U+24C2. Vietnamese letters (all below U+1F00) are kept.
_EMOJI_PATTERN = re.compile(
    "["
    "\u200d"             # zero-width joiner (glues multi-part emoji together)
    "\u231a"             # watch
    "\u23cf"             # eject symbol
    "\u23e9"             # fast-forward symbol
    "\u24c2-\U0010ffff"  # circled M and every code point after it
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and other high-code-point symbols removed.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted (replaced by
    the empty string, not by a space), so text on both sides of an emoji is
    joined directly.

    - text: the string to clean.
    """
    return _EMOJI_PATTERN.sub('', text)

def word_tokenize(text):
    """Segment ``text`` into words with pyvi's ``ViTokenizer`` and return the result.

    In this notebook's output, multi-syllable words come back joined by
    underscores (e.g. ``dịch_vụ``).
    """
    return ViTokenizer.tokenize(text)

def lower_text(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_special_character(text):
    """Replace punctuation and symbols with spaces, then normalise whitespace.

    Whitespace, word characters, the listed Vietnamese letters and underscores
    are kept; every other character becomes a single space. Runs of whitespace
    are then collapsed to one space and the ends are trimmed.
    """
    # Step 1: blank out every character that is not in the allowed class.
    without_symbols = re.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]', ' ', text)
    # Step 2: squeeze whitespace runs and drop leading/trailing blanks.
    collapsed = re.sub(r'\s+', ' ', without_symbols)
    return collapsed.strip()

def text_preprocess(text):
    """Run the full cleaning pipeline on one review and return the cleaned string.

    The steps are applied in this order: emoji removal, word segmentation,
    lowercasing, special-character removal.
    """
    for step in (remove_emoji, word_tokenize, lower_text, remove_special_character):
        text = step(text)
    return text

In [3]:
# Model input X: every review passed through the preprocessing pipeline.
# Model targets y: the five aspect label columns (positions 1..5 of the frame).
X = data["reviews"].map(text_preprocess)
y = data.iloc[:, 1:6]

In [4]:
# Preview the first three cleaned reviews, then the matching label rows.
for preview in (X, y):
    print(preview.head(3))

0    rộng_rãi ks mới nhưng rất vắng các dịch_vụ chấ...
1    địa_điểm thuận_tiện trong vòng bán_kính 1 5 km...
2                              phục_vụ view đẹp vị_trí
Name: reviews, dtype: object
   HOTEL#GENERAL  LOCATION#GENERAL  SERVICE#GENERAL  ROOMS#DESIGN&FEATURES  \
0              1                 0                0                      0   
1              0                 3                0                      0   
2              3                 3                3                      0   

   ROOMS#CLEANLINESS  
0                  0  
1                  0  
2                  0  


`Import necessary libraries for Building & Training Models`

In [5]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
import joblib

`Build & fine-tune SVM models using GridSearchCV`

In [6]:
def _build_svm_text_pipeline(random_state=None):
    """Return a fresh, unfitted TF-IDF + linear-kernel SVC pipeline."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        # probability=True keeps predict_proba available on the saved models;
        # random_state seeds the shuffling scikit-learn does for those estimates.
        ('clf', SVC(kernel='linear', probability=True, random_state=random_state))
    ])


def _grid_search_f1(pipeline, param_grid, X, y, average):
    """Fit a 3-fold GridSearchCV over ``param_grid``, scored by F1 with ``average``."""
    search = GridSearchCV(pipeline,
                          param_grid, cv=3,
                          scoring=make_scorer(f1_score, average=average),
                          verbose=1)
    search.fit(X, y)  # GridSearchCV performs its own train/validation splitting
    return search


def train_and_evaluate(X, y_aspect_binary, y_aspect_sentiment, aspect_name, random_state=42):
    """
    Train, fine-tune, and evaluate models for aspect detection and sentiment classification.

    Both models are TF-IDF + linear SVC pipelines tuned with a 3-fold grid search
    over ``C`` and ``tol``. The best estimator of each search is saved with joblib
    as ``{aspect_name}_aspect_model.pkl`` / ``{aspect_name}_sentiment_model.pkl``
    in the current working directory (the aspect name is used verbatim).

    - X: Preprocessed review texts (pandas Series, indexed like the label Series).
    - y_aspect_binary: Binary labels for aspect detection (mentioned vs. not mentioned).
    - y_aspect_sentiment: Sentiment labels (polarity scores) for mentioned aspects;
      0 means "not mentioned", 1/2/3 are remapped to classes 0/1/2.
    - aspect_name: The name of the current aspect.
    - random_state: Seed passed to both SVCs so the probability calibration of the
      saved models is reproducible between runs.

    Returns a 6-tuple:
    (aspect best estimator, aspect best params, aspect best CV F1,
     sentiment best estimator, sentiment best params, sentiment best CV F1).

    Raises ValueError if a mentioned aspect carries a label other than 1, 2 or 3.
    """
    # Define the parameter grid for fine-tuning
    param_grid = {
        'clf__C': [1, 10, 100], # Regularization parameter
        'clf__tol': [1e-4, 1e-3, 1e-2]  # Stopping criterion tolerance
    }
    sentiment_label_map = {1: 0, 2: 1, 3: 2}

    # Filter out reviews where the aspect is not mentioned for sentiment classification.
    # Validate the labels up front: an unmapped value would otherwise turn into NaN and
    # only fail inside scikit-learn after the expensive aspect search has already run.
    mask = y_aspect_sentiment != 0
    unexpected_labels = set(y_aspect_sentiment[mask].unique()) - set(sentiment_label_map)
    if unexpected_labels:
        raise ValueError(
            f"Unexpected sentiment labels for {aspect_name}: {unexpected_labels}; "
            f"expected 0 (not mentioned) or one of {sorted(sentiment_label_map)}"
        )

    # Aspect Detection Model Training and Fine-tuning
    grid_search_aspect = _grid_search_f1(_build_svm_text_pipeline(random_state),
                                         param_grid, X, y_aspect_binary, average='binary')
    print("---"*30)
    print(f"==> Best Aspect Detection Params for {aspect_name}: {grid_search_aspect.best_params_}")
    print(f"==> Best Aspect Detection F1-Score for {aspect_name}: {grid_search_aspect.best_score_}")
    print("---"*30)

    # Sentiment Classification Model Training and Fine-tuning
    X_filtered = X[mask]
    y_aspect_sentiment_filtered = y_aspect_sentiment[mask].map(sentiment_label_map)
    # Start from a fresh pipeline rather than a clone of the tuned aspect model: the grid
    # sets both C and tol for every candidate, so the search is the same, and the
    # sentiment model no longer appears to depend on the aspect model's tuning.
    grid_search_sentiment = _grid_search_f1(_build_svm_text_pipeline(random_state),
                                            param_grid, X_filtered,
                                            y_aspect_sentiment_filtered, average='macro')
    print("---"*30)
    print(f"==> Best Sentiment Classification Params for {aspect_name}: {grid_search_sentiment.best_params_}")
    print(f"==> Best Sentiment Classification F1-Score for {aspect_name}: {grid_search_sentiment.best_score_}")
    print("---"*30)

    aspect_model_filename = f"{aspect_name}_aspect_model.pkl"
    sentiment_model_filename = f"{aspect_name}_sentiment_model.pkl"

    # Save the best models
    joblib.dump(grid_search_aspect.best_estimator_, aspect_model_filename)
    print(f"==> Saved best aspect detection model to {aspect_model_filename}")

    joblib.dump(grid_search_sentiment.best_estimator_, sentiment_model_filename)
    print(f"==> Saved best sentiment classification model to {sentiment_model_filename}")
    print("---"*30)
    return (grid_search_aspect.best_estimator_, grid_search_aspect.best_params_, grid_search_aspect.best_score_,
            grid_search_sentiment.best_estimator_, grid_search_sentiment.best_params_, grid_search_sentiment.best_score_)

aspect_models = {}
sentiment_models = {}
aspect_names = data.columns[1:6]

model_evaluation_info = []
# Fit one aspect detector and one sentiment classifier per aspect column.
for aspect in aspect_names:
    y_aspect_sentiment = data[aspect]
    # Any label above 0 means the aspect is mentioned in the review.
    y_aspect_binary = (y_aspect_sentiment > 0).astype("int64")

    (aspect_model, aspect_best_params, aspect_best_score,
     sentiment_model, sentiment_best_params, sentiment_best_score) = train_and_evaluate(
        X, y_aspect_binary, y_aspect_sentiment, aspect)

    aspect_models[aspect], sentiment_models[aspect] = aspect_model, sentiment_model

    # Record one evaluation row for each of the two models of this aspect.
    model_evaluation_info.extend([
        {
            "ModelName": f"{aspect} Aspect Detection",
            "BestParams": aspect_best_params,
            "F1-score": aspect_best_score,
        },
        {
            "ModelName": f"{aspect} Sentiment Classification",
            "BestParams": sentiment_best_params,
            "F1-score": sentiment_best_score,
        },
    ])

# Build the summary table from the collected rows.
ModelEvaluation = pd.DataFrame(model_evaluation_info)

# Show the evaluation info for every model (two rows per aspect).
ModelEvaluation.head(10)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
------------------------------------------------------------------------------------------
==> Best Aspect Detection Params for HOTEL#GENERAL: {'clf__C': 1, 'clf__tol': 0.001}
==> Best Aspect Detection F1-Score for HOTEL#GENERAL: 0.7101091942438039
------------------------------------------------------------------------------------------
Fitting 3 folds for each of 9 candidates, totalling 27 fits
------------------------------------------------------------------------------------------
==> Best Sentiment Classification Params for HOTEL#GENERAL: {'clf__C': 10, 'clf__tol': 0.01}
==> Best Sentiment Classification F1-Score for HOTEL#GENERAL: 0.7091176410371021
------------------------------------------------------------------------------------------
==> Saved best aspect detection model to HOTEL#GENERAL_aspect_model.pkl
==> Saved best sentiment classification model to HOTEL#GENERAL_sentiment_model.pkl
-----------------------------

Unnamed: 0,ModelName,BestParams,F1-score
0,HOTEL#GENERAL Aspect Detection,"{'clf__C': 1, 'clf__tol': 0.001}",0.710109
1,HOTEL#GENERAL Sentiment Classification,"{'clf__C': 10, 'clf__tol': 0.01}",0.709118
2,LOCATION#GENERAL Aspect Detection,"{'clf__C': 1, 'clf__tol': 0.0001}",0.879895
3,LOCATION#GENERAL Sentiment Classification,"{'clf__C': 10, 'clf__tol': 0.0001}",0.544776
4,SERVICE#GENERAL Aspect Detection,"{'clf__C': 1, 'clf__tol': 0.0001}",0.949056
5,SERVICE#GENERAL Sentiment Classification,"{'clf__C': 10, 'clf__tol': 0.01}",0.598633
6,ROOMS#DESIGN&FEATURES Aspect Detection,"{'clf__C': 10, 'clf__tol': 0.0001}",0.674546
7,ROOMS#DESIGN&FEATURES Sentiment Classification,"{'clf__C': 10, 'clf__tol': 0.0001}",0.576071
8,ROOMS#CLEANLINESS Aspect Detection,"{'clf__C': 1, 'clf__tol': 0.0001}",0.655167
9,ROOMS#CLEANLINESS Sentiment Classification,"{'clf__C': 10, 'clf__tol': 0.0001}",0.562258


`Define function for Data prediction`

In [8]:
def analyze_review(review, aspect_models, sentiment_models):
    """
    Detect which aspects a single review mentions and classify the sentiment of each.

    - review: The raw text of the review to analyze.
    - aspect_models: Dict mapping aspect name -> trained aspect detection model.
    - sentiment_models: Dict mapping aspect name -> trained sentiment classification model.

    Returns a dict mapping each detected aspect to 'Negative', 'Neutral' or
    'Positive'. The dict is empty when no aspect is detected.
    """
    sentiment_names = ['Negative', 'Neutral', 'Positive']
    # The models take a list of documents, so wrap the single cleaned review.
    documents = [text_preprocess(review)]

    detected = {}
    for aspect, detector in aspect_models.items():
        if detector.predict(documents)[0] != 1:
            continue  # aspect not mentioned in this review
        # The predicted class index selects the sentiment name.
        class_index = sentiment_models[aspect].predict(documents)[0]
        detected[aspect] = sentiment_names[class_index]
    return detected

In [84]:
# Run the two-stage models on a hand-written review that is not in the training data.
review = """
Phòng ốc sạch sẽ, thoáng mát, vị trí gần cầu Rồng, nhưng thái độ nhân viên kém, thấy khách
nhiều hành lý nhưng không phụ giúp mang hành lý.
"""
analysis_result = analyze_review(review, aspect_models, sentiment_models)
print("Model input:", review)
# One "aspect: sentiment" pair per detected aspect, separated by "; ".
formatted_pairs = [f"{aspect_name}: {sentiment}" for aspect_name, sentiment in analysis_result.items()]
print("Model output: ==>", "; ".join(formatted_pairs))

Model input: 
Phòng ốc sạch sẽ, thoáng mát, vị trí gần cầu Rồng, nhưng thái độ nhân viên kém, thấy khách
nhiều hành lý nhưng không phụ giúp mang hành lý.

Model output: ==> LOCATION#GENERAL: Positive; SERVICE#GENERAL: Negative; ROOMS#CLEANLINESS: Positive


### <font color="green">Sentiment labelling on real data scraped from booking.com</font>

In [46]:
# Load the hotel reviews (scraped from booking.com, per the section heading) that
# the trained models will label.
realdata = pd.read_excel("F_DaNangHotelReviewsVi.xlsx")
realdata.info()  # column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17032 entries, 0 to 17031
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   hotel_name  17032 non-null  object
 1   review      17032 non-null  object
 2   language    17032 non-null  object
dtypes: object(3)
memory usage: 399.3+ KB


In [47]:
# Drop the "language" column (the third column), which is not needed for labelling.
# It is dropped by name with errors="ignore" and reassigned rather than dropped by
# position in place, so the cell can be re-run: the positional in-place version
# raised IndexError on a second run because only two columns were left.
realdata = realdata.drop(columns=["language"], errors="ignore")
realdata.head(2)

Unnamed: 0,hotel_name,review
0,Florence Hotel,"NV rất dễ thương, nhiệt tình. Phòng đẹp, tiện ..."
1,Florence Hotel,Tất cả đễu tuyệt vời😁


In [48]:
# Keep rows where the length of the review is greater than or equal to 10, then
# renumber the rows from 0. `.str.len()` is used instead of `apply(len)` so a missing
# or non-string review is simply filtered out (its length is NaN) rather than raising
# TypeError. The index is reset in the same chain instead of in place on the subset.
realdata_filtered = realdata[realdata['review'].str.len() >= 10].reset_index(drop=True)
realdata_filtered.shape

(15305, 2)

In [49]:
# Cap every hotel at its first 50 reviews (used for the web interface demo),
# then renumber the rows from 0.
first_50_per_hotel = realdata_filtered.groupby('hotel_name').head(50)
realdata_50_reviews = first_50_per_hotel.reset_index(drop=True)
realdata_50_reviews.shape

(9686, 2)

`Aspect-polarity labelling on the real dataset scraped from booking.com`

In [50]:
# Label every review; each cell of the new column holds the {aspect: sentiment} dict
# returned by analyze_review.
realdata_50_reviews['aspect_sentiment_label'] = realdata_50_reviews['review'].apply(
    analyze_review, args=(aspect_models, sentiment_models)
)

In [51]:
# Spot-check the predicted aspect/sentiment labels on the first 30 rows.
realdata_50_reviews.head(30)

Unnamed: 0,hotel_name,review,aspect_sentiment_label
0,Florence Hotel,"NV rất dễ thương, nhiệt tình. Phòng đẹp, tiện ...","{'HOTEL#GENERAL': 'Positive', 'SERVICE#GENERAL..."
1,Florence Hotel,Tất cả đễu tuyệt vời😁,{'HOTEL#GENERAL': 'Positive'}
2,Florence Hotel,"Nhận phòng lúc 00h30 nhân viên vẫn nhiệt tình,...","{'SERVICE#GENERAL': 'Positive', 'ROOMS#DESIGN&..."
3,Florence Hotel,"Không thấy tiện ích xung quanh, chỉ có Lotte mark",{}
4,Florence Hotel,Điều gì cũng thích,{'HOTEL#GENERAL': 'Positive'}
5,Florence Hotel,"tiện nghi, thoải mái",{}
6,Florence Hotel,nv xử lí hơi lâu,{'SERVICE#GENERAL': 'Positive'}
7,Florence Hotel,"Chị lễ tân ở đây dễ thương lắm, cả anh phục vụ...","{'SERVICE#GENERAL': 'Positive', 'ROOMS#DESIGN&..."
8,Florence Hotel,"Tiếc là khách sạn ở hơi xa so với trung tâm, n...",{'LOCATION#GENERAL': 'Negative'}
9,Florence Hotel,Tiện ích và phù hợp giá tiền,{}


In [59]:
# Drop reviews whose label dict is empty, i.e. the models detected no aspect in them.
has_any_aspect = realdata_50_reviews['aspect_sentiment_label'].apply(len) > 0
realdata_50_reviews_filtered = realdata_50_reviews[has_any_aspect].reset_index(drop=True)

# Show the first rows of the filtered frame.
realdata_50_reviews_filtered.head(30)

Unnamed: 0,hotel_name,review,aspect_sentiment_label
0,Florence Hotel,"NV rất dễ thương, nhiệt tình. Phòng đẹp, tiện ...","{'HOTEL#GENERAL': 'Positive', 'SERVICE#GENERAL..."
1,Florence Hotel,Tất cả đễu tuyệt vời😁,{'HOTEL#GENERAL': 'Positive'}
2,Florence Hotel,"Nhận phòng lúc 00h30 nhân viên vẫn nhiệt tình,...","{'SERVICE#GENERAL': 'Positive', 'ROOMS#DESIGN&..."
3,Florence Hotel,Điều gì cũng thích,{'HOTEL#GENERAL': 'Positive'}
4,Florence Hotel,nv xử lí hơi lâu,{'SERVICE#GENERAL': 'Positive'}
5,Florence Hotel,"Chị lễ tân ở đây dễ thương lắm, cả anh phục vụ...","{'SERVICE#GENERAL': 'Positive', 'ROOMS#DESIGN&..."
6,Florence Hotel,"Tiếc là khách sạn ở hơi xa so với trung tâm, n...",{'LOCATION#GENERAL': 'Negative'}
7,Florence Hotel,"chỗ ở sạch sẽ, nhân viên nhiệt tình, chu đáo","{'SERVICE#GENERAL': 'Positive', 'ROOMS#CLEANLI..."
8,Florence Hotel,vị trí xa bãi biển,{'LOCATION#GENERAL': 'Negative'}
9,Florence Hotel,"Địa điểm gần Lotte mart, phố ăn uống chợ đêm. ...",{'LOCATION#GENERAL': 'Positive'}


In [65]:
# Check how many labelled reviews remain after dropping the empty predictions.
realdata_50_reviews_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7816 entries, 0 to 7815
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   hotel_name              7816 non-null   object
 1   review                  7816 non-null   object
 2   aspect_sentiment_label  7816 non-null   object
dtypes: object(3)
memory usage: 183.3+ KB


In [67]:
# Number of labelled reviews left per hotel once the unlabelled ones are removed.
review_counts = realdata_50_reviews_filtered.groupby(by='hotel_name').size()
# Largest counts first.
review_counts_sorted = review_counts.sort_values(ascending=False)
print(review_counts_sorted)

hotel_name
Seahorse Han Market Da Nang Hostel by Haviland    49
Animor Green Home Villa Da Nang                   48
OYO 1168 Elly's Home 2                            48
Continent Apartment                               48
Waikiki Beach Hotel                               47
                                                  ..
Taiyo Hotel & Apartment                            3
Danang Marriott Resort & Spa                       2
Peninsula Hotel Danang                             2
Eco Green Boutique Hotel Da Nang                   1
The Backpacker Hostel and spa                      1
Length: 259, dtype: int64


In [68]:
# realdata_50_reviews_filtered.to_excel("BookingHotelABSA.xlsx", index=False)