# **Topic Modelling Validation**

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import adjusted_rand_score, jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer
import random
import matplotlib.pyplot as plt

In [7]:
# Load Dataset
df = pd.read_csv('/content/labelled_data.csv')

In [49]:
# Vectorize Text Data
count_vectorizer = CountVectorizer(max_df=0.95, min_df=5, ngram_range=(1, 1))
X_counts = count_vectorizer.fit_transform(df['spacy_tokens'])
feature_names = count_vectorizer.get_feature_names_out()

In [50]:
# Randomly Select 50 Reviews
random.seed(42)
subset_reviews = df.sample(n=50, random_state=42).reset_index(drop=True)

In [51]:
# Exclude these reviews from the dataset for training
remaining_reviews = df[~df.index.isin(subset_reviews.index)]

In [127]:
# Apply LDA Topic Modeling
optimal_k = 16  # Set based on prior optimization using coherence score
lda_model = LatentDirichletAllocation(n_components=optimal_k, random_state=42, learning_method='batch')
lda_model.fit(X_counts)


In [128]:
# Assign Topics to the Selected Reviews
subset_counts = count_vectorizer.transform(subset_reviews['spacy_tokens'])
subset_topics_probabilities = lda_model.transform(subset_counts)
subset_assigned_topics = subset_topics_probabilities.argmax(axis=1)

In [129]:
# Display Topics for Manual Interpretation
def display_topics(model, feature_names, no_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print("\n")

display_topics(lda_model, feature_names, 10)

Topic 1:
hotel, room, one, two, staff, four, five, service, thousand, day


Topic 2:
place, food, staff, amazing, best, stay, service, hotel, room, friendly


Topic 3:
ella, view, room, breakfast, tuk, walk, town, great, train, clean


Topic 4:
stay, staff, hotel, perfect, galle, room, fort, highly, recommend, view


Topic 5:
room, hotel, night, breakfast, would, one, staff, sri, food, two


Topic 6:
hotel, room, staff, good, view, nice, food, clean, friendly, great


Topic 7:
beach, clean, room, place, nice, good, great, stay, staff, friendly


Topic 8:
place, room, nice, good, really, stay, breakfast, family, night, tuk


Topic 9:
kandy, hotel, view, night, room, sri, stayed, pool, one, lanka


Topic 10:
room, hotel, bed, bathroom, water, one, good, night, shower, two


Topic 11:
place, booking, one, hotel, even, like, day, stay, time, booked


Topic 12:
pool, beach, room, view, stay, ocean, staff, breakfast, hotel, made


Topic 13:
great, hotel, food, good, staff, pool, friendly, be

In [130]:
# Map Topics to Aspect Labels (Manual Mapping)
topic_to_aspect = {
    1: "Accommodation",
    2: "Dining & Service",
    3: "Scenic Views & Activities",
    4: "Recommendations & Hospitality",
    5: "Food & Room Facilities",
    6: "Cleanliness & Hotel Experience",
    7: "Beach & Coastal Stay",
    8: "Family Stay & Comfort",
    9: "Scenic Locations & Pool",
    10: "Bathroom Facilities",
    11: "Booking & Stay Experience",
    12: "Hotel Amenities",
    13: "Food & Dining",
    14: "Local Culture",
    15: "Customer Service",
    16: "Nature & Wellness"
}
predicted_aspects = [topic_to_aspect[topic + 1] for topic in subset_assigned_topics]


In [80]:
# Manually Label the Reviews
manual_labels = []
for review in subset_reviews['review_text']:
    print(f"Review: {review}")
    label = input("Manual aspect label(s): ")
    manual_labels.append(label.split(','))  # Allow multiple labels

Review: best hotel visited free time good hospitality good ambiance nice view room good working room heater ancient breakfast clean tidy friendly staff close kandy city strongly recommend
Manual aspect label(s): Accommodation, Scenic Views & Activities, Food & Room Facilities, Customer Service
Review: stayed six night lived like prince boutique hotel virtually place food fantastic pool alone spacious well appointed room outdoor spa host umesh quiet gracious helped u book day trip sigiriya local transport also came chocolate ice cream dessert day day contact breakfast dinner beer local advice danushka mahesh give man pay rise service excellent
Manual aspect label(s): Accommodation, Dining & Service, Hotel Amenities, Customer Service
Review: arrived two little kid warmly welcomed cabana food great recommend place especially active one sea rough matter anyway pool definitely perfect spot surfing
Manual aspect label(s): Beach & Coastal Stay, Dining & Service, Family Stay & Comfort
Review: 

In [131]:
# Add the manual labels to the DataFrame
subset_reviews['manual_labels'] = manual_labels

# Save the DataFrame to a CSV file
output_file = '/content/manual_labels.csv' 
subset_reviews.to_csv(output_file, index=False)

In [132]:
# Compare Predicted and Manual Labels
subset_reviews['manual_labels'] = manual_labels
subset_reviews['predicted_labels'] = predicted_aspects

In [133]:
# Compute Validation Metrics
# Flatten labels for metrics computation
mlb = MultiLabelBinarizer()
manual_labels_binarized = mlb.fit_transform(subset_reviews['manual_labels'])
predicted_labels_binarized = mlb.transform([[aspect] for aspect in subset_reviews['predicted_labels']])



In [134]:
# Adjusted Rand Index (ARI)
ari_score = adjusted_rand_score(manual_labels_binarized.argmax(axis=1), predicted_labels_binarized.argmax(axis=1))
print(f"Adjusted Rand Index (ARI): {ari_score}")


Adjusted Rand Index (ARI): 0.028467907705896363


In [135]:
# Jaccard Coefficient (for multi-label comparison)
jaccard_scores = []
for i in range(len(manual_labels_binarized)):
    jaccard_scores.append(jaccard_score(manual_labels_binarized[i], predicted_labels_binarized[i]))
jaccard_average = np.mean(jaccard_scores)
print(f"Jaccard Coefficient (Average): {jaccard_average}")

Jaccard Coefficient (Average): 0.056666666666666664
