# Trip Advisor Hotel Reviews

#### **Data:** Describe the dataset (TripAdvisor hotel reviews) and its key features (review text, ratings).

#### Necessary Libraries

In [31]:
import spacy
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
from tqdm.notebook import tqdm
import re

In [29]:
nlp = spacy.load("en_core_web_sm")

In [7]:
data = pd.read_csv("/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv")
data.head(4)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5


### Data Preorocessing

In [27]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return tokens

In [33]:
tqdm(data['Review'].apply(preprocess_text))

  0%|          | 0/20491 [00:00<?, ?it/s]

<tqdm.notebook.tqdm_notebook at 0x7b2a9bc71120>

## Text Vectorisation

In [34]:
# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['Review'])

## Tpoic Modeling

In [37]:
# LDA for Topic Modeling
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)

In [38]:
for i, topic in enumerate(lda.components_):
    print(f"Topic {i + 1}:")
    print([vectorizer.get_feature_names_out()[index] for index in topic.argsort()[-10:]])

Topic 1:
['nice', 'service', 'rooms', 'stayed', 'location', 'staff', 'stay', 'room', 'great', 'hotel']
Topic 2:
['people', 'time', 'no', 'pool', 'room', 'food', 'did', 'resort', 'beach', 'not']
Topic 3:
['nice', 'stay', 'clean', 'not', 'breakfast', 'great', 'good', 'location', 'room', 'hotel']


In [40]:
# Assign topics to reviews
topic_results = lda.transform(X)
data['Topic'] = topic_results.argmax(axis=1)

In [42]:
# Map topics to sentiment labels 
topic_labels = {0: 'Neutral', 1: 'Negative', 2: 'Positive'}
data['Sentiment'] = data['Topic'].map(topic_labels)

In [43]:
data

Unnamed: 0,Review,Rating,Topic,Sentiment
0,nice hotel expensive parking got good deal sta...,4,2,Positive
1,ok nothing special charge diamond member hilto...,2,1,Negative
2,nice rooms not 4* experience hotel monaco seat...,3,1,Negative
3,"unique, great stay, wonderful time hotel monac...",5,0,Neutral
4,"great stay great stay, went seahawk game aweso...",5,1,Negative
...,...,...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5,2,Positive
20487,great location price view hotel great quick pl...,4,2,Positive
20488,"ok just looks nice modern outside, desk staff ...",2,2,Positive
20489,hotel theft ruined vacation hotel opened sept ...,1,1,Negative


In [53]:
data['Review'].iloc[-3]

"ok just looks nice modern outside, desk staff n't particularly friendly, corridors dark smelt steam cleaned carpet maybe good thing, hotel right space needle thrilling pulled mins walk away, unfortunately room rear view buildings parking lot, rooms space needle view.there no hotel laundry suprising larger hotel.hotel restaurant mistakes menu ordering new guy think numerous items left tray room service breakfast, phone promptly delivered,  "

## Checking Our model

In [50]:
import random

# Define function to classify sentiments based on ratings
def classify_sentiment(rating):
    if rating >= 4:
        return "positive"
    elif rating == 3:
        return "neutral"
    else:
        return "negative"

# Add a sentiment column to the dataframe
data["Sentiment"] = data["Rating"].apply(classify_sentiment)

# Function to sample a review safely
def sample_review(df, sentiment):
    if not data[data["Sentiment"] == sentiment].empty:
        return data[data["Sentiment"] == sentiment].sample(1, random_state=42)["Review"].values[0]
    else:
        return f"No {sentiment} reviews available."

# Randomly sample one review from each sentiment
positive_review = sample_review(data, "positive")
neutral_review = sample_review(data, "neutral")
negative_review = sample_review(data, "negative")

# Print the reviews
print("Positive Review:", positive_review)
print('......')
print("Neutral Review:", neutral_review)
print('......')
print("Negative Review:", negative_review)


Positive Review: perfect business travel pleasure, magnolia hotel perfect, town city-wide event, 20 staying magnolia 5 days, group sales team superb, attentive accomodating organized polite rarety planning group travel days, ca n't say loved look feel hotel, located heart downtown walking distance desireable location dallas did n't walk hotel provides courtesy shuttle, starbucks attached hotel, complimentary breakfast standard morning foods coffee day fresh baked cookies evening guests tickets good 2 fee drinks hotel bar night stay, evening room service coordinate fb needs daytime, mentioned great valets evening desk team, people really make place, rates reasonable boardroom suite, boardroom suites check website, magnificent, plenty room meetings day w/a kitchen, living area 1 bedroom, 1br suites gorgeous, rooms great, did n't stay historic floors bet splendid major bonus entertainment food, nights magnolia entertainment provided city square, night breakfast tiffany night concert, 3 4 