In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...


True

In [4]:
df = pd.read_csv('tripadvisor.csv')
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


# Data Cleaning and Preprocessing

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [6]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [7]:
def clean_text(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    return text

In [8]:
df['Cleaned_Review'] = df['Review'].apply(clean_text)

## Tokenization, Stop Word Removal, and Lemmatization

In [9]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    return ' '.join(tokens)

In [11]:
df['Processed_Review'] = df['Cleaned_Review'].apply(preprocess_text)

In [12]:
df[['Review', 'Processed_Review', 'Rating']].head()

Unnamed: 0,Review,Processed_Review,Rating
0,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,nice room experience hotel monaco seattle good...,3
3,"unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monaco ...,5
4,"great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game awesom...,5


# Model Training and Evaluation

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

## Split the data into training and testing sets

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['Processed_Review'], df['Rating'], test_size=0.2, random_state=42)

## Model Evaluation Function

In [19]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    print(f"\nResults for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, predictions))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
    return model_name, accuracy, f1

In [16]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [17]:
models = [
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Naive Bayes', MultinomialNB())
]

In [20]:
results = []

for name, clf in models:
    print(f"\nEvaluating {name}...")
    pipeline = Pipeline([
        ('tfidf', tfidf_vectorizer),
        ('classifier', clf)
    ])
    model_name, accuracy, f1 = evaluate_model(pipeline, X_train, X_test, y_train, y_test, name)
    results.append((model_name, accuracy, f1))
    print(f"Completed {name}\n")


Evaluating Random Forest...

Results for Random Forest:
Accuracy: 0.5435
F1 Score: 0.4752

Classification Report:
               precision    recall  f1-score   support

           1       0.77      0.51      0.61       292
           2       0.35      0.05      0.08       333
           3       0.56      0.02      0.04       432
           4       0.43      0.33      0.37      1252
           5       0.57      0.92      0.70      1790

    accuracy                           0.54      4099
   macro avg       0.53      0.36      0.36      4099
weighted avg       0.52      0.54      0.48      4099


Confusion Matrix:
 [[ 148   19    1   35   89]
 [  37   16    6  132  142]
 [   7    8   10  223  184]
 [   0    2    1  408  841]
 [   0    1    0  143 1646]]
Completed Random Forest


Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Results for Logistic Regression:
Accuracy: 0.6319
F1 Score: 0.6169

Classification Report:
               precision    recall  f1-score   support

           1       0.77      0.63      0.69       292
           2       0.47      0.40      0.43       333
           3       0.46      0.26      0.33       432
           4       0.55      0.51      0.53      1252
           5       0.70      0.85      0.77      1790

    accuracy                           0.63      4099
   macro avg       0.59      0.53      0.55      4099
weighted avg       0.62      0.63      0.62      4099


Confusion Matrix:
 [[ 184   69   11   12   16]
 [  39  132   71   58   33]
 [  10   59  111  203   49]
 [   5   20   43  644  540]
 [   2    2    6  261 1519]]
Completed Logistic Regression


Evaluating Support Vector Machine...

Results for Support Vector Machine:
Accuracy: 0.6380
F1 Score: 0.6223

Classification Report:
               precision    recall  f1-score   support

           1       0.80      0.59    

In [21]:
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score'])

# Final Evaluation Results

In [22]:
results_df

Unnamed: 0,Model,Accuracy,F1 Score
0,Random Forest,0.543547,0.475154
1,Logistic Regression,0.631861,0.616881
2,Support Vector Machine,0.63796,0.62231
3,Naive Bayes,0.552086,0.496863


# Summary and Insights
### Word Importance: Through TF-IDF, we identified that certain words like "great", "excellent", and "wonderful" were strongly associated with higher ratings, while words like "bad", "poor", and "terrible" were linked to lower ratings.
### Sentiment and Ratings Correlation: Positive reviews often correlated with higher star ratings, indicating a clear link between the sentiment expressed in text and the numerical rating.
### Common Issues Highlighted: Lower-rated reviews frequently mentioned issues related to customer service, room cleanliness, and noise levels, pointing out common areas for hotel management to address.
### Diverse Experiences: Reviews with neutral or mid-level ratings (2-3 stars) often included mixed sentiments, reflecting both positive and negative aspects of the stay, suggesting that these customers had a balanced view but found some shortcomings.