## Sentiment analysis of reviews mined from Tripadvisor. 
## Input dataset - https://www.kaggle.com/datasets/andrewmvd/trip-advisor-hotel-reviews

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
hotel_reviews = pd.read_csv('/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv')
hotel_reviews.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [3]:
hotel_reviews.shape

(20491, 2)

In [6]:
hotel_reviews.isnull().sum()

Review    0
Rating    0
dtype: int64

In [12]:
hotel_reviews[hotel_reviews.duplicated()]

Unnamed: 0,Review,Rating


In [11]:
hotel_reviews[hotel_reviews.duplicated('Review')]

Unnamed: 0,Review,Rating


### No duplicates, no null values in this dataset.

In [18]:
hotel_reviews.iloc[0,0]

'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  '

In [23]:
hotel_reviews.Rating.unique()

array([4, 2, 3, 5, 1])

In [26]:
def create_sentiment(rating):
    if rating==4 or rating==5:
        Sentiment = 1
    elif rating==1 or rating==2:
        Sentiment = -1
    else:
        Sentiment = 0    
    return Sentiment 


hotel_reviews['Sentiment'] = hotel_reviews['Rating'].apply(create_sentiment)

In [27]:
hotel_reviews.head()

Unnamed: 0,Review,Rating,Sentiment
0,nice hotel expensive parking got good deal sta...,4,1
1,ok nothing special charge diamond member hilto...,2,-1
2,nice rooms not 4* experience hotel monaco seat...,3,0
3,"unique, great stay, wonderful time hotel monac...",5,1
4,"great stay great stay, went seahawk game aweso...",5,1


In [28]:
from sklearn.feature_extraction.text import re

def clean_data(review):
    clean_review = re.sub(r'[^\w\s]',"",review)  #Remove punctuation
    clean_review = ''.join([i for i in clean_review if not i.isdigit()])   #Remove digits
    return clean_review

hotel_reviews['Review_Clean'] = hotel_reviews['Review'].apply(clean_data)

In [31]:
hotel_reviews['Review'][0]

'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  '

In [30]:
hotel_reviews['Review_Clean'][0]

'nice hotel expensive parking got good deal stay hotel anniversary arrived late evening took advice previous reviews did valet parking check quick easy little disappointed nonexistent view room room clean nice size bed comfortable woke stiff neck high pillows not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway maybe just noisy neighbors aveda bath products nice did not goldfish stay nice touch taken advantage staying longer location great walking distance shopping overall nice experience having pay  parking night  '

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False,preprocessor=None)

X = tfidf.fit_transform(hotel_reviews['Review_Clean'])

In [36]:
from sklearn.model_selection import train_test_split

y = hotel_reviews['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X,y)


In [39]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [42]:
from sklearn.metrics import accuracy_score

print("Accuracy of the LR model is",round(accuracy_score(y_test,y_pred)*100,2),"%")

Accuracy of the LR model is 85.89 %
