# Data import

In [22]:
import pandas as pd
df = pd.read_csv('tripadvisor_hotel_reviews.csv')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [24]:
df.describe()

Unnamed: 0,Rating
count,20491.0
mean,3.952223
std,1.23303
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [25]:
df.isnull().sum()

Review    0
Rating    0
dtype: int64

# sentiment function 

In [37]:
import numpy as np
def create_sentiment(rating): 
    if rating==1 or rating==2:
        return -1 # negative sentiment
    elif rating==4 or rating==5:
        return 1 # positive sentiment
    else:
        return 0 # neutral sentiment
df['Sentiment'] = df['Rating'].apply(create_sentiment)
print(df)

                                                  Review  Rating  Sentiment
0      nice hotel expensive parking got good deal sta...       4          1
1      ok nothing special charge diamond member hilto...       2         -1
2      nice rooms not  experience hotel monaco seattl...       3          0
3      unique great stay wonderful time hotel monaco ...       5          1
4      great stay great stay went seahawk game awesom...       5          1
...                                                  ...     ...        ...
20486  best kept secret rd time staying charm not sta...       5          1
20487  great location price view hotel great quick pl...       4          1
20488  ok just looks nice modern outside desk staff n...       2         -1
20489  hotel theft ruined vacation hotel opened sept ...       1         -1
20490  people talking ca nt believe excellent ratings...       2         -1

[20491 rows x 3 columns]


# Main code with model testing 

In [48]:
from sklearn.feature_extraction.text import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
df = pd.read_csv('tripadvisor_hotel_reviews.csv') 
def create_sentiment(rating):
    
    res = 0 # neutral sentiment
    
    if rating==1 or rating==2:
        res = -1 # negative sentiment
    elif rating==4 or rating==5:
        res = 1 # positive sentiment
        
    return res
df['Sentiment'] = df['Rating'].apply(create_sentiment)
def clean_data(review):
    
    no_punc = re.sub(r'[^\w\s]', '', review)
    no_digits = ''.join([i for i in no_punc if not i.isdigit()])
    
    return(no_digits)
df['Review'] = df['Review'].apply(clean_data)
tfidf = TfidfVectorizer(strip_accents=None, 
                        lowercase=False,
                        preprocessor=None)
X = tfidf.fit_transform(df['Review'])
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X,y)
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train,y_train)
preds = lr.predict(X_test)
accuracy_score(preds,y_test)
print(df)

                                                  Review  Rating  Sentiment
0      nice hotel expensive parking got good deal sta...       4          1
1      ok nothing special charge diamond member hilto...       2         -1
2      nice rooms not  experience hotel monaco seattl...       3          0
3      unique great stay wonderful time hotel monaco ...       5          1
4      great stay great stay went seahawk game awesom...       5          1
...                                                  ...     ...        ...
20486  best kept secret rd time staying charm not sta...       5          1
20487  great location price view hotel great quick pl...       4          1
20488  ok just looks nice modern outside desk staff n...       2         -1
20489  hotel theft ruined vacation hotel opened sept ...       1         -1
20490  people talking ca nt believe excellent ratings...       2         -1

[20491 rows x 3 columns]


# Accuracy of the model

In [43]:
from sklearn.metrics import accuracy_score
accuracy_score(preds,y_test) # 0.86

0.8561389810657818