In [20]:
import pandas as pd

# The Dataset I Will Use to Compare the 3 Different Models

In [21]:
# Load the TripAdvisor hotel reviews CSV into a DataFrame and show its (rows, columns) shape.
df = pd.read_csv('../Examensarbete_DS21/20k_tripadvisor_hotel_reviews.csv')
df.shape

(20491, 2)

In [22]:
# Let's have a look at our dataset.
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [23]:
# Checking for null values.
df.isna().any()

Review    False
Rating    False
dtype: bool

In [24]:
# Summary of dataset.
df.describe()

Unnamed: 0,Rating
count,20491.0
mean,3.952223
std,1.23303
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [25]:
# Rating Distribution.
df['Rating'].value_counts()

Rating
5    9054
4    6039
3    2184
2    1793
1    1421
Name: count, dtype: int64

In [26]:
# Binary sentiment label: ratings 1-3 count as negative (0), ratings 4-5 as positive (1).
def ratings(rating):
    """Map a star rating onto a binary sentiment label.

    Returns 0 when 0 < rating <= 3 and 1 when 3 < rating <= 5. Any value
    outside the (0, 5] range falls through and yields None.
    """
    if 0 < rating <= 3:
        return 0
    return 1 if 3 < rating <= 5 else None
    
# Add the binary 'Positive' label column by applying ratings() to every star rating.
df['Positive'] = df['Rating'].apply(ratings)
df.head()

Unnamed: 0,Review,Rating,Positive
0,nice hotel expensive parking got good deal sta...,4,1
1,ok nothing special charge diamond member hilto...,2,0
2,nice rooms not 4* experience hotel monaco seat...,3,0
3,"unique, great stay, wonderful time hotel monac...",5,1
4,"great stay great stay, went seahawk game aweso...",5,1


In [27]:
# Length of each review string, in characters.
df['Character_Count'] = df['Review'].str.len()
df.head()

Unnamed: 0,Review,Rating,Positive,Character_Count
0,nice hotel expensive parking got good deal sta...,4,1,593
1,ok nothing special charge diamond member hilto...,2,0,1689
2,nice rooms not 4* experience hotel monaco seat...,3,0,1427
3,"unique, great stay, wonderful time hotel monac...",5,1,600
4,"great stay great stay, went seahawk game aweso...",5,1,1281


In [28]:
def word_count(review):
    """Return how many whitespace-separated tokens ``review`` contains."""
    return len(review.split())

# Number of whitespace-separated words in each review.
df['Word_count'] = df['Review'].apply(word_count)
df.head()

Unnamed: 0,Review,Rating,Positive,Character_Count,Word_count
0,nice hotel expensive parking got good deal sta...,4,1,593,87
1,ok nothing special charge diamond member hilto...,2,0,1689,250
2,nice rooms not 4* experience hotel monaco seat...,3,0,1427,217
3,"unique, great stay, wonderful time hotel monac...",5,1,600,89
4,"great stay great stay, went seahawk game aweso...",5,1,1281,191


In [29]:
# Create an "id" column of 1-based ids derived from the row index (index + 1).
df["id"] = df.index + 1

df.head()

Unnamed: 0,Review,Rating,Positive,Character_Count,Word_count,id
0,nice hotel expensive parking got good deal sta...,4,1,593,87,1
1,ok nothing special charge diamond member hilto...,2,0,1689,250,2
2,nice rooms not 4* experience hotel monaco seat...,3,0,1427,217,3
3,"unique, great stay, wonderful time hotel monac...",5,1,600,89,4
4,"great stay great stay, went seahawk game aweso...",5,1,1281,191,5


In [30]:
# Bind a second name to the same DataFrame (plain assignment, no copy is made).
initial_processing_df = df

# Persist the DataFrame with IPython's %store magic so it can be restored
# later with `%store -r initial_processing_df`.
%store initial_processing_df

Stored 'initial_processing_df' (DataFrame)


# RNN Training Dataset

In [31]:
# Load the RNN training reviews CSV, decoding it as ISO-8859-1 (Latin-1)
# instead of the default UTF-8, and show its (rows, columns) shape.
rnn_df = pd.read_csv('../Examensarbete_DS21/rnn_hotel_training_data.csv', encoding='ISO-8859-1')
rnn_df.shape

(27330, 6)

In [32]:
# Let's have a look at our dataset.
rnn_df.head()

Unnamed: 0,Property Name,Review Rating,Review Title,Review Text,Location Of The Reviewer,Date Of Review
0,Apex London Wall Hotel,5,Ottima qualità prezzo,Siamo stati a Londra per un week end ed abbiam...,"Casale Monferrato, Italy",10/20/2012
1,Corinthia Hotel London,5,"By far, my best hotel in the world",I had a pleasure of staying in this hotel for ...,"Savannah, Georgia",3/23/2016
2,The Savoy,5,First visit to the American Bar at the Savoy,A very lovely first visit to this iconic hotel...,London,7/30/2013
3,Rhodes Hotel,4,Nice stay,3 of us stayed at the Rhodes Hotel for 4 night...,"Maui, Hawaii",6/2/2012
4,The Savoy,5,Perfection,Form the moment we arrived until we left we ex...,"London, United Kingdom",11/24/2017


In [33]:
# We only need the review text and rating.
# .copy() makes rnn_df an independent DataFrame, so the columns assigned to it
# in later cells do not trigger pandas' SettingWithCopyWarning.
rnn_df = rnn_df[['Review Text', 'Review Rating']].copy()
rnn_df

Unnamed: 0,Review Text,Review Rating
0,Siamo stati a Londra per un week end ed abbiam...,5
1,I had a pleasure of staying in this hotel for ...,5
2,A very lovely first visit to this iconic hotel...,5
3,3 of us stayed at the Rhodes Hotel for 4 night...,4
4,Form the moment we arrived until we left we ex...,5
...,...,...
27325,I come to London often but since I stayed in t...,5
27326,En cuarto que nos tocó no había toallas y habí...,3
27327,This is a quality quiet hotel located in an ex...,4
27328,"Väldigt vackra rum, tyvärr med en mycket höglj...",4


In [34]:
# Checking for null values.
rnn_df.isna().any()

Review Text      False
Review Rating    False
dtype: bool

In [35]:
# Summary of dataset.
rnn_df.describe()

Unnamed: 0,Review Rating
count,27330.0
mean,4.490999
std,0.891704
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [36]:
# If rating [1, 2, 3] = Negative and if rating [4, 5] = Positive.
# `ratings` is already defined earlier in this notebook (where df['Positive'] is
# built), so it is reused here rather than redefined.
rnn_df['Positive'] = rnn_df['Review Rating'].apply(ratings)
rnn_df.head()

Unnamed: 0,Review Text,Review Rating,Positive
0,Siamo stati a Londra per un week end ed abbiam...,5,1
1,I had a pleasure of staying in this hotel for ...,5,1
2,A very lovely first visit to this iconic hotel...,5,1
3,3 of us stayed at the Rhodes Hotel for 4 night...,4,1
4,Form the moment we arrived until we left we ex...,5,1


In [37]:
# Length of each review text, in characters.
rnn_df['Character_Count'] = rnn_df['Review Text'].str.len()
rnn_df.head()

Unnamed: 0,Review Text,Review Rating,Positive,Character_Count
0,Siamo stati a Londra per un week end ed abbiam...,5,1,1063
1,I had a pleasure of staying in this hotel for ...,5,1,1425
2,A very lovely first visit to this iconic hotel...,5,1,324
3,3 of us stayed at the Rhodes Hotel for 4 night...,4,1,425
4,Form the moment we arrived until we left we ex...,5,1,363


In [38]:
# `word_count` is already defined earlier in this notebook (where
# df['Word_count'] is built), so it is reused here rather than redefined.
rnn_df['Word_count'] = rnn_df['Review Text'].apply(word_count)
rnn_df.head()

Unnamed: 0,Review Text,Review Rating,Positive,Character_Count,Word_count
0,Siamo stati a Londra per un week end ed abbiam...,5,1,1063,171
1,I had a pleasure of staying in this hotel for ...,5,1,1425,265
2,A very lovely first visit to this iconic hotel...,5,1,324,54
3,3 of us stayed at the Rhodes Hotel for 4 night...,4,1,425,75
4,Form the moment we arrived until we left we ex...,5,1,363,62


In [39]:
# Persist rnn_df with IPython's %store magic so it can be restored later with `%store -r rnn_df`.
%store rnn_df

Stored 'rnn_df' (DataFrame)
