# HOTEL REVIEWS NLP
## Group E 

In [None]:
import pandas as pd

# Load the raw review export; the relative path is resolved against the
# notebook's working directory.
serie = pd.read_csv('Hotel reviews.csv')
# read_csv already returns a DataFrame; this wraps it in a second DataFrame
# object that the rest of the notebook works on as `df`.
df = pd.DataFrame(serie)

In [None]:
# Restrict the data to rows whose establishment type is 'Hotel'. After the
# filter the column holds a single value, so it is dropped.
is_hotel = df['Establishment Type'] == 'Hotel'
df = df.loc[is_hotel].drop(columns=['Establishment Type'])
df.head()

#### Date Cleaning

In [None]:
# Discard rows whose 'Review Date' text contains "ago" or "NEW" (presumably
# relative or flagged dates that cannot be parsed -- confirm against the data),
# renumber the remaining rows from 0, then parse the column into datetimes.
not_a_date = df['Review Date'].str.contains("ago|NEW", regex=True)
df = df[~not_a_date].reset_index(drop=True)
df['Review Date'] = pd.to_datetime(df['Review Date'])

In [None]:
# Display the dtype of every column (e.g. to confirm that 'Review Date' was
# converted by pd.to_datetime).
df.dtypes

#### Location Cleaning

In [None]:
# Missing locations become the placeholder string 'NULL' so that every row
# holds a string, then each location is split on ", " into a list of parts.
# The whole column is reassigned in one step: writing through
# `df.Location[mask] = ...` is chained assignment, which pandas may apply to
# a temporary copy instead of `df`.
df['Location'] = df['Location'].fillna('NULL').str.split(", ")

In [None]:
# Keep only the first part of each location (the city). The country is missing
# for many rows, so only the city is retained.
# `.str[0]` takes element 0 of every row in one vectorised step, replacing a
# Python loop that wrote each row through chained assignment.
df['Location'] = df['Location'].str[0]
        

#### Reviewer Rank Cleaning

In [None]:
# Encode 'Reviewer Rank' as an integer from 1 to 5; missing ranks are treated
# as 'Contributor'.
rank_levels = {
    'Contributor': 1,
    'Reviewer': 2,
    'Senior Contributor': 3,
    'Senior Reviewer': 4,
    'Top Contributor': 5,
}
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on `df['Reviewer Rank']` operates on the selected
# column object and is not guaranteed to update `df` itself.
# A label outside `rank_levels` maps to NaN and makes astype(int) raise.
df['Reviewer Rank'] = (
    df['Reviewer Rank'].fillna('Contributor').map(rank_levels).astype(int)
)

#### Expanding the Score Breakdown into 6 different columns: 
* `Value` 
* `Location` 
* `Sleep Quality`
* `Rooms` 
* `Cleanliness` 
* `Service` 

In [None]:
## creating a function to use in the below for loop
def sorter(text):
    """Return the column index assigned to a score-category label.

    Surrounding whitespace is ignored. The six known labels map to 0-5 in
    the order listed below; any other label maps to 6.
    """
    categories = (
        "Value",
        "Location",
        "Sleep Quality",
        "Rooms",
        "Cleanliness",
        "Service",
    )
    label = text.strip()
    # Unknown labels share the slot right after the last known category.
    return categories.index(label) if label in categories else len(categories)
    

In [None]:
#Dropping NAs for the Score Breakdown column in order to continue with the filtering
df = df[df['Score Breakdown'].notna()].reset_index(drop = True)

# Build a (rows x 7) score matrix: the column position of each category is
# given by sorter(), with position 6 collecting any unrecognised label.
# Categories that a review does not mention stay at 0.
import numpy as np
ma = np.zeros(shape=(len(df), 7))

for i, breakdown in enumerate(df['Score Breakdown']):
    for entry in breakdown.split(";"):
        if not entry.strip():
            # A stray leading/trailing ';' yields an empty piece with nothing
            # to parse; skip it instead of failing on the missing ':'.
            continue
        colon = entry.index(':')
        # The score is the text between the ':' and the first 'of'
        # (entries are expected to look like "<label>: <score> of <max>").
        score = entry[colon + 1:entry.index('of')].strip()
        ma[i, sorter(entry[:colon])] = float(score)
            

In [None]:
# Turn the first six columns of the score matrix `ma` into integer DataFrame
# columns, then remove the raw 'Score Breakdown' text.
# NOTE(review): assigning 'Location' here replaces whatever that column held
# before (earlier cells store the city name in it) -- confirm this is intended.
score_columns = ['Value', 'Location', 'Sleep Quality', 'Rooms', 'Cleanliness', 'Service']
for position, column in enumerate(score_columns):
    df[column] = pd.Series(ma[:, position]).astype(int)
del df['Score Breakdown']

In [None]:
# Checking for NAs: print the per-column summary so the remaining missing
# values can be spotted before the next cleaning steps.
df.info()

#### Traveler Type Cleaning

In [None]:
# Reduce 'Traveler Type' to the last word of the text that follows the first
# space, then list the distinct values that result.
after_first_word = df['Traveler Type'].str.partition()[2]
df['Traveler Type'] = after_first_word.str.split().str[-1]
df['Traveler Type'].unique()

In [None]:
# Encode the traveler type as an integer code; labels that are not in the
# table (and missing values) become NaN.
traveler_codes = {
    'solo': 1,
    'couple': 2,
    'friends': 3,
    'family': 4,
    'business': 5,
}
df['Traveler Type'] = df['Traveler Type'].map(traveler_codes)

In [None]:
# Fill missing traveler types with a randomly drawn code so that no further
# rows have to be removed. The draw is unseeded, so results differ between runs.
fill_list = [1,2,3,4,5]
# fillna() aligns the replacement Series on the index, so the random codes are
# built on df.index; with a default 0..n-1 index they would only line up when
# df itself carries such an index.
random_codes = pd.Series(np.random.choice(fill_list, size=len(df)), index=df.index)
df['Traveler Type'] = df['Traveler Type'].fillna(random_codes).astype(int)

#### Price Range Cleaning

In [None]:
# Encode the price range by its number of '$' signs (1-4); values outside the
# table become NaN.
price_levels = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}
price_codes = df['Price Range'].map(price_levels)
# Missing price ranges are replaced by the mean of the encoded values.
df['Price Range'] = price_codes.fillna(price_codes.mean())


#### Hotel ID Cleaning

In [None]:
# Rows without a 'Hotel ID' are dropped rather than imputed (the original
# analysis reports 40 such rows; an ID identifies a hotel, so it cannot be
# guessed). The index is renumbered from 0 afterwards.
df = df.dropna(subset=['Hotel ID']).reset_index(drop=True)
# With no missing values left, the IDs can be stored as integers.
df['Hotel ID'] = df['Hotel ID'].astype(int)

## EDA

In [None]:
# Count the rows per 'Name' and show the five names with the highest count.
# NOTE(review): count() tallies rows (one per review), not distinct hotels,
# although the result is labelled 'Number of Hotels' -- confirm what 'Name'
# holds and whether nunique() was intended.
rows_per_name = df.groupby(['Name'])['Hotel ID'].count()
HotelCountPerCity = rows_per_name.to_frame('Number of Hotels')
print("Top 5 cities with most Hotels:")
HotelCountPerCity.sort_values(by="Number of Hotels", ascending=False).head()

#### Creating a new dataframe called MeanAndCount to analyze the different types of ratings. It contains:
* `Count` 
* `Value_mean` 
* `Location_mean` 
* `SleepQuality_mean`
* `Rooms_mean` 
* `Cleanliness_mean` 
* `Service_mean` 

In [None]:
# Per hotel: number of rows (reviews) and the mean of each of the six rating
# columns, flattened into one row per 'Hotel ID'.
rating_columns = ['Value', 'Location', 'Sleep Quality', 'Rooms', 'Cleanliness', 'Service']
aggregations = {'Hotel ID': ['count']}
aggregations.update({column: ['mean'] for column in rating_columns})
MeanAndCount = df.groupby(['Hotel ID']).agg(aggregations)
# Replace the two-level column labels with flat names; spaces are removed, so
# 'Sleep Quality' becomes 'SleepQuality_mean'.
MeanAndCount.columns = ['Count'] + [column.replace(' ', '') + '_mean' for column in rating_columns]
MeanAndCount = MeanAndCount.reset_index()



In [None]:
# The five hotels with the largest number of reviews.
print("Top 5 Hotels with most counts:")
by_review_count = MeanAndCount.sort_values("Count", ascending=False)
top = by_review_count.head(5)
top

In [None]:

# Grouped bar chart (no legend) of the six mean ratings for the most-reviewed hotels.
rating_means = [
    "Value_mean",
    "Location_mean",
    "SleepQuality_mean",
    "Rooms_mean",
    "Cleanliness_mean",
    "Service_mean",
]
top.plot(x="Hotel ID", y=rating_means, kind="bar", legend=None,
         title="Top 5 hotels with most counts")

In [None]:
# Rank hotels by mean Value rating, considering only hotels with more than 10
# reviews. The filter is stored back into MeanAndCount, so later cells see
# the reduced table.
print("Top 5 Hotels with highest Value ratings:")
enough_reviews = MeanAndCount['Count'] > 10
MeanAndCount = MeanAndCount.loc[enough_reviews]
top_value = MeanAndCount.sort_values("Value_mean", ascending=False).head(5)
top_value

#### Plotting top 5 hotels with highest mean of the six different categories

In [None]:
# Increasing size of output window
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>div.output_scroll { height: 65em; }</style>"))

# keeping number of reviews > 10
MeanAndCount = MeanAndCount[MeanAndCount['Count'] >10]

# Top 5 hotels for each rating category, highest category mean first.
top_value = MeanAndCount.sort_values(by="Value_mean", ascending=False).head()
top_sleepquality = MeanAndCount.sort_values(by="SleepQuality_mean", ascending=False).head()
top_location = MeanAndCount.sort_values(by="Location_mean", ascending=False).head()
top_rooms = MeanAndCount.sort_values(by="Rooms_mean", ascending=False).head()
top_cleanliness = MeanAndCount.sort_values(by="Cleanliness_mean", ascending=False).head()
top_service = MeanAndCount.sort_values(by="Service_mean", ascending=False).head()


from matplotlib import pyplot as plt

# Every chart shows the same six mean ratings per hotel.
rating_means = ["Value_mean", "Location_mean", "SleepQuality_mean",
                "Rooms_mean", "Cleanliness_mean", "Service_mean"]

# (hotels, chart title) pairs in display order; they are drawn two per figure.
panels = [
    (top_value, "Top 5 hotels with highest Value Ratings"),
    (top_sleepquality, "Top 5 hotels with highest Sleep Quality Ratings"),
    (top_location, "Top 5 hotels with highest Location Ratings"),
    (top_rooms, "Top 5 hotels with highest Rooms Rating"),
    (top_cleanliness, "Top 5 hotels with highest Cleanliness Ratings"),
    (top_service, "Top 5 hotels with highest Service Ratings"),
]

for start in range(0, len(panels), 2):
    fig, axes = plt.subplots(1, 2, figsize=(20, 5))
    for offset, (hotels, title) in enumerate(panels[start:start + 2]):
        # Only the very first chart carries a legend; the bars mean the same
        # thing in all six charts.
        legend = True if start + offset == 0 else None
        hotels.plot(x="Hotel ID", y=rating_means, kind="bar", legend=legend,
                    ax=axes[offset], title=title)


Looking at the graphs above and the minimum and maximum ratings below, we notice that Sleep Quality consistently receives low ratings, reaching a minimum of 1.5833 (not shown in the graphs). Hotel ID 16 has the highest Sleep Quality rating at 4.17, whereas Hotel ID 17 reaches 4.66 in Value ratings. Moreover, Cleanliness and Service have the two highest ratings in most graphs, each with a maximum of around 4.95, which suggests that few negative reviews will mention these two categories. Conversely, after doing the analysis we should expect many negative comments that mention Sleep Quality, Rooms, and Value.

In [None]:
# Get a series containing maximum value of each rating column
# drop() returns a new frame holding only the rating means. A plain
# `MeanAndCount1 = MeanAndCount` would be a second name for the same object,
# so deleting columns through it would also remove 'Hotel ID' and 'Count'
# from MeanAndCount (and make this cell fail when run a second time).
MeanAndCount1 = MeanAndCount.drop(columns=['Hotel ID', 'Count'])
maxRatings = MeanAndCount1.max()

print('Maximum average of each rating category : ')
print(maxRatings)

In [None]:
# Column-wise minimum of the per-hotel rating means held in MeanAndCount1,
# printed below a heading line.
minRatings = MeanAndCount1.min()
print('Minimum average of each rating category: ', minRatings, sep='\n')

#### Plotting a time series of Year vs. Count

In [None]:
# Number of reviews per calendar year, drawn as a line chart.
review_dates = df['Review Date']
reviews_per_year = review_dates.groupby([review_dates.dt.year]).agg('count')
reviews_per_year.plot(color='C2', kind="line", title="Trend of comments from 2005 to 2014")


The number of comments grows from 11 in 2005 to about 1,000 in 2013, an increase of around 9000% in 8 years. This implies that nowadays millions of comments are posted each day, far more than an analyst working in hotel management can track by hand; the process has to be automated, splitting comments into good or bad and giving advice on what to improve. This work is done using NLP and ML, as shown further in this analysis.

#### How are ratings Distributed

In [None]:

# Horizontal bar chart of how often each 'Star Rating' value occurs.
rating_counts = df['Star Rating'].value_counts()

plt.figure(figsize=[10, 5])   # [width, height]
plt.barh(list(rating_counts.index), list(rating_counts))

# Six evenly spaced ticks on the count axis, from 0 to 3000.
plt.xticks(np.linspace(0, 3000, 6), fontsize=10, family='fantasy', color='black')
plt.yticks(size=15)

# Title and axis labels share the same bold navy styling.
label_style = dict(weight='bold', color='navy')
plt.title('Distribution of Ratings', fontsize=20, loc='center', **label_style)
plt.xlabel('Count', fontsize=15, **label_style)
plt.ylabel('Ratings', fontsize=15, **label_style)
plt.legend(['Reviewer Score'], shadow=True, loc=4)