# Trial use of XGBoost


## Create simulated data

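- Take Kaggle hotel customer satisfaction file: https://www.kaggle.com/datasets/michelhatab/hotel-reviews-bookingcom/code (TODO: this link is the code page of the Booking.com reviews dataset listed below, not the satisfaction-score dataset; replace with the correct URL)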
- And Kaggle Booking.com customer comments file: https://www.kaggle.com/datasets/michelhatab/hotel-reviews-bookingcom/data


TO DO:
- Tidy up and generate scores
- Start with the numeric scores only, then add the categorical fields
- Combine the customer comments from the other file



## Libraries and Functions

In [6]:
# Libraries
#

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('ggplot')  

In [7]:
# Establish valid data file path, for reading and saving eg CSV files
#
# DATA_PATH is the 'Data_Sources' folder located in the parent of the current
# working directory. The check below stops the notebook immediately if that
# folder is missing.

import os

DATA_PATH = os.path.join(os.path.dirname(os.getcwd()), 'Data_Sources')
if not os.path.isdir(DATA_PATH):
    # A specific error type (still an Exception subclass) instead of a bare Exception
    raise FileNotFoundError(f'Directory not found: {DATA_PATH}')

In [8]:
# A simple utility function to obtain and summarise key elements of a given dataframe

def print_file_summary(data_frame):
    """Display a one-row-per-column summary table of ``data_frame``.

    Columns of the summary:
      Count   - non-null cells
      Missing - null cells
      Empty   - cells equal to the empty string ''
      Unique  - distinct values (list cells are compared by their string form)
      Type    - the column dtype
      String / Int / Float / List - cells that are instances of that Python type

    The table is shown via ``display``; nothing is returned.
    NOTE(review): ``display`` is assumed to be the global provided by the
    notebook environment - it is not imported in this cell.
    """
    def cells_of_type(python_type):
        # Per-column count of cells that are instances of python_type
        return data_frame.map(lambda cell: isinstance(cell, python_type)).sum()

    # Lists are unhashable, so they are stringified before distinct values are counted
    hashable_cells = data_frame.map(
        lambda cell: str(cell) if isinstance(cell, list) else cell)

    summary_of_df = pd.DataFrame({
        'Count': data_frame.count(),
        'Missing': data_frame.isnull().sum(),
        'Empty': (data_frame == '').sum(),
        'Unique': hashable_cells.nunique(),
        'Type': data_frame.dtypes,
        'String': cells_of_type(str),
        'Int': cells_of_type(int),
        'Float': cells_of_type(float),
        'List': cells_of_type(list),
    })

    display(summary_of_df)

## Files Load & Manipulate

In [9]:
# Load Source CSV files

def _load_source_csv(file_name):
    """Read ``file_name`` from the DATA_PATH folder into a DataFrame.

    Raises:
        FileNotFoundError: when no file of that name exists in DATA_PATH
            (the message contains the full path that was tried).
    """
    file_path = os.path.join(DATA_PATH, file_name)
    if not os.path.isfile(file_path):
        # Specific error type (still an Exception subclass) with the same message as before
        raise FileNotFoundError(f'File not found: {file_path}')
    return pd.read_csv(file_path)


hotel_reviews_df = _load_source_csv('booking_com_reviews.csv')
hotels_satisfaction_df = _load_source_csv('hotel_satisfaction_scores.csv')

In [10]:
# Initial look at the data
#
# A notebook cell only renders its LAST bare expression, so each frame is passed
# to display() explicitly; otherwise the first frame is never shown.

print_file_summary(hotel_reviews_df)
display(hotel_reviews_df)

print_file_summary(hotels_satisfaction_df)
display(hotels_satisfaction_df)

Unnamed: 0,Count,Missing,Empty,Unique,Type,String,Int,Float,List
Title,1521,2,0,455,object,1521,0,2,0
PositiveReview,775,748,0,768,object,775,0,748,0
NegativeReview,434,1089,0,377,object,434,0,1089,0
Score,1523,0,0,12,float64,0,0,1523,0
GuestName,1523,0,0,1055,object,1523,0,0,0
GuestCountry,1523,0,0,68,object,1523,0,0,0
RoomType,1460,63,0,7,object,1460,0,63,0
NumberOfNights,1523,0,0,15,object,1523,0,0,0
VisitDate,1523,0,0,20,object,1523,0,0,0
GroupType,1523,0,0,4,object,1523,0,0,0


Unnamed: 0,Count,Missing,Empty,Unique,Type,String,Int,Float,List
id,103904,0,0,103904,int64,0,103904,0,0
Gender,103904,0,0,2,object,103904,0,0,0
Age,103904,0,0,75,int64,0,103904,0,0
purpose_of_travel,103904,0,0,5,object,103904,0,0,0
Type of Travel,103904,0,0,2,object,103904,0,0,0
Type Of Booking,103904,0,0,3,object,103904,0,0,0
Hotel wifi service,103904,0,0,6,int64,0,103904,0,0
Departure/Arrival convenience,103904,0,0,6,int64,0,103904,0,0
Ease of Online booking,103904,0,0,6,int64,0,103904,0,0
Hotel location,103904,0,0,6,int64,0,103904,0,0


Unnamed: 0,id,Gender,Age,purpose_of_travel,Type of Travel,Type Of Booking,Hotel wifi service,Departure/Arrival convenience,Ease of Online booking,Hotel location,Food and drink,Stay comfort,Common Room entertainment,Checkin/Checkout service,Other service,Cleanliness,satisfaction
0,70172,Male,13,aviation,Personal Travel,Not defined,3,4,3,1,5,5,5,4,5,5,neutral or dissatisfied
1,5047,Male,25,tourism,Group Travel,Group bookings,3,2,3,3,1,1,1,1,4,1,neutral or dissatisfied
2,110028,Female,26,tourism,Group Travel,Group bookings,2,2,2,2,5,5,5,4,4,5,satisfied
3,24026,Female,25,tourism,Group Travel,Group bookings,2,5,5,5,2,2,2,1,4,2,neutral or dissatisfied
4,119299,Male,61,aviation,Group Travel,Group bookings,3,3,3,3,4,5,3,3,3,3,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,Female,23,business,Group Travel,Individual/Couple,2,1,2,3,2,2,2,2,3,2,neutral or dissatisfied
103900,73097,Male,49,tourism,Group Travel,Group bookings,4,4,4,4,2,5,5,5,5,4,satisfied
103901,68825,Male,30,tourism,Group Travel,Group bookings,1,1,1,3,4,5,4,5,5,4,neutral or dissatisfied
103902,54173,Female,22,business,Group Travel,Individual/Couple,1,1,1,5,1,1,1,5,4,1,neutral or dissatisfied


In [11]:
# First Cut data for trial on XGBoost

import random
def gen_random(text_satisfaction, cleanliness):
    """Fabricate an overall score (0 to 5) from a satisfaction label and a cleanliness score.

    - label contains 'neutral or dissatisfied': 0 when cleanliness is 0 or 1,
      otherwise a random 1 or 2
    - label contains 'satisfied': 5 when cleanliness is 5, otherwise a random 3 or 4
    - any other label: 5

    The random picks use the module-level ``random`` generator, so results are
    not repeatable unless the caller seeds it.
    """
    # 'dissatisfied' contains 'satisfied', so the negative label must be tested first
    if 'neutral or dissatisfied' in text_satisfaction:
        return 0 if cleanliness in (0, 1) else random.randint(1, 2)

    if 'satisfied' in text_satisfaction:
        return 5 if cleanliness == 5 else random.randint(3, 4)

    # Unrecognised label
    return 5

# Copy a new df so the loaded source frame is left untouched
hotels_satisfaction_new = hotels_satisfaction_df.copy()

# Fabricate an overall satisfaction score 0:5 based on the text satisfaction in source file.
# Only the two columns gen_random reads are iterated (in row order); this avoids
# building a full row object per record as apply(axis=1) does.
# NOTE: no random seed is set in this cell, so the fabricated scores can differ between runs.
hotels_satisfaction_new['score_overall'] = [
    gen_random(text_satisfaction, cleanliness)
    for text_satisfaction, cleanliness in zip(hotels_satisfaction_new['satisfaction'],
                                              hotels_satisfaction_new['Cleanliness'])
]
hotels_satisfaction_new.drop('satisfaction', axis=1, inplace=True)

# Change column names for all scores that are kept in this first cut
score_columns = {'Hotel wifi service': 'score_wifi',
                 'Ease of Online booking': 'score_booking',
                 'Hotel location': 'score_location',
                 'Food and drink': 'score_restaurant',
                 'Stay comfort': 'score_spa',
                 'Common Room entertainment': 'score_sports',
                 'Checkin/Checkout service': 'score_checkin',
                 'Other service': 'score_local_sites',
                 'Cleanliness': 'score_housekeeping'
                 }
hotels_satisfaction_new.rename(columns=score_columns, inplace=True)

# Drop all non-numeric columns to begin with
columns = ['id', 'Gender', 'purpose_of_travel', 'Type of Travel', 'Type Of Booking']
hotels_satisfaction_new.drop(columns, axis=1, inplace=True)

# Drop every remaining column that is not explicitly kept. In practice this is the
# departure/arrival convenience score, which is excluded from this first cut.
# It used to be removed by position (columns[2]) because its header could not be
# matched by name (presumably stray whitespace in the CSV header - TODO confirm).
# Dropping by "not in the keep list" gives the same result without depending on
# column order or on that header's exact spelling.
kept_columns = {'Age', 'score_overall', *score_columns.values()}
leftover_columns = [name for name in hotels_satisfaction_new.columns
                    if name not in kept_columns]
hotels_satisfaction_new.drop(columns=leftover_columns, inplace=True)


print_file_summary(hotels_satisfaction_new)
display(hotels_satisfaction_new)

# Save to CSV file (the row index is written as the first, unnamed column)
file_name = 'hotel_reviews.csv'
file_path = os.path.join(DATA_PATH, file_name)
hotels_satisfaction_new.to_csv(file_path)
print(f'Data saved to {file_path}')

Unnamed: 0,Count,Missing,Empty,Unique,Type,String,Int,Float,List
Age,103904,0,0,75,int64,0,103904,0,0
score_wifi,103904,0,0,6,int64,0,103904,0,0
score_booking,103904,0,0,6,int64,0,103904,0,0
score_location,103904,0,0,6,int64,0,103904,0,0
score_restaurant,103904,0,0,6,int64,0,103904,0,0
score_spa,103904,0,0,6,int64,0,103904,0,0
score_sports,103904,0,0,6,int64,0,103904,0,0
score_checkin,103904,0,0,6,int64,0,103904,0,0
score_local_sites,103904,0,0,6,int64,0,103904,0,0
score_housekeeping,103904,0,0,6,int64,0,103904,0,0


Unnamed: 0,Age,score_wifi,score_booking,score_location,score_restaurant,score_spa,score_sports,score_checkin,score_local_sites,score_housekeeping,score_overall
0,13,3,3,1,5,5,5,4,5,5,1
1,25,3,3,3,1,1,1,1,4,1,0
2,26,2,2,2,5,5,5,4,4,5,5
3,25,2,5,5,2,2,2,1,4,2,2
4,61,3,3,3,4,5,3,3,3,3,4
...,...,...,...,...,...,...,...,...,...,...,...
103899,23,2,2,3,2,2,2,2,3,2,2
103900,49,4,4,4,2,5,5,5,5,4,4
103901,30,1,1,3,4,5,4,5,5,4,1
103902,22,1,1,5,1,1,1,5,4,1,0


Data saved to /Users/stuartgow/GitHub/Travel_Co_Analysis/Data_Sources/hotel_reviews.csv
