# Hotel Customer Satisfaction - Data Creation

Generate data for customer experiences in an example hotel. The generated data consists of:

- XXXX
- xxxx

The initial data was taken from an example of customer feedback for a real hotel on Booking.com.

- https://www.kaggle.com/datasets/michelhatab/hotel-reviews-bookingcom/code
- ????? manipulated, additional data applied

In [11]:
# Libraries
#

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('ggplot')  


In [12]:
# Establish valid data file path, for reading and saving eg CSV files
#

import os

# The data folder is expected inside the parent of the current working
# directory, i.e. as a sibling of the folder this notebook is run from.
data_directory = 'Data_Sources'
project_directory = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(project_directory, data_directory)

# Fail early with a specific, catchable error rather than a bare Exception.
# FileNotFoundError is still an Exception subclass, so broad handlers keep working.
if not os.path.isdir(DATA_PATH):
    raise FileNotFoundError(f'Directory not found: {DATA_PATH}')

# Tidy Up
del data_directory, project_directory

In [35]:
# A simple utility function to obtain and summarise key elements of a given dataframe

def print_file_summary(data_frame):
    # Create a temporary df and ensure no lists remain, so that unique items can be identified for uniqueness
    temp_df = data_frame.copy()
    temp_df = temp_df.map(lambda cell: str(cell) if isinstance(cell, list) else cell)
    
    # Calculate Data Types 
    summary_of_df = pd.DataFrame({'Count': data_frame.count(),
                                 'Missing': data_frame.isnull().sum(), 'Empty': 0,
                                 'Unique': temp_df.nunique(),
                                 'Type': data_frame.dtypes, 
                                 'String': 0, 'Int': 0, 'Float': 0, 'List': 0
                                 })
    summary_of_df['Empty'] = (data_frame == '').sum()
    summary_of_df['String'] = data_frame.map(lambda cell: isinstance(cell, str)).sum()
    summary_of_df['Int'] = data_frame.map(lambda cell: isinstance(cell, int)).sum()
    summary_of_df['Float'] = data_frame.map(lambda cell: isinstance(cell, float)).sum()
    summary_of_df['List'] = data_frame.map(lambda cell: isinstance(cell, list)).sum()

    display(summary_of_df)


## Source Data Load and Check


In [54]:
# Load Source CSV
file_name = 'hotel_reviews.csv'
file_path = os.path.join(DATA_PATH, file_name)

# Check up front so a missing file is reported with its resolved path, using
# FileNotFoundError (an Exception subclass) rather than a bare Exception.
if not os.path.isfile(file_path):
    raise FileNotFoundError(f'File not found: {file_path}')

hotel_reviews_df = pd.read_csv(file_path)

# Data Cleaning
# Cast the score to int and shift the range from 1:10 to 0:9 for the classifier
hotel_reviews_df['Score'] = hotel_reviews_df['Score'].astype(int) - 1


# Tidy Up
del file_name, file_path

In [None]:
#| label: csv_file_summary

# Preview the first three rows twice: once as plain text, once via display()
first_rows = hotel_reviews_df.head(3)
print(first_rows)
display(first_rows)

# Tidy Up
del first_rows


In [None]:
#| label: csv_file_types

# Show the per-column summary built by print_file_summary for the loaded reviews:
# counts, missing/empty cells, distinct values, dtypes and Python-type tallies
print_file_summary(hotel_reviews_df)



Invalid columns:
Title: object, PositiveReview: object, NegativeReview: object, GuestName: object, GuestCountry: object, RoomType: object, 
NumberOfNights: object, VisitDate: object, GroupType: object, PropertyResponse: object

In [None]:
#| label: csv_file_statistics

# Print pandas' descriptive statistics for the loaded reviews.
# NOTE(review): describe() presumably covers only numeric columns by default, so
# with the object-typed columns listed above this may show little beyond 'Score' - confirm
print(hotel_reviews_df.describe())

## Analysis ....

In [None]:
# XGBoost Model Training

import xgboost as xgb
from sklearn.model_selection import train_test_split

# Target Y is the customer satisfaction score; features X are every other column
# NOTE(review): the other columns were reported as object dtype, which XGBoost
# presumably cannot train on without encoding - confirm before relying on this cell
Y = hotel_reviews_df['Score']
X = hotel_reviews_df.drop(columns='Score')

# Random split holding back 20% of the rows for testing (seeded for repeatability)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Create the classifier with its default settings, then fit it to the training rows
model = xgb.XGBClassifier()
model.fit(X_train, Y_train)


In [None]:
# Model Predictions & Feature Importance



In [None]:
# Model Evaluation & Feature Importance

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Predictions
y_pred = model.predict(X_test)


# The held-out labels are named Y_test (capital Y) by the train/test split;
# the lowercase y_test used previously was never defined and raised NameError.

# Accuracy
accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion Matrix
conf_matrix = confusion_matrix(Y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='g')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Classification Report
print(classification_report(Y_test, y_pred))


# Feature Importance
# Set the default figure size BEFORE plotting: rcParams only affects figures
# created afterwards, so setting it after plot_importance had no effect on this chart.
plt.rcParams['figure.figsize'] = [12, 9]
xgb.plot_importance(model)
plt.show()