In [39]:
import pandas as pd
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [40]:
# Read the validation split produced by the upstream pipeline (pyarrow backend).
val_data = pd.read_parquet(
    "../output/validation_data_2024-09-26_05-57-30.parquet",
    engine='pyarrow',
)

# Bare last expression -> rich (truncated) DataFrame preview.
val_data

Unnamed: 0,user_ratings_total,review_rating,review_date_in_days,review_length,distance_to_ski_resort,distance_to_city_center,hotel_id,region_Andorra la Vella,region_Canillo,region_Encamp,...,word2vec_7,word2vec_8,word2vec_9,sentiment_polarity,topic_0_the,topic_1_y,topic_2_the,topic_3_de,topic_4_and,avg_rating
0,276,5.0,0,192,5.503265,0.976173,142,False,False,True,...,-0.384664,0.874879,-0.064979,0.664000,0.005705,0.005574,0.565350,0.050357,0.373013,4.5
1,242,4.0,330,311,1.547996,6.303684,100,False,False,False,...,-0.563974,-0.326978,-0.151919,0.477778,0.079122,0.003544,0.844726,0.003542,0.069066,4.3
2,2281,3.0,1460,50,5.667566,1.410931,147,True,False,False,...,-1.180566,-0.545805,-0.889910,0.850000,0.764521,0.020056,0.020536,0.020028,0.174858,4.0
3,240,1.0,2555,770,5.617771,5.173413,44,False,False,True,...,0.375162,-0.432637,-0.600627,-0.140972,0.347313,0.012637,0.637287,0.001376,0.001387,2.5
4,396,4.0,0,93,5.573342,1.263616,82,True,False,False,...,0.013958,-0.612689,-0.653998,0.522222,0.012051,0.011814,0.502156,0.011818,0.462161,3.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071,1031,5.0,730,30,5.233107,0.223623,4,False,False,False,...,0.622019,-1.069862,-0.317080,0.800000,0.033586,0.033415,0.033587,0.033355,0.866057,4.6
3072,2281,4.0,60,644,5.667566,1.410931,147,False,False,True,...,-0.245219,-0.394026,-0.220700,0.224750,0.181821,0.001874,0.812654,0.001808,0.001843,4.0
3073,1080,1.0,120,224,0.754004,4.885797,165,True,False,False,...,0.145624,0.073493,-0.536434,0.050000,0.004480,0.004364,0.982415,0.004359,0.004383,4.0
3074,211,5.0,2190,126,1.531007,3.987046,48,False,False,False,...,-0.212813,-0.587430,-0.188501,0.358929,0.592649,0.008011,0.008222,0.047983,0.343135,3.9


In [41]:
# Lookup table pairing each hotel_name with its numeric hotel_id.
hotel_map = pd.read_csv(
    "../output/hotel_id_name_mapping_2024-09-26_06-17-06.csv"
)

# Bare last expression -> rich (truncated) DataFrame preview.
hotel_map

Unnamed: 0,hotel_name,hotel_id
0,Borda del Pi,26
1,Insitu Eurotel Andorra,131
2,Hotel Spa Termes Carlemany,116
3,Hotel Camp del Serrat,65
4,Hotel Les Closes,79
...,...,...
162,VIP Plus,155
163,Hotel Riu Blanc,109
164,Apartamentos Canillo Ribagrossa,8
165,"Edificio Tifanis, Atico duplex",36


In [42]:
# Path to the pickled model artifact written by the training run.
model_path = "../output/model_gradient_boosting_20240926_061553.pkl"

# Deserialize the estimator from the binary pickle.
# NOTE: unpickling can execute arbitrary code, so only load artifacts from a trusted source.
with open(model_path, mode='rb') as model_file:
    model = pickle.load(model_file)

# Show the estimator repr as the cell output.
model

In [43]:
# Keep the hotel ids aside so each prediction can be labelled with its hotel.
hotel_ids = val_data['hotel_id']

# Separate the features (X) from the target (y).
# Only the target column is removed: hotel_id stays in X_val and is therefore
# passed to the model as a feature.
X_val = val_data.drop(columns=['avg_rating'])
y_val = val_data['avg_rating']  # true values

# Predict the target for every validation row.
predictions = model.predict(X_val)

# Build the actual-vs-predicted table in a single chain (no in-place mutation,
# so re-running the cell always yields the same frame):
#   1. pair each prediction with its true value and hotel id,
#   2. look up the hotel name; validate='many_to_one' makes pandas raise a
#      MergeError if hotel_map ever contains a duplicated hotel_id, instead of
#      silently duplicating comparison rows,
#   3. drop both id columns and expose the name under a display-friendly header.
comparison_df = (
    pd.DataFrame({
        'Hotel_ID': hotel_ids,
        'Actual': y_val,
        'Predicted': predictions,
    })
    .merge(
        hotel_map,
        left_on='Hotel_ID',
        right_on='hotel_id',
        how='left',
        validate='many_to_one',
    )
    .drop(columns=['Hotel_ID', 'hotel_id'])
    .rename(columns={'hotel_name': 'Hotel Name'})
)

# Display the comparison DataFrame
comparison_df



Unnamed: 0,Actual,Predicted,Hotel Name
0,4.5,4.015771,Pierre & Vacances Hotel Starc - Premium
1,4.3,3.860161,Hotel Palarine
2,4.0,4.074650,Sercotel Delfos Andorra
3,2.5,3.964157,HOTEL ALFA
4,3.9,4.016149,Hotel Marfany
...,...,...,...
3071,4.6,3.984811,Andorra Park Hotel
3072,4.0,4.075239,Sercotel Delfos Andorra
3073,4.0,3.848657,Yomo Imperial Hotel
3074,3.9,3.862066,HOTEL RESIDÈNCIA ALDOSA


In [44]:
# Calculate metrics.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, whereas the explicit root works on every version.
rmse = mean_squared_error(y_val, predictions) ** 0.5
mae = mean_absolute_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

# Function to calculate adjusted R²
def adjusted_r_squared(r2, n, p):
    """
    Return the adjusted R², i.e. R² penalised for the number of predictors.

    Computed as ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination.
    n : int
        Number of samples.
    p : int
        Number of features (predictors).

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``. The statistic is undefined there: the
        denominator is zero or negative, which would otherwise surface as a
        ZeroDivisionError or as a meaningless value.
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        raise ValueError(
            f"Adjusted R² is undefined for n={n}, p={p}: need n > p + 1"
        )
    return 1 - (1 - r2) * ((n - 1) / residual_dof)

# Rows of the feature matrix are the samples (n); its columns are the features (p).
n, p = X_val.shape

# Adjusted R² penalises the plain R² for the number of features used.
adj_r2 = adjusted_r_squared(r2, n, p)

# One metric per line, in the order RMSE, MAE, R², adjusted R².
print(
    f"RMSE: {rmse}",
    f"MAE: {mae}",
    f"R²: {r2}",
    f"Adjusted R²: {adj_r2}",
    sep="\n",
)

RMSE: 0.41308212832887786
MAE: 0.3374374203370776
R²: -0.38795981761365694
Adjusted R²: -0.40393961814539336


In [45]:
def get_hotel_id_from_name(hotel_name, hotel_map):
    """
    Look up the hotel_id of the first ``hotel_map`` row whose ``hotel_name``
    equals ``hotel_name`` exactly.

    Returns None when no row matches.
    """
    name_matches = hotel_map['hotel_name'] == hotel_name
    candidates = hotel_map[name_matches]
    if not candidates.empty:
        # Several rows may share the name; the first one wins.
        matched_ids = candidates['hotel_id'].values
        return matched_ids[0]
    return None

def get_hotel_data_by_id(hotel_id, val_data):
    """
    Select every row of ``val_data`` whose ``hotel_id`` equals ``hotel_id``.

    Returns the matching rows as a DataFrame (possibly more than one row),
    or None when there is no match.
    """
    id_matches = val_data['hotel_id'] == hotel_id
    rows = val_data.loc[id_matches]
    return None if rows.empty else rows

def predict_hotel_rating(hotel_name, hotel_map, val_data, model):
    """
    Given a hotel name, retrieve the hotel_id, fetch its data, and make a prediction.

    ``val_data`` can hold several rows for the same hotel; the prediction, the
    actual rating and the reference record are all taken from the FIRST
    matching row.

    Parameters
    ----------
    hotel_name : str
        Name to look up in ``hotel_map`` (exact match).
    hotel_map : DataFrame
        Mapping with ``hotel_name`` and ``hotel_id`` columns.
    val_data : DataFrame
        Feature table with a ``hotel_id`` column and the ``avg_rating`` target.
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    dict or str
        On success, a dict with keys ``hotel_name``, ``hotel_id``,
        ``predicted_rating``, ``actual_avg_rating`` and ``actual_data``.
        If the name or its data cannot be found, a human-readable message
        string is returned instead (callers distinguish the cases with
        ``isinstance(result, dict)``).
    """
    # Step 1: Get hotel_id from the hotel name
    hotel_id = get_hotel_id_from_name(hotel_name, hotel_map)
    if hotel_id is None:
        return f"No hotel found with the name: {hotel_name}"

    # Step 2: Get all rows recorded for this hotel
    hotel_rows = get_hotel_data_by_id(hotel_id, val_data)
    if hotel_rows is None:
        return f"No data found for hotel ID: {hotel_id}"

    # Step 3: Only the first row is reported, so slice it out (as a one-row
    # DataFrame) before predicting instead of scoring and dict-converting every
    # row and discarding all but element 0.
    # Assumes the model scores each row independently, so the result is unchanged.
    first_row = hotel_rows.iloc[[0]]

    # Step 4: Prepare the features. Only the target is dropped; hotel_id stays
    # in the feature matrix.
    X_hotel = first_row.drop(columns=['avg_rating'])

    # Step 5: Make the prediction and read the true value from the same row
    predicted_rating = model.predict(X_hotel)[0]
    actual_avg_rating = first_row['avg_rating'].values[0]

    # Step 6: Return the result
    return {
        "hotel_name": hotel_name,
        "hotel_id": hotel_id,
        "predicted_rating": predicted_rating,
        "actual_avg_rating": actual_avg_rating,
        "actual_data": first_row.to_dict(orient='records')[0]  # Original data for reference
    }

In [46]:
hotel_name_input = "HOTEL ALFA"
result = predict_hotel_rating(hotel_name_input, hotel_map, val_data, model)

# predict_hotel_rating returns a dict on success and a plain message string otherwise.
if not isinstance(result, dict):
    print(result)
else:
    report_lines = [
        f"Hotel Name: {result['hotel_name']}",
        f"Hotel ID: {result['hotel_id']}",
        f"Predicted Rating: {result['predicted_rating']}",
        f"Actual Avg Rating: {result['actual_avg_rating']}",
        f"Actual Data: {result['actual_data']}",
    ]
    print("\n".join(report_lines))

Hotel Name: HOTEL ALFA
Hotel ID: 44
Predicted Rating: 3.9641567379937195
Actual Avg Rating: 2.5
Actual Data: {'user_ratings_total': 240, 'review_rating': 1.0, 'review_date_in_days': 2555, 'review_length': 770, 'distance_to_ski_resort': 5.617770775517116, 'distance_to_city_center': 5.173412880492371, 'hotel_id': 44, 'region_Andorra la Vella': False, 'region_Canillo': False, 'region_Encamp': True, 'region_Escaldes-Engordany': False, 'region_La Massana': False, 'region_Ordino': False, 'region_Sant Julià de Lòria': False, 'lang_ca': False, 'lang_en': False, 'lang_es': True, 'lang_fr': False, 'lang_other': False, 'word2vec_0': -1.8311707973480225, 'word2vec_1': -1.6264880895614624, 'word2vec_2': -0.2812776565551758, 'word2vec_3': -0.8127309679985046, 'word2vec_4': -0.09735940396785736, 'word2vec_5': -0.07398884743452072, 'word2vec_6': 0.27463042736053467, 'word2vec_7': 0.3751620352268219, 'word2vec_8': -0.43263670802116394, 'word2vec_9': -0.600626528263092, 'sentiment_polarity': -0.14097222

