In [1]:
import pandas as pd
import pickle
import boto3
from io import BytesIO


In [2]:
# Feature table used for prediction; read from a timestamped parquet export
# with the pyarrow engine. The bare `df` below renders the frame as cell output.
df = pd.read_parquet(
    path="../output/l3_data_2024-09-10_18-13-22.parquet",
    engine="pyarrow",
)
df

Unnamed: 0,avg_rating,user_ratings_total,review_rating,review_date_in_days,review_length,distance_to_ski_resort,distance_to_city_center,hotel_id,region_Andorra la Vella,region_Canillo,...,word2vec_6,word2vec_7,word2vec_8,word2vec_9,sentiment_polarity,topic_0_the,topic_1_y,topic_2_the,topic_3_de,topic_4_and
0,4.4,581,5.0,730,336,6.893862,9.107310,26,False,False,...,0.243695,-0.060377,-0.495647,-0.192009,0.423571,0.987518,0.003082,0.003161,0.003082,0.003156
1,3.9,1194,3.0,60,216,4.926610,1.180006,131,False,False,...,0.002475,0.150331,0.035843,0.093723,0.196548,0.293537,0.004930,0.691541,0.004956,0.005037
2,4.2,1081,4.0,150,861,5.741960,1.934028,116,False,False,...,0.080041,-0.351631,0.039706,0.280915,0.189063,0.014445,0.007518,0.975661,0.001180,0.001195
3,3.8,601,4.0,730,126,6.124637,2.522790,65,False,False,...,-0.621732,0.161617,0.692527,-0.451414,-0.100000,0.008918,0.008769,0.964395,0.008788,0.009130
4,4.3,1257,4.0,150,128,5.588254,1.346464,79,False,False,...,-0.465931,0.263632,0.283626,0.015608,0.348333,0.633251,0.008378,0.008520,0.008365,0.341487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30753,4.3,1257,3.0,1460,173,5.588254,1.346464,79,False,False,...,0.731373,0.358356,-0.067819,-0.013850,0.147222,0.925986,0.032465,0.005227,0.031111,0.005211
30754,4.1,136,5.0,1460,32,9.017746,12.642774,140,False,True,...,1.671030,1.382455,-0.204079,-1.183821,0.600000,0.034412,0.033336,0.272269,0.033336,0.626646
30755,4.4,655,5.0,150,233,0.938402,4.448480,145,False,False,...,0.278396,-0.457367,-1.091695,0.205098,0.755000,0.004462,0.004368,0.903477,0.004364,0.083329
30756,4.2,1797,3.0,90,724,5.601433,1.583385,52,True,False,...,-0.100452,-0.220480,0.077032,0.140186,0.060417,0.212847,0.001447,0.675705,0.108538,0.001464


In [3]:
# Lookup table pairing each `hotel_name` with its `hotel_id`; read from a
# timestamped CSV export. The bare `hotel_map` below renders it as cell output.
hotel_map = pd.read_csv(
    filepath_or_buffer="../output/hotel_id_name_mapping_2024-09-26_06-17-06.csv",
)
hotel_map

Unnamed: 0,hotel_name,hotel_id
0,Borda del Pi,26
1,Insitu Eurotel Andorra,131
2,Hotel Spa Termes Carlemany,116
3,Hotel Camp del Serrat,65
4,Hotel Les Closes,79
...,...,...
162,VIP Plus,155
163,Hotel Riu Blanc,109
164,Apartamentos Canillo Ribagrossa,8
165,"Edificio Tifanis, Atico duplex",36


In [4]:
model_path = "../output/model_gradient_boosting_20240930_071847.pkl"

# Deserialize the pickled model from disk.
# NOTE(security): unpickling can execute arbitrary code, so only load
# model files that come from a trusted source.
with open(model_path, mode='rb') as file:
    model = pickle.loads(file.read())

# Bare expression so the notebook shows the loaded model's repr.
model

In [7]:
def get_hotel_id_from_name(hotel_name, hotel_map):
    """
    Look up the hotel_id that belongs to an exact hotel name.

    Args:
        hotel_name: Name compared for equality against the 'hotel_name' column.
        hotel_map: DataFrame with 'hotel_name' and 'hotel_id' columns.

    Returns:
        The 'hotel_id' of the first matching row, or None when no row matches.
    """
    name_matches = hotel_map['hotel_name'] == hotel_name
    matching_rows = hotel_map[name_matches]
    # Emptiness is checked before touching 'hotel_id', so a miss always yields None.
    return None if matching_rows.empty else matching_rows['hotel_id'].values[0]

def get_hotel_data_by_id(hotel_id, val_data):
    """
    Select every row of the dataset that belongs to one hotel.

    Args:
        hotel_id: Value compared for equality against the 'hotel_id' column.
        val_data: DataFrame containing a 'hotel_id' column.

    Returns:
        A DataFrame with the matching rows (original index preserved),
        or None when no row has that hotel_id.
    """
    id_matches = val_data['hotel_id'] == hotel_id
    hotel_rows = val_data[id_matches]
    return None if hotel_rows.empty else hotel_rows

def predict_hotel_rating(hotel_name, hotel_map, val_data, model, scaler):
    """
    Compare the model's mean predicted rating with the mean recorded rating of one hotel.

    The hotel name is resolved to a hotel_id through ``hotel_map``, all rows for
    that id are taken from ``val_data``, the feature columns are passed through
    ``scaler.transform`` and then ``model.predict``, and the per-row predictions
    are averaged. The predictions are averaged as returned by the model; no
    inverse scaling is applied to them.

    Args:
        hotel_name: Hotel name to look up in ``hotel_map``.
        hotel_map: DataFrame with 'hotel_name' and 'hotel_id' columns.
        val_data: DataFrame with 'hotel_id', 'avg_rating' and the feature columns.
        model: Object exposing ``predict``.
        scaler: Object exposing ``transform``.

    Returns:
        A dict with keys 'hotel_name', 'hotel_id', 'predicted_avg_rating',
        'actual_avg_rating', 'num_reviews' and 'actual_data_sample' (the first
        matching row as a dict), or a message string when the name or the id
        has no match.
    """
    # Resolve the name first; without an id there is nothing to look up.
    hotel_id = get_hotel_id_from_name(hotel_name, hotel_map)
    if hotel_id is None:
        return f"No hotel found with the name: {hotel_name}"

    # Collect every row recorded for this hotel.
    hotel_rows = get_hotel_data_by_id(hotel_id, val_data)
    if hotel_rows is None or hotel_rows.empty:
        return f"No data found for hotel ID: {hotel_id}"

    # Model inputs are all columns except the target and the identifier.
    features = hotel_rows.drop(columns=['avg_rating', 'hotel_id'])

    # Scale the features, then predict one rating per row.
    row_predictions = model.predict(scaler.transform(features))

    # Mean of the recorded target across this hotel's rows.
    actual_avg_rating = hotel_rows['avg_rating'].mean()

    # Mean of the per-row predictions.
    predicted_avg_rating = row_predictions.mean()

    # First row is included so the caller can inspect a concrete record.
    first_record = hotel_rows.head(1).to_dict(orient='records')[0]

    return {
        "hotel_name": hotel_name,
        "hotel_id": hotel_id,
        "predicted_avg_rating": predicted_avg_rating,
        "actual_avg_rating": actual_avg_rating,
        "num_reviews": len(hotel_rows),
        "actual_data_sample": first_record,
    }

In [None]:
# Helper that fetches the pickled scaler object stored in S3
def load_scaler_from_s3(s3_bucket, scaler_path):
    """
    Download a pickled scaler from S3 and unpickle it.

    Args:
        s3_bucket: Name of the S3 bucket holding the object.
        scaler_path: Object key of the pickled scaler inside the bucket.

    Returns:
        The object produced by unpickling the downloaded bytes.
    """
    client = boto3.client('s3')

    # Pull the whole object body into memory.
    s3_object = client.get_object(Bucket=s3_bucket, Key=scaler_path)
    payload = s3_object['Body'].read()

    # NOTE(security): unpickling can execute arbitrary code, so the bucket
    # and key must point at a trusted artifact.
    return pickle.load(BytesIO(payload))

# Fetch the pickled scaler from the project's S3 bucket; it is passed to
# predict_hotel_rating to transform the feature columns before prediction.
scaler = load_scaler_from_s3(
    s3_bucket='andorra-hotels-data-warehouse',
    scaler_path='model_training/validation/scaler_20240930.pkl',
)


In [13]:
# Hotel to evaluate; the lookup compares this name for exact equality.
hotel_name_input = "House Duró"
result = predict_hotel_rating(hotel_name_input, hotel_map, df, model, scaler)

# A dict is a successful prediction; anything else is the lookup-failure message.
if not isinstance(result, dict):
    print(result)
else:
    # One report field per output line.
    print(
        f"Hotel Name: {result['hotel_name']}",
        f"Hotel ID: {result['hotel_id']}",
        f"Predicted Avg Rating: {result['predicted_avg_rating']}",
        f"Actual Avg Rating: {result['actual_avg_rating']}",
        f"Number of Reviews: {result['num_reviews']}",
        f"Sample Data: {result['actual_data_sample']}",
        sep="\n",
    )

Hotel Name: Borda del Pi
Hotel ID: 26
Predicted Avg Rating: 3.7508428724679135
Actual Avg Rating: 4.3999999999999995
Number of Reviews: 300
Sample Actual Data: {'avg_rating': 4.4, 'user_ratings_total': 581, 'review_rating': 5.0, 'review_date_in_days': 730, 'review_length': 336, 'distance_to_ski_resort': 6.893862270927463, 'distance_to_city_center': 9.10731018063548, 'hotel_id': 26, 'region_Andorra la Vella': False, 'region_Canillo': False, 'region_Encamp': False, 'region_Escaldes-Engordany': False, 'region_La Massana': False, 'region_Ordino': True, 'region_Sant Julià de Lòria': False, 'lang_ca': False, 'lang_en': True, 'lang_es': False, 'lang_fr': False, 'lang_other': False, 'word2vec_0': -1.7285411357879639, 'word2vec_1': -1.0261670351028442, 'word2vec_2': 0.7095386385917664, 'word2vec_3': -0.02739611268043518, 'word2vec_4': -0.7085558772087097, 'word2vec_5': -0.07339941710233688, 'word2vec_6': 0.24369466304779053, 'word2vec_7': -0.06037738174200058, 'word2vec_8': -0.4956468641757965,

