In [1]:
import pandas as pd
import pickle
import boto3
from io import BytesIO


In [2]:
# Load the per-review feature table written by an earlier pipeline run; the
# timestamp in the file name identifies which run produced it.
df = pd.read_parquet(
    "../output/nlp_data_2024-09-30_18-18-30.parquet",
    engine="pyarrow",
)
df

Unnamed: 0,avg_rating,user_ratings_total,review_rating,review_date_in_days,review_length,distance_to_ski_resort,distance_to_city_center,hotel_id,region_Andorra la Vella,region_Canillo,...,word2vec_6,word2vec_7,word2vec_8,word2vec_9,sentiment_polarity,topic_0_THE,topic_1_the,topic_2_the,topic_3_the,topic_4_de
0,4.4,581,5.0,730,336,6.893862,9.107310,26,False,False,...,0.160082,-0.148101,0.471147,0.062413,0.423571,0.003080,0.003150,0.525369,0.465319,0.003082
1,3.9,1194,3.0,60,216,4.926610,1.180006,131,False,False,...,-0.039460,0.173151,-0.038579,-0.365918,0.196548,0.004930,0.689656,0.295480,0.005009,0.004925
2,4.2,1081,4.0,150,861,5.741960,1.934028,116,False,False,...,-0.133936,-0.357274,-0.188142,0.232284,0.189063,0.001178,0.798811,0.001212,0.191237,0.007562
3,3.8,601,4.0,730,126,6.124637,2.522790,65,False,False,...,-0.752899,0.428369,-0.486332,-0.352189,-0.100000,0.008714,0.964740,0.008902,0.008916,0.008727
4,4.3,1257,4.0,150,128,5.588254,1.346464,79,False,False,...,-0.290174,0.334239,-0.028866,-0.724074,0.348333,0.008351,0.008518,0.966195,0.008565,0.008372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30753,4.3,1257,3.0,1460,173,5.588254,1.346464,79,False,False,...,0.475532,0.286418,0.131844,0.311543,0.147222,0.063531,0.005197,0.093327,0.805021,0.032925
30754,4.1,136,5.0,1460,32,9.017746,12.642774,140,False,True,...,0.575437,1.176968,0.472411,0.071657,0.600000,0.033395,0.034315,0.865169,0.033784,0.033337
30755,4.4,655,5.0,150,233,0.938402,4.448480,145,False,False,...,0.041048,-0.739100,0.845028,0.212180,0.755000,0.004353,0.982335,0.004486,0.004463,0.004363
30756,4.2,1797,3.0,90,724,5.601433,1.583385,52,True,False,...,-0.032448,-0.185168,-0.150236,0.000092,0.060417,0.001448,0.575156,0.075258,0.346675,0.001464


In [3]:
# Lookup table pairing each hotel's display name with its numeric hotel_id.
hotel_map = pd.read_csv(
    "../output/hotel_id_name_mapping_2024-09-26_06-17-06.csv"
)
hotel_map

Unnamed: 0,hotel_name,hotel_id
0,Borda del Pi,26
1,Insitu Eurotel Andorra,131
2,Hotel Spa Termes Carlemany,116
3,Hotel Camp del Serrat,65
4,Hotel Les Closes,79
...,...,...
162,VIP Plus,155
163,Hotel Riu Blanc,109
164,Apartamentos Canillo Ribagrossa,8
165,"Edificio Tifanis, Atico duplex",36


In [4]:
# Location of the serialized estimator (a random forest, judging by the file name).
model_path = "../output/model_random_forest_20240930_184443.pkl"

# NOTE: unpickling executes code embedded in the file, so only load artefacts
# that were produced by a trusted training run.
with open(model_path, mode="rb") as file:
    model = pickle.load(file)

model

In [5]:
def get_hotel_id_from_name(hotel_name, hotel_map):
    """
    Look up the identifier stored for ``hotel_name`` in the mapping table.

    The name is compared for exact equality against the ``hotel_name`` column
    of ``hotel_map``. Returns the ``hotel_id`` value of the first matching row,
    or ``None`` when no row matches.
    """
    name_matches = hotel_map['hotel_name'] == hotel_name
    if not name_matches.any():
        return None
    # Several rows may share a name; the first one in table order wins.
    return hotel_map.loc[name_matches, 'hotel_id'].values[0]

def get_hotel_data_by_id(hotel_id, val_data):
    """
    Select every row of ``val_data`` whose ``hotel_id`` column equals ``hotel_id``.

    Returns the filtered DataFrame (original index preserved), or ``None`` when
    no row carries that id.
    """
    rows_for_hotel = val_data.loc[val_data['hotel_id'] == hotel_id]
    return None if rows_for_hotel.empty else rows_for_hotel

def predict_hotel_rating(hotel_name, hotel_map, val_data, model, scaler):
    """
    Compare the model's mean predicted rating for one hotel with its recorded mean rating.

    The hotel name is resolved to an id via ``hotel_map``, all rows for that id
    are pulled from ``val_data``, the feature columns are passed through
    ``scaler.transform`` and then ``model.predict``, and the per-row predictions
    are averaged. The predictions are averaged exactly as the model returns
    them; no inverse transform is applied.

    Returns a dict with the keys ``hotel_name``, ``hotel_id``,
    ``predicted_avg_rating``, ``actual_avg_rating``, ``num_reviews`` and
    ``actual_data_sample`` (the first matching row as a dict). If the name is
    unknown or the hotel has no rows, a human-readable message string is
    returned instead, so callers must check the result type.
    """
    hotel_id = get_hotel_id_from_name(hotel_name, hotel_map)
    if hotel_id is None:
        return f"No hotel found with the name: {hotel_name}"

    rows = get_hotel_data_by_id(hotel_id, val_data)
    if rows is None or rows.empty:
        return f"No data found for hotel ID: {hotel_id}"

    # Only the target column is removed; every other column (hotel_id included)
    # is handed to the scaler and the model.
    features = rows.drop(columns=['avg_rating'])
    row_predictions = model.predict(scaler.transform(features))

    summary = {
        "hotel_name": hotel_name,
        "hotel_id": hotel_id,
        "predicted_avg_rating": row_predictions.mean(),
        "actual_avg_rating": rows['avg_rating'].mean(),
        "num_reviews": len(rows),
        # First matching row, kept as a reference sample of the input data.
        "actual_data_sample": rows.head(1).to_dict(orient='records')[0],
    }
    return summary

In [6]:
def load_scaler_from_s3(s3_bucket, scaler_path):
    """
    Download a pickled scaler object from S3 and deserialize it.

    ``s3_bucket`` is the bucket name and ``scaler_path`` the object key. The
    object body is read fully into memory and unpickled; the resulting object
    is returned.

    NOTE: unpickling runs code embedded in the payload, so the bucket and key
    must point at an artefact written by a trusted pipeline.
    """
    s3_client = boto3.client('s3')
    s3_object = s3_client.get_object(Bucket=s3_bucket, Key=scaler_path)
    payload = s3_object['Body'].read()
    return pickle.loads(payload)

# Fetch the scaler artefact stored under the given bucket and key.
scaler = load_scaler_from_s3(
    s3_bucket='andorra-hotels-data-warehouse',
    scaler_path='model_training/validation/scaler_2024-09-30_18-40-20.pkl',
)


In [12]:
# Score a single hotel by name and report predicted vs. recorded average rating.
hotel_name_input = "Hotel NH Collection Andorra Palomé"
result = predict_hotel_rating(hotel_name_input, hotel_map, df, model, scaler)

# predict_hotel_rating returns a plain message string when a lookup fails and a
# dict on success, so branch on the type before formatting.
if not isinstance(result, dict):
    print(result)
else:
    print(f"Hotel Name: {result['hotel_name']}")
    print(f"Hotel ID: {result['hotel_id']}")
    print(f"Predicted Avg Rating: {result['predicted_avg_rating']}")
    print(f"Actual Avg Rating: {result['actual_avg_rating']}")
    print(f"Number of Reviews: {result['num_reviews']}")
    print(f"Sample Data: {result['actual_data_sample']}")

Hotel Name: Hotel NH Collection Andorra Palomé
Hotel ID: 94
Predicted Avg Rating: 4.49792646418863
Actual Avg Rating: 4.5
Number of Reviews: 297
Sample Data: {'avg_rating': 4.5, 'user_ratings_total': 559, 'review_rating': 1.0, 'review_date_in_days': 1460, 'review_length': 2461, 'distance_to_ski_resort': 3.435829481394757, 'distance_to_city_center': 8.173780777905247, 'hotel_id': 94, 'region_Andorra la Vella': False, 'region_Canillo': False, 'region_Encamp': False, 'region_Escaldes-Engordany': False, 'region_La Massana': False, 'region_Ordino': True, 'region_Sant Julià de Lòria': False, 'lang_ca': False, 'lang_en': True, 'lang_es': False, 'lang_fr': False, 'lang_other': False, 'word2vec_0': -0.5558410286903381, 'word2vec_1': -1.5606504678726196, 'word2vec_2': 0.502979576587677, 'word2vec_3': 0.16289754211902618, 'word2vec_4': -0.5506742596626282, 'word2vec_5': -0.26114583015441895, 'word2vec_6': 0.05632537603378296, 'word2vec_7': 0.20414401590824127, 'word2vec_8': 0.2593328058719635, 'w