In [2]:
# Library importation
import joblib
import pickle
import os
import pandas as pd

In [9]:
# Load the cleaned listings CSV into `df`; the metadata cell further down
# records len(df) as the number of training samples.
try:
    df = pd.read_csv('airbnb_boston_seattle_cleaned.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() ends the interpreter/kernel
    # session (losing all notebook state) rather than reporting a normal cell
    # failure, and it is only meant for interactive use. Raising stops
    # "Run All" at this cell with a traceback that explains what to do.
    raise FileNotFoundError(
        "Error: Run EDA notebook first "
        "('airbnb_boston_seattle_cleaned.csv' was not found)"
    ) from err
else:
    # Only reached when the file was read successfully.
    print(f"Listings data loaded: {len(df):,} records")

Listings data loaded: 9,562 records


In [3]:
# Make sure the 'models' output folder exists before anything is written to it.
# exist_ok=True means re-running this cell does not raise if it is already there.
os.makedirs(name='models', exist_ok=True)

In [4]:
# Price prediction and K-Means feature lists from the module 4 model development.
# Both lists are stored verbatim (including their order) in the model metadata.

# Inputs for the price prediction model. 'room_type' and 'city' are supplied as
# already-encoded values in the sample prediction cell later in this notebook.
price_prediction_features = [
    'accommodates',
    'bedrooms',
    'beds',
    'bathrooms',
    'minimum_nights',
    'availability_365',
    'number_of_reviews',
    'reviews_per_month',
    'review_scores_rating',
    'calculated_host_listings_count',
    'latitude',
    'longitude',
    'occupancy_rate',
    'room_type',
    'city',
    'instant_bookable',
    'host_is_superhost',
]

# Inputs for the K-Means clustering model.
clustering_features = ['recency', 'frequency', 'monetary']

In [7]:
# Feature verification: print each feature list as a 1-based numbered roster,
# preceded by a heading that shows how many entries the list holds.
print(f"\n Price prediction features ({len(price_prediction_features)}):")
for position, feature_name in enumerate(price_prediction_features, start=1):
    print(f"  {position}. {feature_name}")

print(f"\nClustering features ({len(clustering_features)}):")
for position, feature_name in enumerate(clustering_features, start=1):
    print(f"  {position}. {feature_name}")


 Price prediction features (17):
  1. accommodates
  2. bedrooms
  3. beds
  4. bathrooms
  5. minimum_nights
  6. availability_365
  7. number_of_reviews
  8. reviews_per_month
  9. review_scores_rating
  10. calculated_host_listings_count
  11. latitude
  12. longitude
  13. occupancy_rate
  14. room_type
  15. city
  16. instant_bookable
  17. host_is_superhost

Clustering features (3):
  1. recency
  2. frequency
  3. monetary


In [10]:
# Save metadata: bundle the feature lists and some descriptive fields into one
# dict and pickle it to models/model_metadata.pkl.
# NOTE(review): 'model_r2' is a hard-coded literal, not read from a fitted
# model - confirm it matches the model actually saved in models/.
# NOTE(review): 'trained_date' is the date this cell is executed, which is not
# necessarily the date the model was trained - confirm that is intended.
model_metadata = dict(
    price_prediction_features=price_prediction_features,
    clustering_features=clustering_features,
    model_version='1.0',
    trained_date=pd.Timestamp.now().strftime('%Y-%m-%d'),
    training_samples=len(df),  # row count of df; assumes df is the training data
    model_r2=0.70,  # the model's R^2 score (entered by hand)
    cities=['Boston', 'Seattle'],
    room_types=['Entire home/apt', 'Private room', 'Shared room'],
)

with open('models/model_metadata.pkl', 'wb') as metadata_file:
    pickle.dump(model_metadata, metadata_file)

print(" model_metadata.pkl")

 model_metadata.pkl


In [11]:
# Check that every expected artifact is present in the 'models' folder and
# report its size on disk in megabytes.
print("\nVerifying saved files...")
files = os.listdir('models')
expected_files = [
    'xgboost_best_model.pkl',
    'price_pred_scaler.pkl',
    'kmeans_model.pkl',
    'kmeans_scaler.pkl',
    'model_metadata.pkl',
]

for expected in expected_files:
    # Guard clause: report a missing artifact and move on to the next one.
    if expected not in files:
        print(f"  ‚ùå {expected} - NOT FOUND!")
        continue

    # getsize returns bytes; divide by 1024*1024 to express the size in MB.
    size_bytes = os.path.getsize(f'models/{expected}')
    size_mb = size_bytes / (1024*1024)
    print(f"  ‚úÖ {expected} ({size_mb:.2f} MB)")


Verifying saved files...
  ‚úÖ xgboost_best_model.pkl (0.37 MB)
  ‚úÖ price_pred_scaler.pkl (0.00 MB)
  ‚úÖ kmeans_model.pkl (0.04 MB)
  ‚úÖ kmeans_scaler.pkl (0.00 MB)
  ‚úÖ model_metadata.pkl (0.00 MB)


In [15]:
# Smoke-test deserialization: load both models, both scalers and the metadata
# dict back from the 'models' folder. The test_* names defined here are used by
# the prediction and clustering test cells that follow.
# NOTE: joblib.load / pickle.load execute code embedded in the file, so only
# load artifacts that were produced by this project.
print("\nTesting model loading...")
try:
    test_price_model = joblib.load('models/xgboost_best_model.pkl')
    test_cluster_model = joblib.load('models/kmeans_model.pkl')
    test_price_scaler = joblib.load('models/price_pred_scaler.pkl')
    test_cluster_scaler = joblib.load('models/kmeans_scaler.pkl')

    with open('models/model_metadata.pkl', 'rb') as f:
        test_metadata = pickle.load(f)

    print("  ‚úÖ All models load successfully!")
    print(f"  ‚úÖ Metadata loaded: {len(test_metadata)} keys")

except Exception as e:
    print(f"  ‚ùå Error loading models: {e}")
    # Re-raise after reporting: if loading failed, the test_* names above are
    # missing or stale, and letting execution continue would make the later
    # cells fail with an unrelated NameError (or silently test old objects).
    raise




Testing model loading...
  ‚úÖ All models load successfully!
  ‚úÖ Metadata loaded: 8 keys


In [16]:
 # Test prediction with dummy data
print("\nüéØ Testing sample prediction...")

# One hand-written listing used as a smoke-test input. 'room_type' and 'city'
# are given as already-encoded integers.
# NOTE(review): which category the code 0 stands for depends on the encoding
# used during training - confirm against the model development notebook.
sample_listing = {
    'accommodates': 4,
    'bedrooms': 2,
    'beds': 2,
    'bathrooms': 1.0,
    'minimum_nights': 2,
    'availability_365': 180,
    'number_of_reviews': 50,
    'reviews_per_month': 2.5,
    'review_scores_rating': 90.0,
    'calculated_host_listings_count': 1,
    'latitude': 42.3601,
    'longitude': -71.0589,
    'occupancy_rate': 65.0,
    'room_type': 0,  # Encoded
    'city': 0,  # Encoded
    'instant_bookable': 1,
    'host_is_superhost': 0,
}

# Wrap every value in a one-element list so the frame has exactly one row.
sample_data = pd.DataFrame(
    {column: [value] for column, value in sample_listing.items()}
)

# Scale the row with the loaded scaler, then take the single predicted price.
sample_scaled = test_price_scaler.transform(sample_data)
predicted_price = test_price_model.predict(sample_scaled)[0]

print(f"  ‚úÖ Sample prediction: ${predicted_price:.2f}/night")

# Sanity band for a nightly price: strictly between 30 and 1000.
price_is_plausible = 30 < predicted_price < 1000
if not price_is_plausible:
    print(f"  ‚ö†Ô∏è  Prediction seems unusual: ${predicted_price:.2f}")
    print("     Please verify your model and scalers")
else:
    print("  ‚úÖ Prediction is in reasonable range")


üéØ Testing sample prediction...
  ‚úÖ Sample prediction: $201.39/night
  ‚úÖ Prediction is in reasonable range


In [17]:
# Test clustering: push one hand-written recency/frequency/monetary row through
# the loaded scaler and K-Means model, then print a readable segment name.
cluster_sample = pd.DataFrame(
    {
        'recency': [100],
        'frequency': [50],
        'monetary': [10000],
    }
)

cluster_scaled = test_cluster_scaler.transform(cluster_sample)
predicted_cluster = test_cluster_model.predict(cluster_scaled)[0]

# NOTE(review): this label-to-name mapping is hard-coded here; presumably it
# mirrors the cluster interpretation from model development - confirm that it
# still matches the saved kmeans_model.pkl. Unmapped labels print 'Unknown'.
cluster_names = {0: 'Standard', 1: 'At-Risk', 2: 'Premium VIP'}
cluster_label = cluster_names.get(predicted_cluster, 'Unknown')
print(f"  ‚úÖ Sample cluster: {cluster_label}")




  ‚úÖ Sample cluster: Standard
