In [19]:
# Library importation
import os
import pickle
import shutil
from datetime import datetime

import joblib
import pandas as pd

In [9]:
# Load the cleaned listings CSV that the EDA notebook is expected to produce.
listings_csv = 'airbnb_boston_seattle_cleaned.csv'
try:
    df = pd.read_csv(listings_csv)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() ends the
    # kernel session (discarding all state), whereas an exception stops
    # "Run All" at this cell and keeps the kernel available for debugging.
    raise FileNotFoundError(
        f"Error: '{listings_csv}' not found. Run EDA notebook first"
    ) from err

print(f"Listings data loaded: {len(df):,} records")

Listings data loaded: 9,562 records


In [3]:
# Make sure the folder that holds the serialized artifacts exists
# (exist_ok=True: no error when it is already there).
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

In [4]:
# Feature lists for the price-prediction and K-Means models, carried over
# from the Module 4 model-development notebook.

# Columns fed to the price-prediction model, in this order.
price_prediction_features = [
    'accommodates',
    'bedrooms',
    'beds',
    'bathrooms',
    'minimum_nights',
    'availability_365',
    'number_of_reviews',
    'reviews_per_month',
    'review_scores_rating',
    'calculated_host_listings_count',
    'latitude',
    'longitude',
    'occupancy_rate',
    'room_type',
    'city',
    'instant_bookable',
    'host_is_superhost',
]

# Columns fed to the clustering model, in this order.
clustering_features = ['recency', 'frequency', 'monetary']

In [7]:
# Feature verification: print each feature list under its heading,
# numbering the entries from 1.
for heading, feature_list in (
    (f"\n Price prediction features ({len(price_prediction_features)}):",
     price_prediction_features),
    (f"\nClustering features ({len(clustering_features)}):",
     clustering_features),
):
    print(heading)
    for position, feature_name in enumerate(feature_list, 1):
        print(f"  {position}. {feature_name}")


 Price prediction features (17):
  1. accommodates
  2. bedrooms
  3. beds
  4. bathrooms
  5. minimum_nights
  6. availability_365
  7. number_of_reviews
  8. reviews_per_month
  9. review_scores_rating
  10. calculated_host_listings_count
  11. latitude
  12. longitude
  13. occupancy_rate
  14. room_type
  15. city
  16. instant_bookable
  17. host_is_superhost

Clustering features (3):
  1. recency
  2. frequency
  3. monetary


In [20]:
# Build the metadata dictionary that is pickled next to the model artifacts.

# NOTE(review): this is the date this cell is executed, stored under
# 'trained_date' - confirm that is the intended meaning.
run_date = datetime.now().strftime('%Y-%m-%d')

# Row count of the loaded listings frame; a fixed count is used when `df`
# is not defined in this namespace.
if 'df' in locals():
    n_training_samples = len(df)
else:
    n_training_samples = 9562

model_metadata = {
    # Feature lists defined earlier in the notebook
    'price_prediction_features': price_prediction_features,
    'clustering_features': clustering_features,
    # Model identification
    'model_version': '1.0',
    'model_type': 'XGBoost Regressor',
    'clustering_algorithm': 'K-Means',
    'trained_date': run_date,
    'training_samples': n_training_samples,
    # Hard-coded evaluation figures
    'model_r2_score': 0.70,
    'model_rmse': 72.11,
    # Category values and cluster labels
    'cities': ['Boston', 'Seattle'],
    'room_types': ['Entire home/apt', 'Private room', 'Shared room'],
    'clusters': {
        0: 'Standard Segment (74.1%)',
        1: 'At-Risk Segment (14.8%)',
        2: 'Premium VIP Segment (11.1%)',
    },
    # Preprocessing descriptors
    'encoding_type': 'LabelEncoder',
    'feature_scaling': 'StandardScaler',
    'notes': 'Model trained on Boston and Seattle Airbnb listings data',
}

# Persist the dictionary with pickle
with open('models/model_metadata.pkl', 'wb') as metadata_file:
    pickle.dump(model_metadata, metadata_file)

print("  ‚úÖ model_metadata.pkl created")
print(f"     - {len(price_prediction_features)} price prediction features")
print(f"     - {len(clustering_features)} clustering features")

  ‚úÖ model_metadata.pkl created
     - 17 price prediction features
     - 3 clustering features


In [11]:
print("\nVerifying saved files...")

# Names currently present in the models folder
files = os.listdir('models')

# Artifacts this check looks for
expected_files = [
    'xgboost_best_model.pkl',
    'price_pred_scaler.pkl',
    'kmeans_model.pkl',
    'kmeans_scaler.pkl',
    'model_metadata.pkl',
]

for artifact_name in expected_files:
    if artifact_name not in files:
        print(f"  ‚ùå {artifact_name} - NOT FOUND!")
        continue
    # Report the on-disk size converted from bytes to MB
    size_bytes = os.path.getsize(f'models/{artifact_name}')
    size_mb = size_bytes / (1024*1024)
    print(f"  ‚úÖ {artifact_name} ({size_mb:.2f} MB)")


Verifying saved files...
  ‚úÖ xgboost_best_model.pkl (0.37 MB)
  ‚úÖ price_pred_scaler.pkl (0.00 MB)
  ‚úÖ kmeans_model.pkl (0.04 MB)
  ‚úÖ kmeans_scaler.pkl (0.00 MB)
  ‚úÖ model_metadata.pkl (0.00 MB)


In [15]:
print("\nTesting model loading...")

# Reload every saved artifact; any failure is reported by the except branch
# below rather than raised.
try:
    # Models and scalers are read back with joblib
    test_price_model = joblib.load('models/xgboost_best_model.pkl')
    test_cluster_model = joblib.load('models/kmeans_model.pkl')
    test_price_scaler = joblib.load('models/price_pred_scaler.pkl')
    test_cluster_scaler = joblib.load('models/kmeans_scaler.pkl')

    # The metadata dictionary is read back with pickle
    with open('models/model_metadata.pkl', 'rb') as metadata_file:
        test_metadata = pickle.load(metadata_file)

    print("  ‚úÖ All models load successfully!")
    print(f"  ‚úÖ Metadata loaded: {len(test_metadata)} keys")

except Exception as load_error:
    print(f"  ‚ùå Error loading models: {load_error}")




Testing model loading...
  ‚úÖ All models load successfully!
  ‚úÖ Metadata loaded: 8 keys


In [None]:
 # Test prediction with dummy data
print("\nTesting sample prediction...")

# One hand-written listing, with keys in the same order as the
# price-prediction feature list
sample_listing = {
    'accommodates': 4,
    'bedrooms': 2,
    'beds': 2,
    'bathrooms': 1.0,
    'minimum_nights': 2,
    'availability_365': 180,
    'number_of_reviews': 50,
    'reviews_per_month': 2.5,
    'review_scores_rating': 90.0,
    'calculated_host_listings_count': 1,
    'latitude': 42.3601,
    'longitude': -71.0589,
    'occupancy_rate': 65.0,
    'room_type': 0,  # Encoded
    'city': 0,  # Encoded
    'instant_bookable': 1,
    'host_is_superhost': 0,
}
sample_data = pd.DataFrame([sample_listing])

# Apply the reloaded scaler, then take the single prediction
sample_scaled = test_price_scaler.transform(sample_data)
price_predictions = test_price_model.predict(sample_scaled)
predicted_price = price_predictions[0]

print(f"  ‚úÖ Sample prediction: ${predicted_price:.2f}/night")

# Sanity bounds (exclusive) for a nightly price
lowest_plausible, highest_plausible = 30, 1000
price_is_plausible = lowest_plausible < predicted_price < highest_plausible

if price_is_plausible:
    print("  ‚úÖ Prediction is in reasonable range")
else:
    print(f"  ‚ö†Ô∏è  Prediction seems unusual: ${predicted_price:.2f}")
    print("     Please verify your model and scalers")


üéØ Testing sample prediction...
  ‚úÖ Sample prediction: $201.39/night
  ‚úÖ Prediction is in reasonable range


In [17]:
# Clustering smoke test: one hand-written row of the three clustering features
cluster_row = {'recency': 100, 'frequency': 50, 'monetary': 10000}
cluster_sample = pd.DataFrame([cluster_row])

# Apply the reloaded scaler, then take the single predicted label
cluster_scaled = test_cluster_scaler.transform(cluster_sample)
cluster_predictions = test_cluster_model.predict(cluster_scaled)
predicted_cluster = cluster_predictions[0]

# Display name per cluster label; any other label prints as 'Unknown'
cluster_names = {
    0: 'Standard',
    1: 'At-Risk',
    2: 'Premium VIP',
}
print(f"  ‚úÖ Sample cluster: {cluster_names.get(predicted_cluster, 'Unknown')}")




  ‚úÖ Sample cluster: Standard


In [18]:
# Creating additional directories (existing ones are left untouched)
for directory in ('models', 'airbnb-pricing-app', 'airbnb-pricing-app/models'):
    os.makedirs(directory, exist_ok=True)

In [21]:
# Artifacts to copy from models/ into the app's own models/ folder.
required_files = [
    'xgboost_best_model.pkl',
    'kmeans_model.pkl',
    'price_pred_scaler.pkl',
    'kmeans_scaler.pkl',
    'label_encoders.pkl',
    'model_metadata.pkl'
]


# Copy files to their directories

print("\nüì¶ Copying files to app directory...")

# Track both outcomes so the summary reflects what actually happened:
# a file absent from models/ must not be reported as copied.
copied_files = []
missing_files = []

try:
    for filename in required_files:
        src = f'models/{filename}'
        dst = f'airbnb-pricing-app/models/{filename}'
        if os.path.exists(src):
            # copy2 copies the file contents together with its metadata
            shutil.copy2(src, dst)
            copied_files.append(filename)
        else:
            missing_files.append(filename)
except Exception as e:
    print(f"  ‚ö†Ô∏è  Error copying files: {e}")
else:
    if missing_files:
        print(
            f"  ‚ö†Ô∏è  Copied {len(copied_files)}/{len(required_files)} files; "
            f"not found in models/: {', '.join(missing_files)}"
        )
    else:
        print("  ‚úÖ All files copied to airbnb-pricing-app/models/")


üì¶ Copying files to app directory...
  ‚úÖ All files copied to airbnb-pricing-app/models/


In [None]:
print("\nCreating requirements.txt...")

# Pinned dependencies, one requirement per line in the generated file
pinned_packages = [
    "streamlit==1.31.0",
    "pandas==2.1.4",
    "numpy==1.26.3",
    "scikit-learn==1.4.0",
    "xgboost==2.0.3",
    "joblib==1.3.2",
    "plotly==5.18.0",
]
requirements = "\n".join(pinned_packages) + "\n"

with open('airbnb-pricing-app/requirements.txt', 'w') as requirements_file:
    requirements_file.write(requirements)

print("  ‚úÖ requirements.txt created")


üìù Creating requirements.txt...
  ‚úÖ requirements.txt created


In [None]:
print("\nCreating .gitignore...")

# (section title, ignore patterns) pairs; each becomes a "# title" comment
# followed by its patterns, with a blank line between sections.
gitignore_sections = [
    ("Python", [
        "__pycache__/",
        "*.py[cod]",
        "*$py.class",
        "*.so",
        ".Python",
        "env/",
        "venv/",
        ".venv",
    ]),
    ("Jupyter", [
        ".ipynb_checkpoints/",
        "*.ipynb",
    ]),
    ("Data files (if large)", [
        "*.csv",
        "*.xlsx",
        "*.parquet",
    ]),
    ("IDE", [
        ".vscode/",
        ".idea/",
        "*.swp",
        "*.swo",
    ]),
    ("OS", [
        ".DS_Store",
        "Thumbs.db",
    ]),
    ("Streamlit", [
        ".streamlit/secrets.toml",
    ]),
]

gitignore_content = "\n\n".join(
    "\n".join([f"# {title}", *patterns])
    for title, patterns in gitignore_sections
) + "\n"

with open('airbnb-pricing-app/.gitignore', 'w') as gitignore_file:
    gitignore_file.write(gitignore_content)

print("  ‚úÖ .gitignore created")


üö´ Creating .gitignore...
  ‚úÖ .gitignore created


print("\nCreating README.md...")

# README text for the app repository.
# - The artifact names in "Project Structure" match the files this notebook
#   copies into airbnb-pricing-app/models/.
# - The 0.70 figure is reported as an R-squared score (share of price
#   variance explained), not as a classification-style "accuracy".
readme_content = """# Airbnb Pricing Optimizer

AI-powered pricing optimization and customer segmentation tool for Airbnb hosts.

## Features

- **Price Prediction**: XGBoost regression model explaining about 70% of price variance
- **Customer Segmentation**: K-Means clustering (Standard, At-Risk, Premium VIP)
- **Market Analysis**: Compare your pricing against competitors
- **Actionable Insights**: Personalized recommendations to maximize revenue

## Model Performance

- **Model**: XGBoost Regressor
- **R¬≤ Score**: 0.70
- **RMSE**: $72.11
- **Training Data**: 9,562 listings from Boston & Seattle
- **ROI**: 12,400%

## Quick Start

### Local Development

1. Clone the repository
```bash
git clone https://github.com/YOUR_USERNAME/airbnb-pricing-app.git
cd airbnb-pricing-app
```

2. Install dependencies
```bash
pip install -r requirements.txt
```

3. Run the app
```bash
streamlit run app.py
```

4. Open your browser to `http://localhost:8501`

### Deployment (Streamlit Cloud)

1. Push this repo to GitHub
2. Go to [share.streamlit.io](https://share.streamlit.io)
3. Connect your GitHub account
4. Select this repository
5. Deploy!

##  Project Structure

```
airbnb-pricing-app/
‚îú‚îÄ‚îÄ app.py                          # Main Streamlit application
‚îú‚îÄ‚îÄ requirements.txt                # Python dependencies
‚îú‚îÄ‚îÄ models/
‚îÇ   ‚îú‚îÄ‚îÄ xgboost_best_model.pkl     # Trained pricing model
‚îÇ   ‚îú‚îÄ‚îÄ kmeans_model.pkl           # Clustering model
‚îÇ   ‚îú‚îÄ‚îÄ price_pred_scaler.pkl
‚îÇ   ‚îú‚îÄ‚îÄ kmeans_scaler.pkl
‚îÇ   ‚îú‚îÄ‚îÄ label_encoders.pkl
‚îÇ   ‚îî‚îÄ‚îÄ model_metadata.pkl
‚îî‚îÄ‚îÄ README.md
```

## Model Features

### Price Prediction (17 features):
- accommodates, bedrooms, beds, bathrooms
- minimum_nights, availability_365
- number_of_reviews, reviews_per_month, review_scores_rating
- calculated_host_listings_count
- latitude, longitude, occupancy_rate
- room_type, city, instant_bookable, host_is_superhost

### Clustering (3 features):
- Recency (days since last booking)
- Frequency (total bookings)
- Monetary (lifetime value)

## Customer Segments

1. **Standard Segment (74.1%)** - Moderate-value properties
2. **At-Risk Segment (14.8%)** - Needs attention
3. **Premium VIP (11.1%)** - High-value properties (52.4% of revenue!)

## Author

**Sodiq Otunba**
- Project: Airbnb Customer Segmentation & Pricing Optimization
- Duration: 14 weeks (Nov 2025 - Feb 2026)
- LearnerID: 154046

## License

This project is for educational and demonstration purposes.

## Acknowledgments

- Inside Airbnb for providing public dataset
- Streamlit for the amazing framework
- XGBoost and scikit-learn teams
"""

# The README contains non-ASCII characters, so the encoding is stated
# explicitly instead of relying on the platform's default text encoding.
with open('airbnb-pricing-app/README.md', 'w', encoding='utf-8') as f:
    f.write(readme_content)

print("  README.md created")