In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Locate the project layout relative to wherever the kernel was started.
# The notebook normally runs from notebooks/, in which case the project root
# is one level up; otherwise the working directory itself is taken as root.
current_dir = Path().resolve()
if current_dir.name == "notebooks":
    project_root = current_dir.parent
else:
    project_root = current_dir
data_dir = project_root.joinpath("data")

# Each entry is the name of a sub-folder of data_dir holding one city's files.
cities = ["NYC", "Boston", "Washington_DC"]

print(f"Current directory: {current_dir}")
print(f"Project root: {project_root}")
print(f"Data directory: {data_dir}")
print(f"\nProcessing listings for cities: {', '.join(cities)}")


Current directory: /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-Predictor/notebooks
Project root: /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-Predictor
Data directory: /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-Predictor/data

Processing listings for cities: NYC, Boston, Washington_DC


In [2]:
# Process each city's listings
all_cleaned_data = []

# Inclusive (low, high) bounds applied to optional numeric columns.
# A column that is absent from a city's CSV is simply not filtered on
# (see between_if_col). Same limits as the model notebook.
OUTLIER_BOUNDS = {
    "accommodates": (1, 10),
    "bedrooms": (0, 8),
    "beds": (0, 10),
    "bathrooms": (0, 5),
    "review_scores_rating": (1, 5),
    "availability_365": (0, 365),
    "reviews_per_month": (0, 20),
}


def between_if_col(df_local, col, low, high):
    """Return a boolean row mask for ``low <= df_local[col] <= high``.

    Parameters
    ----------
    df_local : pd.DataFrame
        Frame to build the mask for.
    col : str
        Column to test; it may be missing from ``df_local``.
    low, high
        Inclusive lower and upper bounds.

    Returns
    -------
    pd.Series
        Boolean Series aligned to ``df_local.index``. If ``col`` is not a
        column of ``df_local`` the mask is all True, so a missing column
        never removes rows. When the column exists, rows whose value is NaN
        evaluate to False and are therefore filtered out.
    """
    if col in df_local.columns:
        return df_local[col].between(low, high)
    return pd.Series(True, index=df_local.index)


for city in cities:
    city_dir = data_dir / city
    listings_path = city_dir / "listings.csv"

    if not listings_path.exists():
        print(f"⚠ Skipping {city}: {listings_path} not found")
        continue

    print(f"\n{'='*60}")
    print(f"Processing {city}...")
    print(f"{'='*60}")

    # Read the listings CSV
    df = pd.read_csv(listings_path)
    print(f"Original shape: {df.shape}")

    # -------------------------
    # Basic price cleaning
    # -------------------------
    # Work on a copy so the raw frame `df` stays untouched.
    df_clean = df.copy()

    # Remove $ and commas from price, convert to numeric.
    # Anything that still is not a number (including missing prices, which
    # astype(str) renders as "nan") is coerced to NaN.
    price_str = (
        df_clean["price"]
          .astype(str)
          .str.replace("$", "", regex=False)
          .str.replace(",", "", regex=False)
          .str.strip()
    )
    df_clean["price"] = pd.to_numeric(price_str, errors="coerce")

    # Remove rows with missing or invalid prices.
    # The explicit .copy() makes df_clean an independent frame, so the column
    # assignments further down do not raise SettingWithCopyWarning.
    original_count = len(df_clean)
    has_valid_price = df_clean["price"].notna() & (df_clean["price"] > 0)
    df_clean = df_clean[has_valid_price].copy()

    print(f"After price cleaning shape: {df_clean.shape}")
    print(f"Removed {original_count - len(df_clean)} rows with invalid prices")

    # -------------------------
    # Extra row-level cleaning
    # (same logic as model notebook)
    # -------------------------
    rows_before = len(df_clean)

    # If host_since + last_review exist, compute host_years here.
    # The reference date is the most recent last_review in this city's data;
    # host_years is the time from host_since to that date, in years.
    if "host_since" in df_clean.columns and "last_review" in df_clean.columns:
        df_clean["host_since_dt"] = pd.to_datetime(df_clean["host_since"], errors="coerce")
        df_clean["last_review_dt"] = pd.to_datetime(df_clean["last_review"], errors="coerce")
        ref_date = df_clean["last_review_dt"].max()
        df_clean["host_years"] = (ref_date - df_clean["host_since_dt"]).dt.days / 365.25

    # Base mask: price range plus every optional column bound.
    # NaN values fail .between(), so rows missing any of these fields are
    # dropped together with the out-of-range ones.
    mask = df_clean["price"].between(10, 1000)
    for col, (low, high) in OUTLIER_BOUNDS.items():
        mask &= between_if_col(df_clean, col, low, high)

    # Add host_years constraint if we have it. When no row has a usable value
    # (e.g. no parseable last_review, so ref_date is NaT) the constraint would
    # reject every row, so it is skipped with a warning instead.
    if "host_years" in df_clean.columns:
        if df_clean["host_years"].notna().any():
            mask &= df_clean["host_years"].between(0, 20)
        else:
            print(f"⚠ [{city}] no usable host_years values; skipping host_years filter")

    # Copy again so later cells receive an independent frame.
    df_clean = df_clean[mask].copy()

    print(f"[{city}] rows before outlier filters: {rows_before}")
    print(f"[{city}] rows after outlier filters:  {len(df_clean)}")

    # Store cleaned data with city identifier
    all_cleaned_data.append((city, df_clean))

print(f"\n{'='*60}")
print(f"Summary: Processed {len(all_cleaned_data)} cities")
print(f"{'='*60}")


Processing NYC...
Original shape: (36111, 79)
After price cleaning shape: (21328, 79)
Removed 14783 rows with invalid prices
[NYC] rows before outlier filters: 21328
[NYC] rows after outlier filters:  14382

Processing Boston...
Original shape: (4419, 79)
After price cleaning shape: (3506, 79)
Removed 913 rows with invalid prices
[Boston] rows before outlier filters: 3506
[Boston] rows after outlier filters:  2669

Processing Washington_DC...
Original shape: (6423, 79)
After price cleaning shape: (4846, 79)
Removed 1577 rows with invalid prices
[Washington_DC] rows before outlier filters: 4846
[Washington_DC] rows after outlier filters:  4075

Summary: Processed 3 cities


In [3]:
# Save cleaned CSVs for each city
processed_dir = data_dir / "processed"
# parents=True: also create data_dir itself if it does not exist yet, rather
# than failing with FileNotFoundError.
processed_dir.mkdir(parents=True, exist_ok=True)

saved_files = []

for city, df_clean in all_cleaned_data:
    # Save to processed folder with city name
    out_path = processed_dir / f"{city}_listings_cleaned.csv"
    df_clean.to_csv(out_path, index=False)
    saved_files.append(str(out_path))
    print(f"✓ {city}: Saved {len(df_clean)} listings to {out_path}")

# Only report success when something was actually written.
if saved_files:
    print("\n✓ All cleaned data saved!")
    print("\nSaved files:")
    for f in saved_files:
        print(f"  - {f}")
else:
    print("\n⚠ No cleaned data to save (no city listings were processed)")


✓ NYC: Saved 14382 listings to /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-Predictor/data/processed/NYC_listings_cleaned.csv
✓ Boston: Saved 2669 listings to /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-Predictor/data/processed/Boston_listings_cleaned.csv
✓ Washington_DC: Saved 4075 listings to /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-Predictor/data/processed/Washington_DC_listings_cleaned.csv

✓ All cleaned data saved!

Saved files:
  - /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-Predictor/data/processed/NYC_listings_cleaned.csv
  - /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-Predictor/data/processed/Boston_listings_cleaned.csv
  - /Users/pranavbathula/Library/CloudStorage/Box-Box/DATA MANAGEMENT FOR DATA SCIENCE/Airbnb-Price-P