# Database Viewer

This notebook provides an organized way to view and explore the SQLite database.


In [47]:
import pandas as pd
import numpy as np
from pathlib import Path

current_dir = Path().resolve()

# Find project_root by looking for a "data" folder upwards
if (current_dir / "data").exists():
    project_root = current_dir
elif (current_dir.parent / "data").exists():
    project_root = current_dir.parent
elif (current_dir.parents[1] / "data").exists():
    project_root = current_dir.parents[1]
else:
    # fallback: assume grandparent is project root
    project_root = current_dir.parents[1]

data_dir = project_root / "data"
processed_dir = data_dir / "processed"

print("Project root:", project_root)
print("Data dir:    ", data_dir)
print("Processed dir:", processed_dir)

Project root: /Users/yg25/CSPROJECTS/Airbnb-Price-Predictor
Data dir:     /Users/yg25/CSPROJECTS/Airbnb-Price-Predictor/data
Processed dir: /Users/yg25/CSPROJECTS/Airbnb-Price-Predictor/data/processed


In [48]:
# city counts already from DB: df_city_counts

city_map = {
    "Boston":       {"folder": "Boston",        "processed": "Boston_listings_cleaned.csv"},
    "NYC":          {"folder": "NYC",           "processed": "NYC_listings_cleaned.csv"},
    "Washington DC":{"folder": "Washington_DC", "processed": "Washington_DC_listings_cleaned.csv"},
}

rows_raw = {}
rows_processed = {}

for city, info in city_map.items():
    raw_path = data_dir / info["folder"] / "listings.csv"
    proc_path = processed_dir / info["processed"]
    
    # raw
    if raw_path.exists():
        df_raw = pd.read_csv(raw_path)
        rows_raw[city] = len(df_raw)
    else:
        rows_raw[city] = None
        print(f"âš  Raw file missing for {city}: {raw_path}")
    
    # processed
    if proc_path.exists():
        df_proc = pd.read_csv(proc_path)
        rows_processed[city] = len(df_proc)
    else:
        rows_processed[city] = None
        print(f"âš  Processed file missing for {city}: {proc_path}")

summary = df_city_counts.copy()
summary = summary.rename(columns={"n_listings": "rows_in_db"})

summary["rows_raw_csv"] = summary["city"].map(rows_raw)
summary["rows_processed_csv"] = summary["city"].map(rows_processed)

summary["lost_in_cleaning"] = summary["rows_raw_csv"] - summary["rows_processed_csv"]
summary["lost_loading_to_db"] = summary["rows_processed_csv"] - summary["rows_in_db"]

summary

Unnamed: 0,city,rows_in_db,rows_raw_csv,rows_processed_csv,lost_in_cleaning,lost_loading_to_db
0,Boston,2524,4419,2670,1749,146
1,NYC,14122,36111,14382,21729,260
2,Washington DC,3985,6423,4075,2348,90


In [49]:
import sqlite3
import pandas as pd
from pathlib import Path

# Set up database path
# This notebook is in sql/view/, so we need to go up two levels to get to project root
current_dir = Path().resolve()

# Try different possible locations
if (current_dir / "data").exists():
    # Running from project root
    project_root = current_dir
elif (current_dir.parent / "data").exists():
    # Running from sql/ directory
    project_root = current_dir.parent
elif (current_dir.parent.parent / "data").exists():
    # Running from sql/view/ directory
    project_root = current_dir.parent.parent
else:
    # Fallback: go up two levels from sql/view/
    project_root = current_dir.parent.parent

db_path = project_root / "data" / "airbnb.db"

print(f"Current directory: {current_dir}")
print(f"Project root: {project_root}")
print(f"Database: {db_path}")
print(f"Exists: {db_path.exists()}")


Current directory: /Users/yg25/CSPROJECTS/Airbnb-Price-Predictor/sql/view
Project root: /Users/yg25/CSPROJECTS/Airbnb-Price-Predictor
Database: /Users/yg25/CSPROJECTS/Airbnb-Price-Predictor/data/airbnb.db
Exists: True


## Database Overview


In [50]:
conn = sqlite3.connect(str(db_path))

# Get all tables
tables_query = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;"
tables = pd.read_sql_query(tables_query, conn)
# Filter out system tables and calendar
tables = tables[~tables['name'].isin(['calendar', 'sqlite_sequence'])]
print("ðŸ“Š Tables in database:")
print(tables)

# Get row counts for each table
print("\nðŸ“ˆ Row counts:")
for table in tables['name']:
    count = pd.read_sql_query(f"SELECT COUNT(*) as count FROM {table}", conn)
    print(f"  {table}: {count['count'].iloc[0]:,} rows")


ðŸ“Š Tables in database:
            name
0        listing
1  neighbourhood

ðŸ“ˆ Row counts:
  listing: 20,631 rows
  neighbourhood: 295 rows


## View Neighbourhood Table


In [51]:
df_neighbourhoods = pd.read_sql_query("SELECT * FROM neighbourhood ORDER BY borough, neighbourhood_name", conn)
print(f"Total neighbourhoods: {len(df_neighbourhoods)}")
print(f"\nBoroughs: {df_neighbourhoods['borough'].unique()}")
df_neighbourhoods.head(20)


Total neighbourhoods: 295

Boroughs: [None 'Bronx' 'Brooklyn' 'Manhattan' 'Queens' 'Staten Island']


Unnamed: 0,neighbourhood_id,borough,neighbourhood_name
0,231,,Allston
1,232,,Back Bay
2,233,,Bay Village
3,234,,Beacon Hill
4,235,,Brighton
5,257,,"Brightwood Park, Crestwood, Petworth"
6,258,,"Brookland, Brentwood, Langdon"
7,259,,"Capitol Hill, Lincoln Park"
8,260,,"Capitol View, Marshall Heights, Benning Heights"
9,261,,"Cathedral Heights, McLean Gardens, Glover Park"


## View Listing Table


In [52]:
# Sample listings with neighbourhood info
df_listings = pd.read_sql_query("""
    SELECT *
    FROM listing l
    LEFT JOIN neighbourhood n ON l.neighbourhood_id = n.neighbourhood_id
    LIMIT 30
""", conn)
df_listings


Unnamed: 0,listing_id,neighbourhood_id,city,host_id,host_name,host_since,host_is_superhost,room_type,property_type,accommodates,...,estimated_revenue,first_review,last_review,review_scores_rating,instant_bookable,calculated_host_listings_count,reviews_per_month,neighbourhood_id.1,borough,neighbourhood_name
0,2595,115,NYC,2845,Jennifer,2008-09-09,0,Entire home/apt,Entire rental unit,1,...,0.0,2009-11-21,2022-06-21,4.68,0,3,0.24,115,Manhattan,Midtown
1,3344,268,Washington DC,4957,A.J.,2008-12-10,0,Entire home/apt,Entire condo,2,...,0.0,2009-05-09,2016-08-31,5.0,0,2,0.05,268,,"Downtown, Chinatown, Penn Quarters, Mount Vern..."
2,3686,276,Washington DC,4645,Vita,2008-11-26,0,Private room,Private room in home,1,...,0.0,2010-11-01,2023-08-30,4.64,0,1,0.47,276,,Historic Anacostia
3,3781,240,Boston,4804,Frank,2008-12-03,1,Entire home/apt,Entire rental unit,2,...,0.0,2015-07-10,2024-08-09,4.96,0,1,0.21,240,,East Boston
4,3943,271,Washington DC,5059,Vasa,2008-12-12,1,Private room,Private room in townhouse,2,...,19434.0,2009-05-10,2025-05-27,4.86,0,5,2.78,271,,"Edgewood, Bloomingdale, Truxton Circle, Eckington"
5,4197,259,Washington DC,5061,Sandra,2008-12-12,1,Private room,Private room in home,1,...,9216.0,2009-05-14,2025-05-20,4.88,0,2,0.34,259,,"Capitol Hill, Lincoln Park"
6,5506,251,Boston,8229,Terry,2009-02-19,1,Entire home/apt,Entire guest suite,2,...,6966.0,2009-03-21,2025-07-28,4.82,0,11,0.69,251,,Roxbury
7,5589,279,Washington DC,6527,Ami,2009-01-13,0,Entire home/apt,Entire rental unit,3,...,0.0,2009-09-22,2023-08-17,4.5,0,1,0.5,279,,"Kalorama Heights, Adams Morgan, Lanier Heights"
8,6695,251,Boston,8229,Terry,2009-02-19,1,Entire home/apt,Entire condo,4,...,8064.0,2009-08-06,2025-07-29,4.81,0,11,0.72,251,,Roxbury
9,6848,96,NYC,15991,Allen,2009-05-06,1,Entire home/apt,Entire rental unit,3,...,17280.0,2009-05-25,2025-06-09,4.59,0,1,0.98,96,Brooklyn,Williamsburg


In [54]:
# Statistics
stats = pd.read_sql_query("""
    SELECT 
        COUNT(*) as total_listings,
        ROUND(AVG(price), 2) as avg_price,
        MIN(price) as min_price,
        MAX(price) as max_price,
        ROUND(AVG(accommodates), 2) as avg_accommodates,
        ROUND(AVG(review_scores_rating), 2) as avg_rating
    FROM listing
""", conn)
print("ðŸ“Š Listing Statistics:")
stats


ðŸ“Š Listing Statistics:


Unnamed: 0,total_listings,avg_price,min_price,max_price,avg_accommodates,avg_rating
0,20631,184.85,10.0,1000.0,2.99,4.74


## Custom Queries

Add your own queries here to explore the data.


In [55]:
# Example: Listings by borough
query = """
    SELECT 
        n.borough,
        COUNT(*) as listing_count,
        ROUND(AVG(l.price), 2) as avg_price,
        ROUND(AVG(l.review_scores_rating), 2) as avg_rating
    FROM listing l
    LEFT JOIN neighbourhood n ON l.neighbourhood_id = n.neighbourhood_id
    GROUP BY n.borough
    ORDER BY listing_count DESC
"""
df_borough_stats = pd.read_sql_query(query, conn)
df_borough_stats


Unnamed: 0,borough,listing_count,avg_price,avg_rating
0,,6509,183.62,4.76
1,Manhattan,5501,237.11,4.69
2,Brooklyn,5272,168.6,4.77
3,Queens,2509,132.56,4.76
4,Bronx,604,110.07,4.74
5,Staten Island,236,110.64,4.8


In [56]:
conn.close()
print("âœ“ Connection closed")


âœ“ Connection closed
