# ETL: Populate Database from CSV Files

This notebook populates the local SQLite database with data from:
- `neighbourhoods.csv` â†’ `neighbourhood` table
- `listings_cleaned.csv` â†’ `listing` table (with neighbourhood_id foreign key)

**Prerequisites:**
1. Run `sql/setup_local_db_sqlite.sh` to create the database and schema
2. Or manually run: `sqlite3 data/airbnb.db < sql/schema/01_logical_schema_sqlite.sql`

The database file will be created at `data/airbnb.db`


In [None]:
import pandas as pd
import sqlite3
import os
from pathlib import Path

# Set up paths
current_dir = Path().resolve()
project_root = current_dir if (current_dir / "data").exists() else current_dir.parent.parent

# Database file path
db_path = project_root / "data" / "airbnb.db"

# CSV file paths
# Raw inputs live under data/raw
neighbourhoods_path = project_root / "data" / "raw" / "neighbourhoods.csv"
listings_path = project_root / "data" / "processed" / "listings_cleaned.csv"
calendar_path = project_root / "data" / "raw" / "calendar.csv"

print(f"Project root: {project_root}")
print(f"\nDatabase file: {db_path} (exists: {db_path.exists()})")
print(f"\nCSV file paths:")
print(f"  Neighbourhoods: {neighbourhoods_path} (exists: {neighbourhoods_path.exists()})")
print(f"  Listings: {listings_path} (exists: {listings_path.exists()})")
print(f"  Calendar: {calendar_path} (exists: {calendar_path.exists()})")


Project root: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor

Database file: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/airbnb.db (exists: True)

CSV file paths:
  Neighbourhoods: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/neighbourhoods.csv (exists: True)
  Listings: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/processed/listings_cleaned.csv (exists: True)
  Calendar: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/calendar.csv (exists: True)


## Database Connection

Connect to the SQLite database file. The database file should be at `data/airbnb.db`.


In [12]:
# SQLite database connection
# The database file path is set above
print("SQLite Database Configuration:")
print(f"  Database file: {db_path}")
print(f"  File exists: {db_path.exists()}")

if not db_path.exists():
    print("\nâš  Warning: Database file does not exist!")
    print("Please run 'sql/setup_local_db_sqlite.sh' to create the database and schema.")
    print("Or manually run: sqlite3 data/airbnb.db < sql/schema/01_logical_schema_sqlite.sql")
else:
    print("\nâœ“ Database file found")


SQLite Database Configuration:
  Database file: /Users/anishj29/Desktop/Github Projects/Airbnb-Price-Predictor/data/airbnb.db
  File exists: True

âœ“ Database file found


In [13]:
# Test database connection
try:
    conn = sqlite3.connect(str(db_path))
    # Enable foreign key constraints
    conn.execute("PRAGMA foreign_keys = ON")
    cur = conn.cursor()
    
    # Test query - check SQLite version
    cur.execute("SELECT sqlite_version();")
    version = cur.fetchone()
    print(f"âœ“ Successfully connected to database!")
    print(f"SQLite version: {version[0]}")
    
    # Check if tables exist
    cur.execute("""
        SELECT name FROM sqlite_master 
        WHERE type='table' AND name IN ('neighbourhood', 'listing');
    """)
    tables = [row[0] for row in cur.fetchall()]
    
    expected_tables = ['neighbourhood', 'listing']
    missing_tables = [t for t in expected_tables if t not in tables]
    
    if not missing_tables:
        print(f"âœ“ All tables exist: {', '.join(tables)}")
    else:
        print(f"âš  Missing tables: {', '.join(missing_tables)}")
        print("Please run the schema creation script first.")
    
    cur.close()
    conn.close()
    
except sqlite3.Error as e:
    print(f"âœ— Error connecting to database: {e}")
    raise


âœ“ Successfully connected to database!
SQLite version: 3.50.4
âœ“ All tables exist: neighbourhood, listing, calendar


## Load and Prepare CSV Data


In [14]:
# Load neighbourhoods CSV
print("Loading neighbourhoods.csv...")
df_neighbourhoods = pd.read_csv(neighbourhoods_path)
print(f"  Loaded {len(df_neighbourhoods)} neighbourhoods")
print(f"  Columns: {list(df_neighbourhoods.columns)}")
print(f"  Sample:")
print(df_neighbourhoods.head())

# Load listings CSV
print("\nLoading listings_cleaned.csv...")
df_listings = pd.read_csv(listings_path)
print(f"  Loaded {len(df_listings)} listings")
print(f"  Columns: {list(df_listings.columns)}")
print(f"  Sample:")
print(df_listings.head())

# Load calendar CSV (sample first few rows to check structure)
print(f"  Sample rows:")
print(f"\n  Note: Calendar file is large, will be loaded in batches during insertion")


Loading neighbourhoods.csv...
  Loaded 230 neighbourhoods
  Columns: ['neighbourhood_group', 'neighbourhood']
  Sample:
  neighbourhood_group neighbourhood
0               Bronx      Allerton
1               Bronx    Baychester
2               Bronx       Belmont
3               Bronx     Bronxdale
4               Bronx   Castle Hill

Loading listings_cleaned.csv...


  Loaded 21328 listings
  Columns: ['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'availability

In [15]:
# Check data quality
print("Neighbourhoods data info:")
print(df_neighbourhoods.info())
print("\nNeighbourhoods missing values:")
print(df_neighbourhoods.isnull().sum())

print("\n" + "="*50)
print("Listings data info:")
print(df_listings.info())
print("\nListings missing values:")
print(df_listings.isnull().sum())


Neighbourhoods data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   neighbourhood_group  230 non-null    object
 1   neighbourhood        230 non-null    object
dtypes: object(2)
memory usage: 3.7+ KB
None

Neighbourhoods missing values:
neighbourhood_group    0
neighbourhood          0
dtype: int64

Listings data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21328 entries, 0 to 21327
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            21328 non-null  int64  
 1   listing_url                                   21328 non-null  object 
 2   scrape_id                                     21328 non-null  int64  
 3   last_scraped            

## Populate Neighbourhood Table

First, we populate the neighbourhood table since the listing table has a foreign key reference to it.


In [16]:
# Prepare and insert neighbourhood data
print(f"Preparing {len(df_neighbourhoods)} neighbourhoods for insertion...")

# Connect to database (always create a fresh connection for this operation)
conn = sqlite3.connect(str(db_path))
conn.execute("PRAGMA foreign_keys = ON")
cur = conn.cursor()

neighbourhood_data = []
for _, row in df_neighbourhoods.iterrows():
    neighbourhood_row = (
        str(row['neighbourhood_group']) if pd.notna(row.get('neighbourhood_group')) else None,  # borough
        str(row['neighbourhood']) if pd.notna(row.get('neighbourhood')) else None,  # neighbourhood_name
    )
    neighbourhood_data.append(neighbourhood_row)

# Insert neighbourhoods (avoid duplicates by checking first)
# Get existing neighbourhoods
cur.execute("SELECT borough, neighbourhood_name FROM neighbourhood;")
existing_neighbourhoods = set((row[0], row[1]) for row in cur.fetchall())

# Filter out duplicates
new_neighbourhood_data = []
for row in neighbourhood_data:
    if (row[0], row[1]) not in existing_neighbourhoods:
        new_neighbourhood_data.append(row)
        existing_neighbourhoods.add((row[0], row[1]))

if new_neighbourhood_data:
    insert_neighbourhood_query = """
        INSERT INTO neighbourhood (borough, neighbourhood_name)
        VALUES (?, ?);
    """
    cur.executemany(insert_neighbourhood_query, new_neighbourhood_data)
    conn.commit()
    print(f"  Inserted {len(new_neighbourhood_data)} new neighbourhoods")
else:
    print("  All neighbourhoods already exist in database")

cur.execute("SELECT COUNT(*) FROM neighbourhood;")
neighbourhood_count = cur.fetchone()[0]
print(f"âœ“ Inserted {neighbourhood_count} neighbourhoods into database")

# Keep connection open for subsequent cells (don't close here)


Preparing 230 neighbourhoods for insertion...
  All neighbourhoods already exist in database
âœ“ Inserted 230 neighbourhoods into database


## Populate Listing Table


In [17]:
# Connect to database
conn = sqlite3.connect(str(db_path))
# Enable foreign key constraints
conn.execute("PRAGMA foreign_keys = ON")
cur = conn.cursor()

print("Connected to SQLite database")


Connected to SQLite database


### Prepare and Insert Listing Data


In [18]:
# Prepare listing data from listings_cleaned.csv
# df_listings is already loaded from CSV above

print(f"Preparing {len(df_listings)} listings for insertion...")

# Connect to database (use existing connection if available, otherwise create new)
try:
    # Try to use existing connection
    if conn is None or cur is None:
        raise AttributeError
    # Test if connection is still valid
    cur.execute("SELECT 1")
except (NameError, AttributeError):
    # Create new connection if it doesn't exist or is invalid
    conn = sqlite3.connect(str(db_path))
    conn.execute("PRAGMA foreign_keys = ON")
    cur = conn.cursor()

# First, create a lookup dictionary for neighbourhood_id
# This maps (borough, neighbourhood_name) to neighbourhood_id
cur.execute("SELECT neighbourhood_id, borough, neighbourhood_name FROM neighbourhood;")
neighbourhood_lookup = {}
for row in cur.fetchall():
    key = (row[1], row[2])  # (borough, neighbourhood_name)
    neighbourhood_lookup[key] = row[0]

# Helper function to convert 't'/'f' to boolean (0/1 for SQLite)
def str_to_bool(value):
    if pd.isna(value):
        return None
    return 1 if str(value).lower() in ['t', 'true', '1', 'yes'] else 0

# Helper function to convert date string to date
def str_to_date(value):
    if pd.isna(value):
        return None
    try:
        return pd.to_datetime(value).date()
    except:
        return None

# Prepare data for insertion
# Map CSV columns to listing table columns
# Filter out listings with any null/missing values in required columns
listing_data = []

# Define required columns that must not be null
required_columns = [
    'id', 'host_id', 'host_name', 'host_since', 'host_is_superhost',
    'room_type', 'property_type', 'accommodates', 'bedrooms', 'beds',
    'bathrooms', 'bathrooms_text', 'latitude', 'longitude', 'price',
    'number_of_reviews', 'availability_365',
    'first_review', 'last_review', 'review_scores_rating', 'instant_bookable',
    'calculated_host_listings_count', 'reviews_per_month', 'estimated_revenue_l365d',
    'neighbourhood_group_cleansed', 'neighbourhood_cleansed'
]

initial_count = len(df_listings)
print(f"Initial listings: {initial_count}")

for _, row in df_listings.iterrows():
    # Check if any required column has null/missing values
    has_nulls = False
    for col in required_columns:
        if col not in row.index or pd.isna(row.get(col)):
            has_nulls = True
            break
    
    if has_nulls:
        continue  # Skip listings with null values
    
    # Look up neighbourhood_id
    borough = row.get('neighbourhood_group_cleansed')
    neighbourhood_name = row.get('neighbourhood_cleansed')
    neighbourhood_id = None
    if pd.notna(borough) and pd.notna(neighbourhood_name):
        key = (str(borough), str(neighbourhood_name))
        neighbourhood_id = neighbourhood_lookup.get(key)
    
    # Skip if neighbourhood_id lookup failed
    if neighbourhood_id is None:
        continue
    
    listing_row = (
        int(row['id']),  # listing_id (from 'id' column)
        neighbourhood_id,  # neighbourhood_id (looked up)
        int(row['host_id']),  # host_id
        str(row['host_name']),  # host_name
        str_to_date(row.get('host_since')),  # host_since
        str_to_bool(row.get('host_is_superhost')),  # host_is_superhost
        str(row['room_type']),  # room_type
        str(row['property_type']),  # property_type
        int(row['accommodates']),  # accommodates
        int(row['bedrooms']),  # bedrooms
        int(row['beds']),  # beds
        float(row['bathrooms']),  # bathrooms
        str(row['bathrooms_text']),  # bathrooms_text
        float(row['latitude']),  # latitude
        float(row['longitude']),  # longitude
        float(row['price']),  # price
        int(row['number_of_reviews']),  # number_of_reviews
        int(row['availability_365']),  # availability_365
        str_to_date(row.get('first_review')),  # first_review
        str_to_date(row.get('last_review')),  # last_review
        float(row['review_scores_rating']),  # review_scores_rating
        str_to_bool(row.get('instant_bookable')),  # instant_bookable
        int(row['calculated_host_listings_count']),  # calculated_host_listings_count
        float(row['reviews_per_month']),  # reviews_per_month
        float(row['estimated_revenue_l365d']),  # estimated_revenue
    )
    listing_data.append(listing_row)

filtered_count = initial_count - len(listing_data)
print(f"âœ“ Prepared {len(listing_data)} listings for insertion")
print(f"  Filtered out {filtered_count} listings with null/missing values")


Preparing 21328 listings for insertion...
Initial listings: 21328


âœ“ Prepared 14436 listings for insertion
  Filtered out 6892 listings with null/missing values


In [19]:
# Check existing listings count
cur.execute("SELECT COUNT(*) FROM listing;")
existing_listings = cur.fetchone()[0]
print(f"Existing listings in database: {existing_listings}")

# Insert listings
insert_listing_query = """
    INSERT INTO listing (
        listing_id, neighbourhood_id, host_id, host_name, host_since, host_is_superhost,
        room_type, property_type, accommodates, bedrooms, beds, bathrooms, bathrooms_text,
        latitude, longitude, price, number_of_reviews, availability_365,
        first_review, last_review, review_scores_rating, instant_bookable,
        calculated_host_listings_count, reviews_per_month, estimated_revenue
    )
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT (listing_id) DO UPDATE SET
        host_is_superhost = EXCLUDED.host_is_superhost,
        accommodates = EXCLUDED.accommodates,
        bedrooms = EXCLUDED.bedrooms,
        beds = EXCLUDED.beds,
        price = EXCLUDED.price,
        number_of_reviews = EXCLUDED.number_of_reviews,
        availability_365 = EXCLUDED.availability_365,
        review_scores_rating = EXCLUDED.review_scores_rating,
        estimated_revenue = EXCLUDED.estimated_revenue;
"""

# Insert in batches for better performance
batch_size = 1000
total_inserted = 0

for i in range(0, len(listing_data), batch_size):
    batch = listing_data[i:i+batch_size]
    cur.executemany(insert_listing_query, batch)
    total_inserted += len(batch)
    if (i // batch_size + 1) % 10 == 0:
        print(f"  Processed {total_inserted} listings...")

conn.commit()

cur.execute("SELECT COUNT(*) FROM listing;")
new_listings = cur.fetchone()[0]
print(f"\nâœ“ Inserted/updated {new_listings - existing_listings} listings")
print(f"Total listings in database: {new_listings}")


Existing listings in database: 14436
  Processed 10000 listings...

âœ“ Inserted/updated 0 listings
Total listings in database: 14436


  cur.executemany(insert_listing_query, batch)


## Verification

Let's verify the data was inserted correctly.


In [None]:
# Check listing table
cur.execute("SELECT COUNT(*) FROM listing;")
listing_count = cur.fetchone()[0]
print(f"Total listings in database: {listing_count}")

# Sample data
print("\nSample listings:")
cur.execute("""
    SELECT listing_id, price, accommodates, bedrooms, beds, 
           host_is_superhost, number_of_reviews, review_scores_rating, availability_365
    FROM listing 
    LIMIT 5;
""")
for row in cur.fetchall():
    print(f"  Listing ID: {row[0]}, Price: ${row[1]:.2f}, "
          f"Accommodates: {row[2]}, Bedrooms: {row[3]}, Beds: {row[4]}, "
          f"Superhost: {row[5]}, Reviews: {row[6]}, Rating: {row[7]}, Availability: {row[8]}")

# Check data quality
print("\nData quality checks:")
cur.execute("SELECT COUNT(*) FROM listing WHERE price IS NULL;")
null_price = cur.fetchone()[0]
print(f"  Listings with null price: {null_price}")

cur.execute("SELECT COUNT(*) FROM listing WHERE listing_id IS NOT NULL;")
valid_listings = cur.fetchone()[0]
print(f"  Valid listings: {valid_listings}")

cur.execute("SELECT COUNT(*) FROM listing WHERE host_is_superhost IS NOT NULL;")
has_superhost = cur.fetchone()[0]
print(f"  Listings with superhost info: {has_superhost}")

cur.execute("SELECT COUNT(*) FROM listing WHERE review_scores_rating IS NOT NULL;")
has_rating = cur.fetchone()[0]
print(f"  Listings with ratings: {has_rating}")


Total listings in database: 21328

Sample listings:
  Listing ID: 40824219, Price: $66.00, Accommodates: 1, Bedrooms: 1, Beds: 1, Superhost: True, Reviews: 16, Rating: 4.81, Availability: 77
  Listing ID: 40839416, Price: $76.00, Accommodates: 1, Bedrooms: 1, Beds: 1, Superhost: True, Reviews: 20, Rating: 4.95, Availability: 168
  Listing ID: 40843980, Price: $97.00, Accommodates: 6, Bedrooms: 2, Beds: 3, Superhost: True, Reviews: 93, Rating: 4.14, Availability: 364
  Listing ID: 40824301, Price: $60.00, Accommodates: 1, Bedrooms: 2, Beds: 1, Superhost: True, Reviews: 26, Rating: 4.92, Availability: 187
  Listing ID: 40825740, Price: $425.00, Accommodates: 6, Bedrooms: 3, Beds: 3, Superhost: False, Reviews: 1, Rating: 5.00, Availability: 224

Data quality checks:
  Listings with null price: 0
  Valid listings: 21328
  Listings with superhost info: 21019
  Listings with ratings: 14944


In [None]:
# Close database connection
cur.close()
conn.close()
print("âœ“ Database connection closed")


âœ“ Database connection closed
