# ETL: Populate Listing Table from listing_features.csv

This notebook connects to a Neon database and populates the `listing` table with data from `listing_features.csv`.


In [49]:
import pandas as pd
import psycopg2
from psycopg2.extras import execute_values
import os
from pathlib import Path
from dotenv import load_dotenv

# Locate and load the .env file that holds the database credentials.
# The notebook may be started from the etl/ folder or from the project root,
# so both candidate locations are checked in that order.
current_dir = Path().resolve()
possible_env_paths = [
    current_dir / ".env",  # notebook started inside the etl/ folder
    current_dir / "sql" / "etl" / ".env",  # notebook started at the project root
]

# First candidate that exists on disk wins; None when nothing was found.
env_path = next(
    (candidate for candidate in possible_env_paths if candidate.exists()),
    None,
)

if env_path is not None:
    load_dotenv(env_path)
    print(f"✓ Loaded .env file from: {env_path}")
else:
    print("⚠ .env file not found. Tried:")
    for candidate in possible_env_paths:
        print(f"  - {candidate}")

# Resolve the project root: either the current directory already contains
# data/, or we are two levels below the root.
if (current_dir / "data").exists():
    project_root = current_dir
else:
    project_root = current_dir.parent.parent

csv_path = project_root / "data" / "processed" / "listing_features.csv"

print(f"CSV file path: {csv_path}")
print(f"CSV file exists: {csv_path.exists()}")


✓ Loaded .env file from: /Users/anishj29/Airbnb-Price-Predictor/sql/etl/.env
CSV file path: /Users/anishj29/Airbnb-Price-Predictor/data/processed/listing_features.csv
CSV file exists: True


## Database Connection

The connection string is read from the `NEON_CONNECTION_STRING` environment variable, which is loaded from the `.env` file in this directory.


In [50]:
# Read the Neon connection string from the environment and stop immediately,
# with setup instructions, when it is missing or empty.
CONNECTION_STRING = os.getenv('NEON_CONNECTION_STRING')

if CONNECTION_STRING:
    print("✓ Connection string loaded from .env file")
else:
    missing_help = (
        "⚠ Warning: NEON_CONNECTION_STRING not found in .env file",
        "Please add NEON_CONNECTION_STRING to the .env file in this directory",
        "\nExample .env file content:",
        "NEON_CONNECTION_STRING=postgresql://username:password@hostname/database?sslmode=require",
    )
    for help_line in missing_help:
        print(help_line)
    raise ValueError("NEON_CONNECTION_STRING is required. Please add it to the .env file.")


✓ Connection string loaded from .env file


In [51]:
# Smoke-test the database connection before doing any real work.
# The connection and cursor opened here exist only for this check and are
# always released, even when one of the probe queries fails part-way through.
try:
    conn = psycopg2.connect(CONNECTION_STRING)
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cur:
            # Test query
            cur.execute("SELECT version();")
            version = cur.fetchone()
            print("✓ Successfully connected to database!")
            print(f"PostgreSQL version: {version[0]}")

            # Check if schema exists
            cur.execute("""
                SELECT schema_name 
                FROM information_schema.schemata 
                WHERE schema_name = 'airbnb';
            """)
            schema_exists = cur.fetchone()

        if schema_exists:
            print("✓ 'airbnb' schema exists")
        else:
            print("⚠ 'airbnb' schema does not exist. Please run the schema creation script first.")
    finally:
        # `with conn:` would only end the transaction, not close the
        # connection, so the connection is closed explicitly.
        conn.close()

except psycopg2.Error as e:
    print(f"✗ Error connecting to database: {e}")
    raise


✓ Successfully connected to database!
PostgreSQL version: PostgreSQL 17.7 (178558d) on aarch64-unknown-linux-gnu, compiled by gcc (Debian 12.2.0-14+deb12u1) 12.2.0, 64-bit
✓ 'airbnb' schema exists


## Load CSV Data


In [52]:
# Read the processed listing features into a DataFrame and summarise it
df = pd.read_csv(csv_path)

row_count = len(df)
column_names = list(df.columns)

print(f"Loaded {row_count} rows from CSV")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
df.head()


Loaded 21328 rows from CSV

Columns: ['listing_id', 'price', 'borough', 'neighbourhood_name', 'accommodates', 'bedrooms', 'beds', 'price_per_accommodate', 'host_is_superhost', 'number_of_reviews', 'review_scores_rating', 'availability_365']

First few rows:


Unnamed: 0,listing_id,price,borough,neighbourhood_name,accommodates,bedrooms,beds,price_per_accommodate,host_is_superhost,number_of_reviews,review_scores_rating,availability_365
0,40824219,66.0,Queens,Sunnyside,1,1.0,1.0,66.0,t,16,4.81,77
1,40839416,76.0,Manhattan,East Village,1,1.0,1.0,76.0,t,20,4.95,168
2,40843980,97.0,Queens,Ozone Park,6,2.0,3.0,16.166667,t,93,4.14,364
3,40824301,60.0,Brooklyn,Williamsburg,1,2.0,1.0,60.0,t,26,4.92,187
4,40825740,425.0,Brooklyn,Crown Heights,6,3.0,3.0,70.833333,f,1,5.0,224


In [53]:
# Check for missing values and data types
print("Data info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())


Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21328 entries, 0 to 21327
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   listing_id             21328 non-null  int64  
 1   price                  21328 non-null  float64
 2   borough                21328 non-null  object 
 3   neighbourhood_name     21328 non-null  object 
 4   accommodates           21328 non-null  int64  
 5   bedrooms               21242 non-null  float64
 6   beds                   21287 non-null  float64
 7   price_per_accommodate  21328 non-null  float64
 8   host_is_superhost      21019 non-null  object 
 9   number_of_reviews      21328 non-null  int64  
 10  review_scores_rating   14944 non-null  float64
 11  availability_365       21328 non-null  int64  
dtypes: float64(5), int64(4), object(3)
memory usage: 2.0+ MB
None

Missing values:
listing_id                  0
price                       0
borough 

## Populate Listing Table


In [54]:
# Open the working connection and cursor that the following cells share
conn = psycopg2.connect(CONNECTION_STRING)
cur = conn.cursor()

# Resolve unqualified table names against the airbnb schema first, then public
search_path_sql = "SET search_path TO airbnb, public;"
cur.execute(search_path_sql)
conn.commit()

print("Connected to database and set search path to 'airbnb' schema")


Connected to database and set search path to 'airbnb' schema


### Prepare and Insert Listing Data


In [55]:
# Prepare listing data from CSV.
#
# Rows are de-duplicated on listing_id first: the insert uses
# ON CONFLICT (listing_id) DO UPDATE, and PostgreSQL rejects a single INSERT
# statement that proposes the same conflict key twice. Keeping the last
# occurrence matches "last write wins" upsert semantics.
df_listings = df.drop_duplicates(subset='listing_id', keep='last')

duplicates_dropped = len(df) - len(df_listings)
if duplicates_dropped:
    print(f"⚠ Dropped {duplicates_dropped} duplicate listing_id rows (kept last occurrence)")

print(f"Preparing {len(df_listings)} listings for insertion...")


def _optional(value, cast):
    """Return ``cast(value)``, or None when ``value`` is missing (None/NaN)."""
    return cast(value) if pd.notna(value) else None


def _parse_superhost(value):
    """Convert a 't'/'f'-style flag to bool; missing values stay None."""
    if not pd.notna(value):
        return None
    return str(value).lower() in ['t', 'true', '1', 'yes']


def _build_listing_row(record):
    """Map one CSV record (dict of column -> value) to a listing-table tuple.

    The tuple order must match the column list of the INSERT statement.
    Columns that are absent from the CSV are sent as None (SQL NULL).
    ``record.get`` is used for optional columns so a missing column yields
    NULL rather than an error; ``listing_id`` is mandatory and raises
    KeyError when absent.
    """
    return (
        int(record['listing_id']),  # listing_id
        None,  # neighbourhood_id (not populated from CSV)
        None,  # host_id (not in CSV)
        None,  # host_name (not in CSV)
        None,  # host_since (not in CSV)
        _parse_superhost(record.get('host_is_superhost')),  # host_is_superhost
        None,  # room_type (not in CSV)
        None,  # property_type (not in CSV)
        _optional(record.get('accommodates'), int),  # accommodates
        _optional(record.get('bedrooms'), int),  # bedrooms (fractional values are truncated)
        _optional(record.get('beds'), int),  # beds (fractional values are truncated)
        None,  # bathrooms (not in CSV)
        None,  # bathrooms_text (not in CSV)
        None,  # latitude (not in CSV)
        None,  # longitude (not in CSV)
        _optional(record.get('price'), float),  # price
        None,  # maximum_nights (not in CSV)
        _optional(record.get('number_of_reviews'), int),  # number_of_reviews
        _optional(record.get('availability_365'), int),  # availability_365
        None,  # first_review (not in CSV)
        None,  # last_review (not in CSV)
        _optional(record.get('review_scores_rating'), float),  # review_scores_rating
        None,  # instant_bookable (not in CSV)
        None,  # calculated_host_listings_count (not in CSV)
        None,  # reviews_per_month (not in CSV)
    )


# to_dict('records') yields one plain dict per row, avoiding the per-row
# Series construction that DataFrame.iterrows() performs.
listing_data = [_build_listing_row(record) for record in df_listings.to_dict('records')]

print(f"✓ Prepared {len(listing_data)} listings for insertion")


Preparing 21328 listings for insertion...
✓ Prepared 21328 listings for insertion


In [56]:
# Check existing listings count
cur.execute("SELECT COUNT(*) FROM listing;")
existing_listings = cur.fetchone()[0]
print(f"Existing listings in database: {existing_listings}")

# Upsert statement: new listing_ids are inserted; for ids that already exist
# only the columns sourced from the CSV are overwritten.
insert_listing_query = """
    INSERT INTO listing (
        listing_id, neighbourhood_id, host_id, host_name, host_since, host_is_superhost,
        room_type, property_type, accommodates, bedrooms, beds, bathrooms, bathrooms_text,
        latitude, longitude, price, maximum_nights, number_of_reviews, availability_365,
        first_review, last_review, review_scores_rating, instant_bookable,
        calculated_host_listings_count, reviews_per_month
    )
    VALUES %s
    ON CONFLICT (listing_id) DO UPDATE SET
        host_is_superhost = EXCLUDED.host_is_superhost,
        accommodates = EXCLUDED.accommodates,
        bedrooms = EXCLUDED.bedrooms,
        beds = EXCLUDED.beds,
        price = EXCLUDED.price,
        number_of_reviews = EXCLUDED.number_of_reviews,
        availability_365 = EXCLUDED.availability_365,
        review_scores_rating = EXCLUDED.review_scores_rating;
"""

# Insert in batches so progress can be reported; all batches share one
# transaction, so the load is all-or-nothing.
batch_size = 1000
total_inserted = 0  # rows sent to the database (inserted or updated)

try:
    for i in range(0, len(listing_data), batch_size):
        batch = listing_data[i:i+batch_size]
        execute_values(cur, insert_listing_query, batch)
        total_inserted += len(batch)
        if (i // batch_size + 1) % 10 == 0:
            print(f"  Processed {total_inserted} listings...")

    conn.commit()
except Exception:
    # Leave the shared connection usable for later cells instead of stuck in
    # an aborted transaction, then surface the original error.
    conn.rollback()
    raise

cur.execute("SELECT COUNT(*) FROM listing;")
new_listings = cur.fetchone()[0]

# The row-count delta only reflects brand-new rows; the remainder of what was
# sent hit the ON CONFLICT branch and updated existing rows.
# NOTE(review): assumes no other writer changed the table during the load.
newly_inserted = new_listings - existing_listings
updated_existing = total_inserted - newly_inserted
print(f"\n✓ Inserted {newly_inserted} new listings, updated {updated_existing} existing listings")
print(f"Total listings in database: {new_listings}")


Existing listings in database: 0
  Processed 10000 listings...
  Processed 20000 listings...

✓ Inserted/updated 21328 listings
Total listings in database: 21328


## Verification

Let's verify the data was inserted correctly.


In [57]:
def _fetch_scalar(query):
    """Run ``query`` on the shared cursor and return the first column of its first row."""
    cur.execute(query)
    return cur.fetchone()[0]


# Check listing table
listing_count = _fetch_scalar("SELECT COUNT(*) FROM listing;")
print(f"Total listings in database: {listing_count}")

# Sample data
print("\nSample listings:")
cur.execute("""
    SELECT listing_id, price, accommodates, bedrooms, beds, 
           host_is_superhost, number_of_reviews, review_scores_rating, availability_365
    FROM listing 
    LIMIT 5;
""")
for row in cur.fetchall():
    # price may be NULL (the load step sends None for missing prices);
    # formatting None with ':.2f' would raise TypeError, so guard it.
    price_text = f"${row[1]:.2f}" if row[1] is not None else "N/A"
    print(f"  Listing ID: {row[0]}, Price: {price_text}, "
          f"Accommodates: {row[2]}, Bedrooms: {row[3]}, Beds: {row[4]}, "
          f"Superhost: {row[5]}, Reviews: {row[6]}, Rating: {row[7]}, Availability: {row[8]}")

# Check data quality
print("\nData quality checks:")
null_price = _fetch_scalar("SELECT COUNT(*) FROM listing WHERE price IS NULL;")
print(f"  Listings with null price: {null_price}")

valid_listings = _fetch_scalar("SELECT COUNT(*) FROM listing WHERE listing_id IS NOT NULL;")
print(f"  Valid listings: {valid_listings}")

has_superhost = _fetch_scalar("SELECT COUNT(*) FROM listing WHERE host_is_superhost IS NOT NULL;")
print(f"  Listings with superhost info: {has_superhost}")

has_rating = _fetch_scalar("SELECT COUNT(*) FROM listing WHERE review_scores_rating IS NOT NULL;")
print(f"  Listings with ratings: {has_rating}")


Total listings in database: 21328

Sample listings:
  Listing ID: 40824219, Price: $66.00, Accommodates: 1, Bedrooms: 1, Beds: 1, Superhost: True, Reviews: 16, Rating: 4.81, Availability: 77
  Listing ID: 40839416, Price: $76.00, Accommodates: 1, Bedrooms: 1, Beds: 1, Superhost: True, Reviews: 20, Rating: 4.95, Availability: 168
  Listing ID: 40843980, Price: $97.00, Accommodates: 6, Bedrooms: 2, Beds: 3, Superhost: True, Reviews: 93, Rating: 4.14, Availability: 364
  Listing ID: 40824301, Price: $60.00, Accommodates: 1, Bedrooms: 2, Beds: 1, Superhost: True, Reviews: 26, Rating: 4.92, Availability: 187
  Listing ID: 40825740, Price: $425.00, Accommodates: 6, Bedrooms: 3, Beds: 3, Superhost: False, Reviews: 1, Rating: 5.00, Availability: 224

Data quality checks:
  Listings with null price: 0
  Valid listings: 21328
  Listings with superhost info: 21019
  Listings with ratings: 14944


In [58]:
# Release the shared cursor first, then the connection it belongs to
for db_resource in (cur, conn):
    db_resource.close()
print("✓ Database connection closed")


✓ Database connection closed
