# Zillow Data Analysis
This notebook loads and joins three Zillow datasets using DuckDB.

In [1]:
import duckdb
import pandas as pd

# Configure how pandas renders DataFrames in this notebook:
# no cap on columns or total width, up to 100 rows, and cell text cut at 50 chars.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.width': None,
    'display.max_colwidth': 50,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Load Data into DuckDB

In [2]:
# Connect to DuckDB (no database file is passed to connect())
conn = duckdb.connect()

# CSV file paths, written as raw strings so the Windows backslashes stay literal.
# NOTE(review): these are absolute paths on one machine; consider a configurable
# data directory so the notebook can be re-run elsewhere.
csv_paths = [
    r"C:\Users\JOHNV\Documents\CS\ED\ZILLOW_DATA_962c837a6ccefddddf190101e0bafdaf\ZILLOW_DATA_962c837a6ccefddddf190101e0bafdaf.csv",
    r"C:\Users\JOHNV\Documents\CS\ED\ZILLOW_INDICATORS_e93833a53d6c88463446a364cda611cc\ZILLOW_INDICATORS_e93833a53d6c88463446a364cda611cc.csv",
    r"C:\Users\JOHNV\Documents\CS\ED\ZILLOW_REGIONS_1a51d107db038a83ac171d604cb48d5b\ZILLOW_REGIONS_1a51d107db038a83ac171d604cb48d5b.csv",
]

# Table names, listed in the same order as csv_paths
names = [
    "ZILLOW_DATA",
    "ZILLOW_INDICATORS",
    "ZILLOW_REGIONS",
]

# zip() stops at the shorter list, so fail loudly if the two lists drift apart
# instead of silently skipping a table.
if len(names) != len(csv_paths):
    raise ValueError("names and csv_paths must have the same length")

# (Re)create each table from its CSV file. CREATE OR REPLACE makes the cell
# safe to re-run without a separate DROP TABLE pass.
for name, csv_path in zip(names, csv_paths):
    # Double any single quote so the path is a valid SQL string literal.
    # Backslashes are passed through unchanged: a plain '...' SQL literal does
    # not treat backslash as an escape character, so no doubling is needed.
    sql_path = csv_path.replace("'", "''")
    conn.execute(
        f"CREATE OR REPLACE TABLE {name} AS "
        f"SELECT * FROM read_csv_auto('{sql_path}')"
    )
    print(f"✓ Loaded {name}")

✓ Loaded ZILLOW_DATA
✓ Loaded ZILLOW_INDICATORS
✓ Loaded ZILLOW_REGIONS
✓ Loaded ZILLOW_REGIONS


## Explore Individual Tables

In [3]:
# Preview ZILLOW_DATA: show the first rows, then report the table's total row count
print("ZILLOW_DATA - First 5 rows:")
zillow_data = conn.execute("SELECT * FROM ZILLOW_DATA LIMIT 5").fetchdf()
display(zillow_data)

# COUNT(*) yields a single row with a single value
count_row = conn.execute("SELECT COUNT(*) FROM ZILLOW_DATA").fetchone()
print(f"\nShape: {count_row[0]} rows")

ZILLOW_DATA - First 5 rows:


Unnamed: 0,indicator_id,region_id,date,value
0,ZATT,3101,1998-01-31,338849.0
1,ZATT,3101,1998-02-28,342993.0
2,ZATT,3101,1998-03-31,346763.0
3,ZATT,3101,1998-04-30,349356.0
4,ZATT,3101,1998-05-31,351981.0



Shape: 159663189 rows


In [4]:
# Preview ZILLOW_INDICATORS: show the first rows, then report the table's total row count
print("ZILLOW_INDICATORS - First 5 rows:")
zillow_indicators = conn.execute("SELECT * FROM ZILLOW_INDICATORS LIMIT 5").fetchdf()
display(zillow_indicators)

# COUNT(*) yields a single row with a single value
count_row = conn.execute("SELECT COUNT(*) FROM ZILLOW_INDICATORS").fetchone()
print(f"\nShape: {count_row[0]} rows")

ZILLOW_INDICATORS - First 5 rows:


Unnamed: 0,indicator_id,indicator,category
0,SAAW,Median Sale Price (Smooth & Seasonally Adjuste...,Inventory and sales
1,SRAM,"Median Sale Price (Raw, All Homes, Monthly)",Inventory and sales
2,NSAM,"Median Days to Pending (Smooth, All Homes, Mon...",Inventory and sales
3,RSNA,ZORI (Smoothed): All Homes Plus Multifamily Ti...,Rentals
4,RSSA,"ZORI (Smoothed, Seasonally Adjusted): All Home...",Rentals



Shape: 56 rows


In [5]:
# Preview ZILLOW_REGIONS: show the first rows, then report the table's total row count
print("ZILLOW_REGIONS - First 5 rows:")
zillow_regions = conn.execute("SELECT * FROM ZILLOW_REGIONS LIMIT 5").fetchdf()
display(zillow_regions)

# COUNT(*) yields a single row with a single value
count_row = conn.execute("SELECT COUNT(*) FROM ZILLOW_REGIONS").fetchone()
print(f"\nShape: {count_row[0]} rows")

ZILLOW_REGIONS - First 5 rows:


Unnamed: 0,region_id,region_type,region
0,96208,zip,"90706;CA;Los Angeles-Long Beach-Anaheim, CA;Be..."
1,394415,metro,"Bridgeport, CT"
2,394653,metro,"Greenville, SC"
3,394312,metro,"Albuquerque, NM"
4,394357,metro,"Bakersfield, CA"



Shape: 89305 rows


## Join All Three Tables

In [6]:
# Preview the three-way join: every ZILLOW_DATA row is matched to its indicator
# (via indicator_id) and its region (via region_id).
# SELECT * keeps both copies of each join key, which is why the result also
# carries indicator_id_1 and region_id_1 columns.
query = """
SELECT *
FROM ZILLOW_DATA d
JOIN ZILLOW_INDICATORS i ON d.indicator_id = i.indicator_id
JOIN ZILLOW_REGIONS r ON d.region_id = r.region_id
LIMIT 10
"""

result = conn.execute(query).fetchdf()

row_count, column_count = result.shape
print(f"Joined Data: {row_count} rows × {column_count} columns")
display(result)

Joined Data: 10 rows × 10 columns


Unnamed: 0,indicator_id,region_id,date,value,indicator_id_1,indicator,category,region_id_1,region_type,region
0,ZATT,3101,1998-01-31,338849.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
1,ZATT,3101,1998-02-28,342993.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
2,ZATT,3101,1998-03-31,346763.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
3,ZATT,3101,1998-04-30,349356.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
4,ZATT,3101,1998-05-31,351981.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
5,ZATT,3101,1998-06-30,354231.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
6,ZATT,3101,1998-07-31,355713.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
7,ZATT,3101,1998-08-31,357529.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
8,ZATT,3101,1998-09-30,360523.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...
9,ZATT,3101,1998-10-31,363858.0,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values,3101,county,Los Angeles County;CA;Los Angeles-Long Beach-A...


## View All Columns

In [7]:
# Print a 1-based, right-aligned numbered list of the joined frame's columns
print("All columns in joined data:")
numbered_columns = [
    f"{position:2d}. {column_name}"
    for position, column_name in enumerate(result.columns, 1)
]
for line in numbered_columns:
    print(line)

All columns in joined data:
 1. indicator_id
 2. region_id
 3. date
 4. value
 5. indicator_id_1
 6. indicator
 7. category
 8. region_id_1
 9. region_type
10. region


## Get More Rows

In [None]:
# Same three-way join as before, restricted to the 'SRAW' indicator.
# The row limit is still 10, so this returns a filtered sample, not a larger one.
query_more = """
SELECT *
FROM ZILLOW_DATA d
JOIN ZILLOW_INDICATORS i ON d.indicator_id = i.indicator_id
JOIN ZILLOW_REGIONS r ON d.region_id = r.region_id
WHERE i.indicator_id = 'SRAW'
LIMIT 10
"""

result_more = conn.execute(query_more).fetchdf()

row_count, column_count = result_more.shape
print(f"Extended Data: {row_count} rows × {column_count} columns")
display(result_more)

Extended Data: 10 rows × 10 columns


Unnamed: 0,indicator_id,region_id,date,value,indicator_id_1,indicator,category,region_id_1,region_type,region
0,SRAW,395050,2019-03-02,550000.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,395050,metro,"Salinas, CA"
1,SRAW,395030,2019-03-02,190500.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,395030,metro,"Rochester, MN"
2,SRAW,753895,2019-03-02,163950.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,753895,metro,"Lafayette, IN"
3,SRAW,394459,2019-03-02,230500.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,394459,metro,"Charlottesville, VA"
4,SRAW,395011,2019-03-02,140000.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,395011,metro,"Racine, WI"
5,SRAW,394380,2019-03-02,360000.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,394380,metro,"Bend, OR"
6,SRAW,395047,2019-03-02,97000.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,395047,metro,"Saginaw, MI"
7,SRAW,394539,2019-03-02,201760.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,394539,metro,"Dover, DE"
8,SRAW,395007,2019-03-02,192900.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,395007,metro,"Pueblo, CO"
9,SRAW,394710,2019-03-02,103000.0,SRAW,"Median Sale Price (Raw, All Homes, Weekly View)",Inventory and sales,394710,metro,"Jackson, MI"


: 

## Data Info

In [9]:
# Summarize result_more: column dtypes / non-null counts first, then describe() statistics
print("DataFrame Info:")
result_more.info()

divider = "=" * 80
print("\n" + divider)
print("\nBasic Statistics:")
summary_stats = result_more.describe()
display(summary_stats)

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   indicator_id    10 non-null     object        
 1   region_id       10 non-null     int64         
 2   date            10 non-null     datetime64[us]
 3   value           10 non-null     float64       
 4   indicator_id_1  10 non-null     object        
 5   indicator       10 non-null     object        
 6   category        10 non-null     object        
 7   region_id_1     10 non-null     int64         
 8   region_type     10 non-null     object        
 9   region          10 non-null     object        
dtypes: datetime64[us](1), float64(1), int64(2), object(6)
memory usage: 932.0+ bytes


Basic Statistics:


Unnamed: 0,region_id,date,value,region_id_1
count,10.0,10,10.0,10.0
mean,394881.3,2020-08-22 16:48:00,11.8,394881.3
min,394521.0,2020-08-15 00:00:00,5.0,394521.0
25%,394849.0,2020-08-16 18:00:00,5.0,394849.0
50%,394862.0,2020-08-22 00:00:00,6.0,394862.0
75%,395005.25,2020-08-29 00:00:00,16.0,395005.25
max,395053.0,2020-08-29 00:00:00,33.0,395053.0
std,157.235174,,10.591401,157.235174


## Region Types

In [10]:
# Adjust the display so long text is not truncated.
# Note: this changes the option globally, so it also affects cells run after this one.
pd.set_option('display.max_colwidth', None)

# Query every distinct region type (region_type)
query_region_types = """
SELECT
    region_type
FROM ZILLOW_REGIONS
GROUP BY region_type
"""

region_types = conn.execute(query_region_types).fetchdf()
print("Types of Regions Available:")
display(region_types)

# Show a few example regions for each type.
# The region type is bound as a query parameter (?) rather than pasted into the
# SQL text, so a value containing a quote character cannot break the statement.
query_region_examples = """
    SELECT region_id, region_type, region
    FROM ZILLOW_REGIONS
    WHERE region_type = ?
    LIMIT 3
"""

print("\n" + "="*80)
print("Examples of Each Region Type:")
print("="*80)
for rt in region_types['region_type']:
    print(f"\n{rt}:")
    examples = conn.execute(query_region_examples, [rt]).fetchdf()
    display(examples)

Types of Regions Available:


Unnamed: 0,region_type
0,city
1,zip
2,county
3,metro
4,state
5,neigh



Examples of Each Region Type:

city:


Unnamed: 0,region_id,region_type,region
0,32503,city,"Lincoln;NE;Lincoln, NE;Lancaster County"
1,17384,city,"Chandler;AZ;Phoenix-Mesa-Chandler, AZ;Maricopa County"
2,13437,city,Randolph Township; NJ; New York-Newark-Jersey City; Morris County



zip:


Unnamed: 0,region_id,region_type,region
0,96208,zip,"90706;CA;Los Angeles-Long Beach-Anaheim, CA;Bellflower;Los Angeles County"
1,95315,zip,"87121;NM;Albuquerque, NM;Albuquerque;Bernalillo County"
2,91325,zip,"76244;TX;Dallas-Fort Worth-Arlington, TX;Fort Worth;Tarrant County"



county:


Unnamed: 0,region_id,region_type,region
0,2841,county,"San Diego County;CA;San Diego-Chula Vista-Carlsbad, CA"
1,1286,county,"Orange County;CA;Los Angeles-Long Beach-Anaheim, CA"
2,445,county,"Clark County;NV;Las Vegas-Henderson-Paradise, NV"



metro:


Unnamed: 0,region_id,region_type,region
0,394415,metro,"Bridgeport, CT"
1,394653,metro,"Greenville, SC"
2,394312,metro,"Albuquerque, NM"



state:


Unnamed: 0,region_id,region_type,region
0,9,state,California
1,11,state,Connecticut
2,55,state,Utah



neigh:


Unnamed: 0,region_id,region_type,region
0,274772,neigh,"Northeast Dallas; TX; Dallas-Fort Worth-Arlington, TX; Dallas County; Dallas"
1,273698,neigh,"Far North; TX; Dallas-Fort Worth-Arlington, TX; Dallas County; Dallas"
2,275473,neigh,"Southeast Dallas; TX; Dallas-Fort Worth-Arlington, TX; Dallas County; Dallas"


In [11]:
## Indicators Analysis

In [12]:
# Count how many indicators exist (COUNT(*) yields a single row with a single value)
count_row = conn.execute("SELECT COUNT(*) FROM ZILLOW_INDICATORS").fetchone()
total_indicators = count_row[0]
print(f"Total de Indicadores: {total_indicators}")

# Show 5 example indicators
banner = "=" * 80
print("\n" + banner)
print("5 Exemplos de Indicadores:")
print(banner)

# NOTE(review): the SELECT list ends with a trailing comma; it is kept unchanged here.
examples_sql = """
    SELECT 
        indicator_id,
        indicator,
        category,
    FROM ZILLOW_INDICATORS 
    LIMIT 5
"""
examples = conn.execute(examples_sql).fetchdf()
display(examples)

Total de Indicadores: 56

5 Exemplos de Indicadores:


Unnamed: 0,indicator_id,indicator,category
0,SAAW,"Median Sale Price (Smooth & Seasonally Adjusted, All Homes, Weekly View)",Inventory and sales
1,SRAM,"Median Sale Price (Raw, All Homes, Monthly)",Inventory and sales
2,NSAM,"Median Days to Pending (Smooth, All Homes, Monthly)",Inventory and sales
3,RSNA,ZORI (Smoothed): All Homes Plus Multifamily Time Series ($),Rentals
4,RSSA,"ZORI (Smoothed, Seasonally Adjusted): All Homes Plus Multifamily Time Series ($)",Rentals
