In [11]:
import pandas as pd
import os
from sqlalchemy import create_engine, text

In [12]:
db_host = os.getenv('DB_HOST', 'localhost')

try:
    engine.dispose()
except:
    pass

engine = create_engine(
    f"postgresql://postgres:postgres@{db_host}:5432/airbnb",
    pool_pre_ping=True,
    pool_recycle=3600
)

with engine.connect() as conn:
    conn.rollback()

engine

Engine(postgresql://postgres:***@localhost:5432/airbnb)

In [13]:
with engine.connect() as conn:
    conn.execute(text("CREATE SCHEMA IF NOT EXISTS gold;"))
    conn.commit()


In [14]:
from sqlalchemy import text

create_sql = text("""
CREATE SCHEMA IF NOT EXISTS gold;

DROP TABLE IF EXISTS gold.dim_hosts CASCADE;

CREATE TABLE gold.dim_hosts (
    srk_host_id BIGINT PRIMARY KEY,
    host_id_original BIGINT,
    host_name TEXT,
    host_response_time TEXT,
    host_response_rate NUMERIC,
    host_is_superhost BOOLEAN,
    host_listings_count INT
);

DROP TABLE IF EXISTS gold.dim_locations CASCADE;

CREATE TABLE gold.dim_locations (
    srk_location_id SERIAL PRIMARY KEY,
    latitude NUMERIC NOT NULL,
    longitude NUMERIC NOT NULL,
    UNIQUE(latitude, longitude)
);

DROP TABLE IF EXISTS gold.dim_properties CASCADE;

CREATE TABLE gold.dim_properties (
    srk_property_id SERIAL PRIMARY KEY,
    srk_host_id BIGINT REFERENCES gold.dim_hosts(srk_host_id),
    srk_location_id INT REFERENCES gold.dim_locations(srk_location_id),
    property_type TEXT,
    room_type TEXT,
    accommodates INT,
    bathrooms NUMERIC,
    bedrooms INT,
    beds INT,
    bed_type TEXT,
    instant_bookable BOOLEAN,
    is_business_travel_ready BOOLEAN,
    cancellation_policy TEXT,
    n_amenities INT
);

CREATE INDEX IF NOT EXISTS idx_dim_properties_host_id 
    ON gold.dim_properties(srk_host_id);

CREATE INDEX IF NOT EXISTS idx_dim_properties_location_id 
    ON gold.dim_properties(srk_location_id);

DROP TABLE IF EXISTS gold.dim_reviews CASCADE;

CREATE TABLE gold.dim_reviews (
    srk_review_id SERIAL PRIMARY KEY,
    srk_host_id BIGINT REFERENCES gold.dim_hosts(srk_host_id),
    srk_property_id INT REFERENCES gold.dim_properties(srk_property_id),
    number_of_reviews INT,
    review_scores_rating NUMERIC,
    review_scores_accuracy NUMERIC,
    review_scores_cleanliness NUMERIC,
    review_scores_checkin NUMERIC,
    review_scores_communication NUMERIC,
    review_scores_location NUMERIC,
    review_scores_value NUMERIC
);

DROP TABLE IF EXISTS gold.fact_ocorrencias CASCADE;

CREATE TABLE gold.fact_ocorrencias (
    srk_fact_id SERIAL PRIMARY KEY,
    srk_host_id BIGINT REFERENCES gold.dim_hosts(srk_host_id) ON DELETE SET NULL,
    srk_property_id INT REFERENCES gold.dim_properties(srk_property_id) ON DELETE SET NULL,
    srk_location_id INT REFERENCES gold.dim_locations(srk_location_id) ON DELETE SET NULL,
    srk_review_id INT REFERENCES gold.dim_reviews(srk_review_id) ON DELETE SET NULL,
    price NUMERIC,
    security_deposit NUMERIC,
    cleaning_fee NUMERIC,
    guests_included INT,
    minimum_nights INT,
    ano INT,
    mes INT
);
""")

with engine.connect() as conn:
    conn.execute(create_sql)
    conn.commit()


# Carregando dados da camada Silver

In [15]:
df_silver = pd.read_sql("SELECT * FROM silver.airbnb_2019", engine)
print(f"Registros carregados da camada Silver: {len(df_silver)}")
df_silver.head()

Registros carregados da camada Silver: 98783


Unnamed: 0,host_id,host_name,host_response_time,host_response_rate,host_is_superhost,host_listings_count,latitude,longitude,property_type,room_type,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,is_business_travel_ready,cancellation_policy,ano,mes,n_amenities
0,68997,Matthias,within an hour,100.0,False,2.0,-22.96592,-43.17896,Condominium,Entire home/apt,...,10.0,10.0,10.0,9.0,True,False,strict_14_with_grace_period,2019,4,25
1,102840,Viviane,within a day,88.0,False,3.0,-22.97712,-43.19045,Apartment,Entire home/apt,...,9.0,10.0,10.0,9.0,False,False,strict_14_with_grace_period,2019,4,14
2,135635,Renata,within an hour,100.0,True,1.0,-22.98302,-43.21427,Apartment,Entire home/apt,...,10.0,10.0,10.0,10.0,True,False,strict_14_with_grace_period,2019,4,17
3,153232,Patricia,within a few hours,100.0,True,1.0,-22.98816,-43.19359,Apartment,Entire home/apt,...,10.0,10.0,10.0,10.0,False,False,strict_14_with_grace_period,2019,4,31
4,153691,Patricia Miranda & Paulo,within an hour,100.0,True,1.0,-22.98127,-43.19046,Loft,Entire home/apt,...,10.0,10.0,10.0,10.0,True,False,strict_14_with_grace_period,2019,4,35


# Populando DIM_HOSTS

In [16]:
dim_hosts = df_silver[['host_id', 'host_name', 'host_response_time', 
                        'host_response_rate', 'host_is_superhost', 
                        'host_listings_count']].drop_duplicates(subset=['host_id'])

dim_hosts = dim_hosts.reset_index(drop=True)
dim_hosts.insert(0, 'srk_host_id', range(1, len(dim_hosts) + 1))
dim_hosts.rename(columns={'host_id': 'host_id_original'}, inplace=True)

dim_hosts.to_sql('dim_hosts', schema='gold', con=engine, if_exists='append', index=False)
print(f"âœ… DIM_HOSTS populada com {len(dim_hosts)} registros")
dim_hosts.head()

âœ… DIM_HOSTS populada com 9090 registros


Unnamed: 0,srk_host_id,host_id_original,host_name,host_response_time,host_response_rate,host_is_superhost,host_listings_count
0,1,68997,Matthias,within an hour,100.0,False,2.0
1,2,102840,Viviane,within a day,88.0,False,3.0
2,3,135635,Renata,within an hour,100.0,True,1.0
3,4,153232,Patricia,within a few hours,100.0,True,1.0
4,5,153691,Patricia Miranda & Paulo,within an hour,100.0,True,1.0


# Populando DIM_LOCATIONS

In [17]:
dim_locations = df_silver[['latitude', 'longitude']].drop_duplicates()
dim_locations = dim_locations.reset_index(drop=True)

dim_locations.to_sql('dim_locations', schema='gold', con=engine, if_exists='append', index=False)

dim_locations = pd.read_sql("SELECT * FROM gold.dim_locations", engine)
print(f"âœ… DIM_LOCATIONS populada com {len(dim_locations)} registros")
dim_locations.head()

âœ… DIM_LOCATIONS populada com 21147 registros


Unnamed: 0,srk_location_id,latitude,longitude
0,1,-22.96592,-43.17896
1,2,-22.97712,-43.19045
2,3,-22.98302,-43.21427
3,4,-22.98816,-43.19359
4,5,-22.98127,-43.19046


# Populando DIM_PROPERTIES

In [None]:
df_with_keys = df_silver.merge(
    dim_hosts[['host_id_original', 'srk_host_id']], 
    left_on='host_id', 
    right_on='host_id_original', 
    how='left'
)

df_with_keys = df_with_keys.merge(
    dim_locations[['latitude', 'longitude', 'srk_location_id']], 
    on=['latitude', 'longitude'], 
    how='left'
)

dim_properties = df_with_keys[[
    'srk_host_id', 'srk_location_id', 'property_type', 'room_type',
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
    'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
    'n_amenities'
]].copy()

dim_properties['accommodates'] = dim_properties['accommodates'].astype('Int64')
dim_properties['bedrooms'] = dim_properties['bedrooms'].astype('Int64')
dim_properties['beds'] = dim_properties['beds'].astype('Int64')
dim_properties['n_amenities'] = dim_properties['n_amenities'].astype('Int64')

dim_properties.to_sql('dim_properties', schema='gold', con=engine, if_exists='append', index=False)

dim_properties = pd.read_sql("SELECT * FROM gold.dim_properties", engine)
print(f"âœ… DIM_PROPERTIES populada com {len(dim_properties)} registros")
dim_properties.head()

# Populando DIM_REVIEWS

In [None]:
dim_properties_loaded = pd.read_sql("SELECT * FROM gold.dim_properties", engine)

df_with_keys['temp_key'] = (
    df_with_keys['srk_host_id'].astype(str) + '_' +
    df_with_keys['srk_location_id'].astype(str) + '_' +
    df_with_keys['property_type'].astype(str) + '_' +
    df_with_keys['room_type'].astype(str) + '_' +
    df_with_keys['accommodates'].astype(str)
)

dim_properties_loaded['temp_key'] = (
    dim_properties_loaded['srk_host_id'].astype(str) + '_' +
    dim_properties_loaded['srk_location_id'].astype(str) + '_' +
    dim_properties_loaded['property_type'].astype(str) + '_' +
    dim_properties_loaded['room_type'].astype(str) + '_' +
    dim_properties_loaded['accommodates'].astype(str)
)

df_with_keys = df_with_keys.merge(
    dim_properties_loaded[['srk_property_id', 'temp_key']], 
    on='temp_key', 
    how='left'
)

dim_reviews = df_with_keys[[
    'srk_host_id', 'srk_property_id', 'number_of_reviews',
    'review_scores_rating', 'review_scores_accuracy', 
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 
    'review_scores_value'
]].copy()

dim_reviews['number_of_reviews'] = dim_reviews['number_of_reviews'].astype('Int64')

dim_reviews.to_sql('dim_reviews', schema='gold', con=engine, if_exists='append', index=False)

dim_reviews = pd.read_sql("SELECT * FROM gold.dim_reviews", engine)
print(f"âœ… DIM_REVIEWS populada com {len(dim_reviews)} registros")
dim_reviews.head()

âœ… DIM_REVIEWS populada com 682795 registros


Unnamed: 0,srk_review_id,srk_host_id,srk_property_id,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,1,1,1,224,93.0,9.0,10.0,10.0,10.0,10.0,9.0
1,2,1,8407,224,93.0,9.0,10.0,10.0,10.0,10.0,9.0
2,3,1,16360,224,93.0,9.0,10.0,10.0,10.0,10.0,9.0
3,4,1,40603,224,93.0,9.0,10.0,10.0,10.0,10.0,9.0
4,5,1,48463,224,93.0,9.0,10.0,10.0,10.0,10.0,9.0


# Populando FACT_OCORRENCIAS

In [None]:
df_with_keys['temp_review_key'] = (
    df_with_keys['srk_host_id'].astype(str) + '_' +
    df_with_keys['srk_property_id'].astype(str) + '_' +
    df_with_keys['number_of_reviews'].astype(str)
)

dim_reviews['temp_review_key'] = (
    dim_reviews['srk_host_id'].astype(str) + '_' +
    dim_reviews['srk_property_id'].astype(str) + '_' +
    dim_reviews['number_of_reviews'].astype(str)
)

df_with_keys = df_with_keys.merge(
    dim_reviews[['srk_review_id', 'temp_review_key']], 
    on='temp_review_key', 
    how='left'
)

fact_ocorrencias = df_with_keys[[
    'srk_host_id', 'srk_property_id', 'srk_location_id', 'srk_review_id',
    'price', 'security_deposit', 'cleaning_fee', 'guests_included',
    'minimum_nights', 'ano', 'mes'
]].copy()

fact_ocorrencias['guests_included'] = fact_ocorrencias['guests_included'].astype('Int64')
fact_ocorrencias['minimum_nights'] = fact_ocorrencias['minimum_nights'].astype('Int64')
fact_ocorrencias['ano'] = fact_ocorrencias['ano'].astype('Int64')
fact_ocorrencias['mes'] = fact_ocorrencias['mes'].astype('Int64')

fact_ocorrencias.to_sql('fact_ocorrencias', schema='gold', con=engine, if_exists='append', index=False)

print(f"âœ… FACT_OCORRENCIAS populada com {len(fact_ocorrencias)} registros")

with engine.connect() as conn:
    result = conn.execute(text("""
        SELECT 
            (SELECT COUNT(*) FROM gold.dim_hosts) as hosts,
            (SELECT COUNT(*) FROM gold.dim_locations) as locations,
            (SELECT COUNT(*) FROM gold.dim_properties) as properties,
            (SELECT COUNT(*) FROM gold.dim_reviews) as reviews,
            (SELECT COUNT(*) FROM gold.fact_ocorrencias) as facts
    """))
    totais = result.fetchone()
    print(f"\nðŸ“Š Totais finais:")
    print(f"   - Hosts: {totais[0]}")
    print(f"   - Locations: {totais[1]}")
    print(f"   - Properties: {totais[2]}")
    print(f"   - Reviews: {totais[3]}")
    print(f"   - Facts: {totais[4]}")

PendingRollbackError: Can't reconnect until invalid transaction is rolled back.  Please rollback() fully before proceeding (Background on this error at: https://sqlalche.me/e/20/8s2b)