In [1]:
#install packages
import pandas as pd
import numpy as np
import json
import requests
from io import StringIO
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from math import ceil
import datetime
import calendar
from sqlalchemy import create_engine

In [2]:
connection_string = ['connectionString']
CONNECTION_STRING_AZURE_STORAGE = connection_string
container_azure = 'housingsales'

# Initialize the BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING_AZURE_STORAGE)

# Get the container client
container_client = blob_service_client.get_container_client(container_azure)

water_toxicity_df = pd.DataFrame()

# List all blobs in the specified container
blob_list = container_client.list_blobs()
for blob in blob_list:
    print(blob.name)
    blob_client = container_client.get_blob_client(blob=blob.name)
    blob_data = blob_client.download_blob()
    blob_content = blob_data.readall().decode('utf-8')
    df = pd.read_csv(StringIO(blob_content))
    # Display the head of the DataFrame
    print(df.head())
    # If I have only one csv, I am doing the following instructions
    housingsales_df = df.copy()


housingsales.csv
  state  property_zip5 property_street_address   property_city  \
0    AZ        85143.0       1 E CORAL BEAN DR  SAN TAN VALLEY   
1    AZ        85143.0      1 E PEPPERGRASS PL  SAN TAN VALLEY   
2    AZ        85173.0         1 N MESQUITE DR        SUPERIOR   
3    AZ        85143.0      1 W CANYON ROCK RD  SAN TAN VALLEY   
4    AZ        85143.0        1 W MILL REEF DR  SAN TAN VALLEY   

  property_county property_id        sale_datetime property_type  sale_price  \
0           PINAL   210572230  2021-04-20 00:00:00   RESIDENTIAL         0.0   
1           PINAL   210571490  2020-09-16 00:00:00   RESIDENTIAL         0.0   
2           PINAL   10526004A  2021-02-25 00:00:00           NaN         0.0   
3           PINAL   210702840  2011-09-01 00:00:00   RESIDENTIAL     90000.0   
4           PINAL   210591110  2009-08-14 00:00:00   RESIDENTIAL     65000.0   

  seller_1_name  ... land_assessed_date  seller_1_state  seller_2_state  \
0           NaN  ...          

  df = pd.read_csv(StringIO(blob_content))


In [3]:
housingsales_df = housingsales_df[housingsales_df['sale_price'] != 0.0]
# Assuming 'housingsales_df' is your pandas DataFrame
housingsales_df['sale_price'] = housingsales_df['sale_price'].astype('float')  # Convert to float if it's not already

# If 'sale_price' represents currency values and you want to represent them as 'money' data type
housingsales_df['sale_price'] = housingsales_df['sale_price'].astype('float').map('${:,.2f}'.format)

# Now 'sale_price' column will be formatted as currency (e.g., $1,234.56)
housingsales_df.head()

Unnamed: 0,state,property_zip5,property_street_address,property_city,property_county,property_id,sale_datetime,property_type,sale_price,seller_1_name,...,land_assessed_date,seller_1_state,seller_2_state,buyer_1_state,buyer_2_state,total_assessed_value,total_appraised_value,land_appraised_value,building_appraised_value,land_type
3,AZ,85143.0,1 W CANYON ROCK RD,SAN TAN VALLEY,PINAL,210702840,2011-09-01 00:00:00,RESIDENTIAL,"$90,000.00",,...,,,,,,,,,,
4,AZ,85143.0,1 W MILL REEF DR,SAN TAN VALLEY,PINAL,210591110,2009-08-14 00:00:00,RESIDENTIAL,"$65,000.00",,...,,,,,,,,,,
8,AZ,85623.0,10 N BACHMAN WASH RD,ORACLE,PINAL,30823026B,2009-07-21 00:00:00,RESIDENTIAL,"$294,254.00",,...,,,,,,,,,,
9,AZ,85132.0,10 N CENTRAL AVE,FLORENCE,PINAL,200440810,2003-11-04 00:00:00,RESIDENTIAL,"$98,000.00",,...,,,,,,,,,,
10,AZ,85138.0,10 N CHERRY LN,MARICOPA,PINAL,501290850,2000-07-17 00:00:00,MOBILE HOME,"$12,500.00",,...,,,,,,,,,,


In [4]:
housingsales_df = housingsales_df.loc[:, ['property_street_address', 'property_id', 'sale_price']]
housingsales_df.head()

Unnamed: 0,property_street_address,property_id,sale_price
3,1 W CANYON ROCK RD,210702840,"$90,000.00"
4,1 W MILL REEF DR,210591110,"$65,000.00"
8,10 N BACHMAN WASH RD,30823026B,"$294,254.00"
9,10 N CENTRAL AVE,200440810,"$98,000.00"
10,10 N CHERRY LN,501290850,"$12,500.00"


In [5]:
# Remove the dollar sign and commas, and convert to float
housingsales_df['sale_price'] = housingsales_df['sale_price'].replace('[\$,]', '', regex=True).astype(float)

# Calculate the desired statistics
min_sale_price = housingsales_df['sale_price'].min()
max_sale_price = housingsales_df['sale_price'].max()
median_sale_price = housingsales_df['sale_price'].median()
average_sale_price = housingsales_df['sale_price'].mean()

# Add these as new columns
housingsales_df['minsaleprice'] = min_sale_price
housingsales_df['maxsaleprice'] = max_sale_price
housingsales_df['mediansaleprice'] = median_sale_price
housingsales_df['averagesaleprice'] = average_sale_price

housingsales_df.head()

Unnamed: 0,property_street_address,property_id,sale_price,minsaleprice,maxsaleprice,mediansaleprice,averagesaleprice
3,1 W CANYON ROCK RD,210702840,90000.0,1.0,378000000.0,59293.0,601239.290016
4,1 W MILL REEF DR,210591110,65000.0,1.0,378000000.0,59293.0,601239.290016
8,10 N BACHMAN WASH RD,30823026B,294254.0,1.0,378000000.0,59293.0,601239.290016
9,10 N CENTRAL AVE,200440810,98000.0,1.0,378000000.0,59293.0,601239.290016
10,10 N CHERRY LN,501290850,12500.0,1.0,378000000.0,59293.0,601239.290016


In [6]:
required_columns = [
    'property_id', 'minsaleprice', 'maxsaleprice', 'mediansaleprice', 'averagesaleprice', 'property_street_address'
]

# Filter the required columns from your original DataFrame
housing_market_df = housingsales_df[required_columns]

# Remove any rows that contain NaN values in these important columns
housing_market_df.dropna(inplace=True)

# Rename the columns to match the 'dim_housingmarket' structure
rename_mapping = {
    'property_id': 'housing_id',
    'minsaleprice': 'minsaleprice',
    'maxsaleprice': 'maxsaleprice',
    'mediansaleprice': 'mediansaleprice',
    'averagesaleprice': 'averagesaleprice',
    'property_street_address': 'houseaddress'
}
housing_market_df.rename(columns=rename_mapping, inplace=True)

# The 'housing_market_df' is now your 'dim_housingmarket' dimension table
dim_housingmarket = housing_market_df

# Display the first few rows of 'dim_housingmarket' to check the structure
dim_housingmarket.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_market_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_market_df.rename(columns=rename_mapping, inplace=True)


Unnamed: 0,housing_id,minsaleprice,maxsaleprice,mediansaleprice,averagesaleprice,houseaddress
3,210702840,1.0,378000000.0,59293.0,601239.290016,1 W CANYON ROCK RD
4,210591110,1.0,378000000.0,59293.0,601239.290016,1 W MILL REEF DR
8,30823026B,1.0,378000000.0,59293.0,601239.290016,10 N BACHMAN WASH RD
9,200440810,1.0,378000000.0,59293.0,601239.290016,10 N CENTRAL AVE
10,501290850,1.0,378000000.0,59293.0,601239.290016,10 N CHERRY LN


In [7]:
# Database connection URL
# Replace the placeholders with your actual database credentials
database_url = 'postgresql://group8:project9!@proj9.postgres.database.azure.com:5432/postgres'

# Create a SQLAlchemy engine
from sqlalchemy import create_engine
engine = create_engine(database_url)

In [8]:
# Assuming engine has been defined
engine = create_engine('postgresql://group8:project9!@proj9.postgres.database.azure.com:5432/postgres', echo=True)

# Your to_sql call
dim_housingmarket.to_sql('dim_housingmarket', con=engine, if_exists='append', index=False)

# Commit the transaction
engine.dispose()  # This ensures that all connections are closed and transactions committed

2024-05-07 02:14:28,369 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2024-05-07 02:14:28,371 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-07 02:14:28,415 INFO sqlalchemy.engine.Engine select current_schema()
2024-05-07 02:14:28,417 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-07 02:14:28,456 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2024-05-07 02:14:28,457 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-07 02:14:28,497 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-07 02:14:28,503 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2024-05-07 02:14:28,504 INFO sqlalchemy.engine.Engine [generated in 0.00131s] {'name': 'dim_housingmarket'}
2024-05-07 02:14:28,620 INFO sqlalchemy.engine.Engine INSERT INTO dim_housingmarket (housing_id, minsaleprice, maxsaleprice, mediansaleprice, averagesaleprice, houseaddress) VALUE

In [9]:
dim_housingmarket.to_csv("dim_housingmarket.csv",index=False)