In [1]:
import json
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

In [2]:
# DoltHub repository coordinates used to build the SQL API URL.
owner = "dolthub"
database = "us-housing-prices-v2"  # DoltHub database (repository) name
branch = "main"


# Ask the DoltHub SQL API how many rows the `sales` table holds.
query = '''SELECT COUNT(*) FROM sales'''
res = requests.get(
    f'https://www.dolthub.com/api/v1alpha1/{owner}/{database}/{branch}',
    params={'q': query},
    timeout=120,  # requests waits forever by default; bound the wait instead
)
# Surface HTTP-level failures here rather than as a confusing KeyError below.
res.raise_for_status()

response = res.json()
rows = response.get('rows')
if not rows:
    # The API answered but the query produced no result rows; show the
    # payload so the cause is visible in the notebook output.
    raise RuntimeError(f"Count query returned no rows: {response}")

# The count is converted with int() before use.
total_count = int(rows[0]['COUNT(*)'])
print(total_count)

106642382


In [3]:
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]  # List of all states

# Upper bound on the number of rows requested per state.
ROWS_PER_STATE = 10000
# requests has no default timeout; bound each call so one stalled query
# cannot hang the whole loop.
REQUEST_TIMEOUT_SECONDS = 120

api_url = f'https://www.dolthub.com/api/v1alpha1/{owner}/{database}/{branch}'

# Rows collected across all states, plus the states that yielded nothing
# usable so the gaps are reported instead of silently missing from `df`.
all_results = []
failed_states = []

# Fetch data for each state. A failure for one state is logged and skipped
# so the remaining states are still attempted.
for state in states:
    # `state` comes from the fixed list above, never from user input.
    query = f"SELECT * FROM sales WHERE state = '{state}' LIMIT {ROWS_PER_STATE}"
    try:
        response = requests.get(api_url, params={'q': query}, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"Request failed for {state}: {exc}")
        failed_states.append(state)
        continue

    if response.status_code != 200:
        print(f"Failed to fetch data for {state}: {response.status_code} - {response.text}")  # Handle non-200 responses
        failed_states.append(state)
        continue

    try:
        data = response.json()
    except ValueError:
        # ValueError is the common base of the JSON decode errors that
        # Response.json() can raise across requests versions.
        print(f"Failed to decode JSON for {state}:", response.text)  # Print raw response text if JSON decoding fails
        failed_states.append(state)
        continue

    state_rows = data.get('rows') if isinstance(data, dict) else None
    if state_rows is None:
        # A 200 response without "rows" used to be dropped without any
        # message; report it so missing states are noticed.
        print(f"No 'rows' in response for {state}: {response.text[:500]}")
        failed_states.append(state)
        continue

    all_results.extend(state_rows)  # Append results from each state

if failed_states:
    print(f"No data collected for {len(failed_states)} state(s): {', '.join(failed_states)}")

# Convert list of dictionaries to DataFrame
df = pd.DataFrame(all_results)
print(df.shape)
print(df.head())

(30000, 43)
  state property_zip5 property_street_address   property_city property_county  \
0    AZ         85143       1 E CORAL BEAN DR  SAN TAN VALLEY           PINAL   
1    AZ         85143      1 E PEPPERGRASS PL  SAN TAN VALLEY           PINAL   
2    AZ         85173         1 N MESQUITE DR        SUPERIOR           PINAL   
3    AZ         85143      1 W CANYON ROCK RD  SAN TAN VALLEY           PINAL   
4    AZ         85143        1 W MILL REEF DR  SAN TAN VALLEY           PINAL   

  property_id        sale_datetime property_type sale_price seller_1_name  \
0   210572230  2021-04-20 00:00:00   RESIDENTIAL          0          None   
1   210571490  2020-09-16 00:00:00   RESIDENTIAL          0          None   
2   10526004A  2021-02-25 00:00:00          None          0          None   
3   210702840  2011-09-01 00:00:00   RESIDENTIAL      90000          None   
4   210591110  2009-08-14 00:00:00   RESIDENTIAL      65000          None   

   ... land_assessed_date seller_1_sta

In [4]:
# Azure Blob Storage configuration.
# The connection string is a secret, so it is read from the environment
# instead of being written into the notebook. It must be a plain string:
# BlobServiceClient.from_connection_string cannot accept a list.
connection_string = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')
if not connection_string:
    raise RuntimeError(
        "Set the AZURE_STORAGE_CONNECTION_STRING environment variable "
        "before running this cell."
    )
CONNECTION_STRING_AZURE_STORAGE = connection_string
CONTAINER_AZURE = 'housingsales'
blob_name = 'housingsales.csv'

# The upload below uses overwrite=True, so refuse to replace an existing
# blob with an empty file when the fetch step collected nothing.
if df.empty:
    raise ValueError("DataFrame is empty; refusing to overwrite the blob with no data.")

# Convert the DataFrame to CSV text. With no target path or buffer,
# DataFrame.to_csv returns the CSV as a string.
data = df.to_csv(index=False)

# Create the BlobServiceClient object
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING_AZURE_STORAGE)

# Get a blob client using the container name and blob name
blob_client = blob_service_client.get_blob_client(container=CONTAINER_AZURE, blob=blob_name)

# Upload the CSV data, replacing any existing blob of the same name
blob_client.upload_blob(data, overwrite=True)

print(f"Uploaded {blob_name} to Azure Blob Storage in container {CONTAINER_AZURE}.")

Uploaded housingsales.csv to Azure Blob Storage in container housingsales.
