# NYC Apartment Search



## Setup

In [1]:
import json
import pathlib
import urllib.parse

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db

#display all columns.
pd.set_option('display.max_columns', None)

from sqlalchemy.orm import declarative_base

In [2]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
DATA_311_FILE = DATA_DIR / "resource" / "erm2-nwe9.geojson"
DATA_TREE_FILE = DATA_DIR / "resource" / "5rq2-4hqu.geojson"
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "JkEVszrAdzm7qGLzLjvQVBnnM"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "FILL_ME_IN"
DB_USER = "FILL_ME_IN"
DB_URL = f"postgres+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"
# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [3]:
# Make sure the QUERY_DIRECTORY exists
if not QUERY_DIR.exists():
    QUERY_DIR.mkdir()

## Part 1: Data Preprocessing
The process of cleaning & filtering the data includes:
1. Removing unnecessary columns, and only keeping columns needed to answer questions;
2. Remove invalid data points;
3. Normalize column names & column types where needed;
4. Normalize the Spatial Reference Identifiers (SRID) of any geometry

In [4]:
def download_nyc_geojson_data(url, force=False):
    parsed_url = urllib.parse.urlparse(url)
    url_path = parsed_url.path.strip("/")
    
    filename = DATA_DIR / url_path
    
    if force or not filename.exists():
        filename.parent.mkdir(parents=True, exist_ok=True)
        print(f"Downloading {url} to {filename}...")
        
        # Download data using requests
        response = requests.get(url)
        
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Save the downloaded data as a GeoJSON file
            with open(filename, "w") as f:
                json.dump(response.json(), f)
            
            print(f"Done downloading {url}.")
        else:
            print(f"Failed to download {url}. Status code: {response.status_code}")

    else:
        print(f"Reading from {filename}...")

    return filename

In [5]:
def download_and_clean_311_data(download = True):
    '''
    https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9
    
    The complete data has the following columns:
    ['location_state', 'facility_type', 'intersection_street_2', 'city',
    'location_zip', 'park_borough', 'latitude', 'road_ramp', 'created_date',
    'agency', 'park_facility_name', 'location_address', 'agency_name',
    'descriptor', 'bbl', 'location_city', 'open_data_channel_type',
    'cross_street_2', 'bridge_highway_direction', 'longitude',
    'bridge_highway_segment', 'street_name', 'incident_address',
    'address_type', 'incident_zip', 'unique_key', 'complaint_type',
    'y_coordinate_state_plane', 'status', 'bridge_highway_name',
    'location_type', 'due_date', 'taxi_company_borough',
    'taxi_pick_up_location', 'x_coordinate_state_plane',
    'resolution_description', 'community_board',
    'resolution_action_updated_date', 'intersection_street_1',
    'closed_date', 'vehicle_type', 'cross_street_1', 'borough', 'landmark',
    'location']
       
    I only downloaded `created_date`, `descriptor`, `incident_zip`, `location` column from the database 
    where = created_date > '2022-01-01T00:00:00'.
    '''    
    # URL for getting the JSON file
    url = "https://data.cityofnewyork.us/resource/" + NYC_DATA_311+ "?$$app_token="+ NYC_DATA_APP_TOKEN + \
    "&$select= created_date, descriptor, incident_zip AS zipcode, location &$where=created_date>'2022-01-01T00:00:00' \
    &$limit=1000"
    
    # download and save the data
    if download: 
        download_nyc_geojson_data(url, force=True)
    
    # load the data.
    geodf_311_data = gpd.read_file(DATA_311_FILE)
    
    # drop rows where there is no zipcode
    geodf_311_data = geodf_311_data.dropna(subset=['zipcode'])
    
    #return the data object.
    return geodf_311_data

In [6]:
def download_and_clean_tree_data(download = True):
    '''
    https://dev.socrata.com/foundry/data.cityofnewyork.us/5rq2-4hqu
    
    The complete data has the following columns:
    ['nta', 'health', 'zipcode', 'latitude', 'nta_name', 'state',
    'trnk_wire', 'y_sp', 'brnch_othe', 'root_grate', 'tree_id', 'steward',
    'spc_common', 'trnk_other', 'x_sp', 'brnch_ligh', 'problems',
    'longitude', 'boro_ct', 'zip_city', 'spc_latin', 'stump_diam',
    'boroname', 'st_senate', 'user_type', 'status', 'brnch_shoe',
    'curb_loc', 'cncldist', 'guards', 'st_assem', 'cb_num', 'address',
    'sidewalk', 'root_other', 'created_at', 'borocode', 'block_id',
    'trnk_light', 'tree_dbh', 'root_stone', 'the_geom']
       
    I only downloaded `zipcode` and `the_geom` column from the database.
    '''
    # URL for getting the JSON file
    url = "https://data.cityofnewyork.us/resource/" + NYC_DATA_TREES+ "?$$app_token="+ NYC_DATA_APP_TOKEN + \
    "&$select= zipcode,the_geom&$limit=1000"
    # download and save the data
    if download:
        download_nyc_geojson_data(url, force=True)
    
    # load and the data.
    geodf_tree_data = gpd.read_file(DATA_TREE_FILE)
    
    # drop rows where there is no zipcode
    geodf_tree_data = geodf_tree_data.dropna(subset=['zipcode'])

    #return the data object.
    return geodf_tree_data 

In [7]:
def load_and_clean_zipcodes(zipcode_datafile):
    '''
    `zipcode_datafile` has following columns:
    ['ZIPCODE', 'BLDGZIP', 'PO_NAME', 'POPULATION', 'AREA', 'STATE', 'COUNTY', 
     'ST_FIPS', 'CTY_FIPS', 'URL', 'SHAPE_AREA', 'SHAPE_LEN', 'geometry']    
    '''
    
    geodf_zipcode_data = gpd.read_file(zipcode_datafile)
    
    column_names = geodf_zipcode_data.columns
    # change all column names to lowercase.
    geodf_zipcode_data.columns = [element.lower() for element in column_names]
    
    # drop unnecessary columns
    geodf_zipcode_data = geodf_zipcode_data.drop(columns = ["bldgzip","shape_area","shape_len","url"])
    
    return geodf_zipcode_data

In [8]:
def load_and_clean_zillow_data(zillow_datafile):
    df_zillow_data = pd.read_csv(zillow_datafile)
    
    # rename `RegionName` as `zipcode`
    df_zillow_data.rename(columns={'RegionName': 'zipcode'}, inplace=True)
    
    # drop unnecessary columns
    df_zillow_data = df_zillow_data.drop(columns = ["RegionID","SizeRank","RegionType","StateName"])
    
    return df_zillow_data

In [9]:
# If download == True, the program will download the data from the internet and cover the old data.
# For the first time runing this program, turn it True.
def load_all_data(download = True):
    geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    geodf_311_data = download_and_clean_311_data(download)
    geodf_tree_data = download_and_clean_tree_data(download)
    df_zillow_data = load_and_clean_zillow_data(ZILLOW_DATA_FILE)
    return (
        geodf_zipcode_data,
        geodf_311_data,
        geodf_tree_data,
        df_zillow_data
    )

In [10]:
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data(True)

Downloading https://data.cityofnewyork.us/resource/erm2-nwe9.geojson?$$app_token=JkEVszrAdzm7qGLzLjvQVBnnM&$select= created_date, descriptor, incident_zip AS zipcode, location &$where=created_date>'2022-01-01T00:00:00'     &$limit=1000 to data\resource\erm2-nwe9.geojson...
Done downloading https://data.cityofnewyork.us/resource/erm2-nwe9.geojson?$$app_token=JkEVszrAdzm7qGLzLjvQVBnnM&$select= created_date, descriptor, incident_zip AS zipcode, location &$where=created_date>'2022-01-01T00:00:00'     &$limit=1000.
Downloading https://data.cityofnewyork.us/resource/5rq2-4hqu.geojson?$$app_token=JkEVszrAdzm7qGLzLjvQVBnnM&$select= zipcode,the_geom&$limit=1000 to data\resource\5rq2-4hqu.geojson...
Done downloading https://data.cityofnewyork.us/resource/5rq2-4hqu.geojson?$$app_token=JkEVszrAdzm7qGLzLjvQVBnnM&$select= zipcode,the_geom&$limit=1000.


In [11]:
for row in geodf_zipcode_data.iterrows():
    print(row)

(0, zipcode                                                   11436
po_name                                                 Jamaica
population                                              18681.0
area                                            22699295.459415
state                                                        NY
county                                                   Queens
st_fips                                                      36
cty_fips                                                    081
geometry      POLYGON ((1038098.2518714815 188138.3800067156...
Name: 0, dtype: object)
(1, zipcode                                                   11213
po_name                                                Brooklyn
population                                              62426.0
area                                            29631004.437939
state                                                        NY
county                                                    Kings
st_fips 

In [12]:
geodf_zipcode_data

Unnamed: 0,zipcode,po_name,population,area,state,county,st_fips,cty_fips,geometry
0,11436,Jamaica,18681.0,2.269930e+07,NY,Queens,36,081,"POLYGON ((1038098.252 188138.380, 1038141.936 ..."
1,11213,Brooklyn,62426.0,2.963100e+07,NY,Kings,36,047,"POLYGON ((1001613.713 186926.440, 1002314.243 ..."
2,11212,Brooklyn,83866.0,4.197210e+07,NY,Kings,36,047,"POLYGON ((1011174.276 183696.338, 1011373.584 ..."
3,11225,Brooklyn,56527.0,2.369863e+07,NY,Kings,36,047,"POLYGON ((995908.365 183617.613, 996522.848 18..."
4,11218,Brooklyn,72280.0,3.686880e+07,NY,Kings,36,047,"POLYGON ((991997.113 176307.496, 992042.798 17..."
...,...,...,...,...,...,...,...,...,...
258,10310,Staten Island,25003.0,5.346328e+07,NY,Richmond,36,085,"POLYGON ((950767.507 172848.969, 950787.510 17..."
259,11693,Far Rockaway,11052.0,3.497516e+06,NY,Kings,36,047,"POLYGON ((1028453.995 167153.410, 1027813.010 ..."
260,11249,Brooklyn,28481.0,1.777221e+07,NY,Kings,36,047,"POLYGON ((995877.318 203206.075, 995968.511 20..."
261,10162,New York,0.0,2.103489e+04,NY,New York,36,061,"POLYGON ((997731.761 219560.922, 997641.948 21..."


In [13]:
# Show basic info about each dataframe
geodf_zipcode_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   zipcode     263 non-null    object  
 1   po_name     263 non-null    object  
 2   population  263 non-null    float64 
 3   area        263 non-null    float64 
 4   state       263 non-null    object  
 5   county      263 non-null    object  
 6   st_fips     263 non-null    object  
 7   cty_fips    263 non-null    object  
 8   geometry    263 non-null    geometry
dtypes: float64(2), geometry(1), object(6)
memory usage: 18.6+ KB


In [14]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.head()

Unnamed: 0,zipcode,po_name,population,area,state,county,st_fips,cty_fips,geometry
0,11436,Jamaica,18681.0,22699300.0,NY,Queens,36,81,"POLYGON ((1038098.252 188138.380, 1038141.936 ..."
1,11213,Brooklyn,62426.0,29631000.0,NY,Kings,36,47,"POLYGON ((1001613.713 186926.440, 1002314.243 ..."
2,11212,Brooklyn,83866.0,41972100.0,NY,Kings,36,47,"POLYGON ((1011174.276 183696.338, 1011373.584 ..."
3,11225,Brooklyn,56527.0,23698630.0,NY,Kings,36,47,"POLYGON ((995908.365 183617.613, 996522.848 18..."
4,11218,Brooklyn,72280.0,36868800.0,NY,Kings,36,47,"POLYGON ((991997.113 176307.496, 992042.798 17..."


In [15]:
geodf_311_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 993 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   created_date  993 non-null    datetime64[ns]
 1   descriptor    993 non-null    object        
 2   zipcode       993 non-null    object        
 3   geometry      983 non-null    geometry      
dtypes: datetime64[ns](1), geometry(1), object(2)
memory usage: 38.8+ KB


In [16]:
geodf_311_data.head()

Unnamed: 0,created_date,descriptor,zipcode,geometry
0,2023-12-04 01:32:00,Banging/Pounding,10027,POINT (-73.94391 40.81288)
1,2023-12-04 01:31:34,Loud Music/Party,10304,POINT (-74.08298 40.61289)
2,2023-12-04 01:30:28,Other,11201,POINT (-73.99771 40.68768)
3,2023-12-04 01:30:14,Other,11357,POINT (-73.80174 40.78972)
5,2023-12-04 01:29:17,Banging/Pounding,10033,POINT (-73.93846 40.84853)


In [17]:
geodf_tree_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   zipcode   1000 non-null   object  
 1   geometry  1000 non-null   geometry
dtypes: geometry(1), object(1)
memory usage: 15.8+ KB


In [18]:
geodf_tree_data.head()

Unnamed: 0,zipcode,geometry
0,11375,POINT (-73.84422 40.72309)
1,11357,POINT (-73.81868 40.79411)
2,11211,POINT (-73.93661 40.71758)
3,11211,POINT (-73.93446 40.71354)
4,11215,POINT (-73.97598 40.66678)


In [19]:
df_zillow_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6722 entries, 0 to 6721
Columns: 110 entries, zipcode to 2023-09-30
dtypes: float64(105), int64(1), object(4)
memory usage: 5.6+ MB


In [20]:
df_zillow_data.head()

Unnamed: 0,zipcode,State,City,Metro,CountyName,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,2015-06-30,2015-07-31,2015-08-31,2015-09-30,2015-10-31,2015-11-30,2015-12-31,2016-01-31,2016-02-29,2016-03-31,2016-04-30,2016-05-31,2016-06-30,2016-07-31,2016-08-31,2016-09-30,2016-10-31,2016-11-30,2016-12-31,2017-01-31,2017-02-28,2017-03-31,2017-04-30,2017-05-31,2017-06-30,2017-07-31,2017-08-31,2017-09-30,2017-10-31,2017-11-30,2017-12-31,2018-01-31,2018-02-28,2018-03-31,2018-04-30,2018-05-31,2018-06-30,2018-07-31,2018-08-31,2018-09-30,2018-10-31,2018-11-30,2018-12-31,2019-01-31,2019-02-28,2019-03-31,2019-04-30,2019-05-31,2019-06-30,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31,2020-04-30,2020-05-31,2020-06-30,2020-07-31,2020-08-31,2020-09-30,2020-10-31,2020-11-30,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30,2021-07-31,2021-08-31,2021-09-30,2021-10-31,2021-11-30,2021-12-31,2022-01-31,2022-02-28,2022-03-31,2022-04-30,2022-05-31,2022-06-30,2022-07-31,2022-08-31,2022-09-30,2022-10-31,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31,2023-09-30
0,77494,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,1606.206406,1612.779844,1622.201575,1630.392427,1632.4115,1636.206864,1644.894632,1643.390762,1636.971443,1620.756709,1613.330035,1595.875303,1587.956334,1571.722899,1580.226011,1577.360511,1575.019249,1570.437252,1560.570981,1559.47386,1550.038637,1532.579831,1510.242213,1486.489733,1507.802059,1533.972404,1560.7754,1560.921517,1569.89826,1578.146822,1578.913743,1567.92173,1577.355514,1590.264504,1606.49932,1604.421318,1611.899807,1618.293318,1621.761808,1617.780728,1614.801931,1615.532634,1611.448067,1601.793861,1590.63871,1590.488945,1594.393947,1598.27526,1604.520456,1610.434841,1621.77068,1625.351916,1626.773326,1625.936662,1624.890088,1633.134397,1625.910897,1629.721928,1626.974655,1636.612419,1640.770934,1647.007094,1651.109647,1657.09101,1649.138584,1645.635867,1637.868714,1649.266605,1662.133844,1665.130638,1674.053593,1679.23915,1688.863037,1690.82975,1692.870693,1713.964591,1744.244031,1792.32527,1827.718052,1856.436987,1872.532636,1884.990347,1910.473392,1917.549543,1930.611286,1926.224851,1936.273593,1950.552503,1975.971322,1999.610405,2023.300973,2024.379276,2018.99937,2006.725802,1990.684558,1994.653463,2027.438438,2042.237444,2049.325559,2016.531345,2023.438976,2031.558202,2046.144009,2053.486247,2055.771355
1,77449,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,1257.81466,1255.268025,1262.170452,1274.955754,1285.526052,1295.665673,1296.650395,1300.868081,1301.898486,1302.881427,1299.69312,1296.038652,1288.469114,1287.887883,1296.983247,1310.096909,1316.314107,1308.568281,1300.912474,1295.072347,1292.6691,1288.731946,1279.219942,1270.826144,1270.159851,1280.804521,1290.962297,1297.787179,1299.429207,1305.319483,1313.028808,1314.471319,1318.936586,1318.360343,1325.90422,1327.061006,1338.24246,1342.947774,1346.339152,1347.977879,1353.781015,1360.934194,1361.964965,1362.711392,1358.947439,1356.220524,1356.988632,1353.259235,1350.062265,1348.472443,1354.870417,1359.648555,1367.652345,1372.060549,1380.044029,1377.19268,1378.572673,1375.046953,1372.52441,1373.553574,1381.286565,1390.962381,1402.096689,1398.546295,1390.275169,1390.30116,1403.523253,1422.490142,1432.232355,1431.127611,1437.327132,1448.187834,1458.760896,1465.848888,1471.032905,1490.807492,1527.079852,1565.896231,1605.220723,1614.231294,1619.765081,1631.133342,1636.650862,1656.26201,1650.7459,1677.114958,1697.24042,1720.421368,1698.930423,1697.851631,1735.015389,1802.088342,1802.92843,1790.160022,1752.95085,1749.6979,1738.217986,1747.30584,1758.407295,1758.891075,1762.980879,1771.751591,1779.338402,1795.384582,1799.63114
2,77084,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,,,,,,,,,,,,,,,,,1258.992453,1245.886191,1250.536501,1260.4657,1265.29396,1262.405513,1256.426032,1254.017929,1251.279571,1252.41533,1265.417047,1269.946742,1278.865848,1281.32299,1288.139658,1290.218035,1306.162183,1317.66513,1328.73002,1318.40446,1314.769717,1310.449048,1309.677326,1317.968447,1322.516778,1331.34293,1326.393494,1331.231751,1334.985286,1333.47968,1341.995595,1341.38147,1345.813983,1332.54079,1343.33446,1356.184704,1375.943091,1374.238193,1377.990761,1376.000125,1372.755567,1366.611062,1367.69657,1373.615421,1377.077377,1381.897488,1390.230067,1389.363898,1392.952961,1397.44194,1408.913227,1415.327187,1424.39311,1436.157314,1431.401888,1430.485593,1427.534174,1435.367022,1441.842435,1454.142026,1491.939426,1523.871978,1574.288429,1619.443497,1653.536117,1649.885981,1633.710237,1637.96809,1654.718249,1672.401556,1680.802806,1694.173809,1710.491724,1729.026562,1744.102197,1726.403795,1722.541038,1701.065668,1710.787785,1701.21752,1706.900064,1706.067787,1723.72232,1735.48467,1752.132904,1756.990323,1754.429516,1757.602011,1755.03149
3,79936,TX,El Paso,"El Paso, TX",El Paso County,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1210.87192,1213.69536,1218.649801,1221.891992,1248.921013,1268.808951,1296.674594,1316.499497,1306.783912,1300.582784,1307.339213,1328.543901,1353.34843,1361.390437,1390.876445,1407.381591,1427.46541,1409.828076,1417.572363,1419.480272,1458.063897,1471.726681,1466.734658,1456.17566,1462.478506,1466.267391,1490.237063,1488.180414,1494.366097
4,11385,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,,2087.527084,,2149.924252,2166.263698,2148.992886,2190.098591,2264.966715,2297.900917,2267.061705,2225.89006,2188.287674,2189.347039,2255.283771,2267.36975,2296.999953,2272.544134,2282.492009,2313.009451,2334.460178,2378.910449,2397.294201,2357.959209,2338.575627,2319.170887,2344.361312,2345.076633,2333.906767,2343.395747,2332.22853,2351.64316,2346.760615,2391.448235,2366.975375,2376.814118,2349.812499,2340.667651,2309.494964,2311.390228,2326.918687,2358.472429,2367.456439,2382.384671,2386.107852,2401.525193,2419.898399,2420.804474,2420.254366,2400.882444,2419.392453,2407.537302,2422.886151,2405.459043,2456.568393,2484.302054,2500.881705,2489.493771,2483.15143,2506.192072,2503.48836,2508.742408,2534.247263,2530.346246,2503.77244,2488.180225,2480.87099,2501.785904,2449.236462,2418.685742,2390.021781,2369.10735,2340.768882,2327.381365,2311.684517,2319.512276,2301.040323,2336.068945,2372.07687,2424.346279,2465.115046,2511.387795,2562.603143,2590.907331,2611.770396,2615.215008,2619.285781,2627.976369,2676.179398,2745.586931,2847.533756,2911.446285,2985.851427,3003.59857,3015.214534,2978.853757,2935.80822,2895.699421,2873.209025,2881.906361,2913.546218,2963.964134,3005.735342,3034.413822,3064.476503,3079.585783


## Part 2: Storing Data

In [21]:
!createdb NYC_database

Before database creation
^C
After database creation


In [22]:
!psql --dbname NYC_database -c 'CREATE EXTENSION postgis;'

^C


### Creating Tables


These are just a couple of options to creating your tables; you can use one or the other, a different method, or a combination.

In [23]:
username = "postgres"
password = "Km1987865190"
DB_URL = f"postgresql://{username}:{password}@localhost:5432/NYC_database"
engine = db.create_engine(DB_URL)

#### SQLAlchemy

In [24]:
from sqlalchemy import Column, Integer, String, Float, DateTime
from geoalchemy2 import Geometry
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine

Base = declarative_base()

class Zipcode(Base):
    __tablename__ = "zipcodes"

    id = Column(Integer,primary_key=True)
    zipcode = Column(Integer)
    po_name = Column(String)
    population = Column(Float)
    area = Column(Float)
    state = Column(String)
    county = Column(String)
    st_fips = Column(Integer)
    cty_fips = Column(String)
    geometry = Column(Geometry("Point",srid=4326))

class Complaint(Base):
    __tablename__ = "complaints"

    id = Column(Integer,primary_key=True)
    created_date = Column(DateTime)
    descriptor = Column(String)
    zipcode = Column(Integer)
    geometry = Column(Geometry("Point",srid=4326))

class Tree(Base):
    __tablename__ = "trees"

    id = Column(Integer,primary_key=True)
    zipcode = Column(Integer)
    geometry = Column(Geometry("Point",srid=4326))

class Zillow(Base):
    __tablename__ = "houses"

    id = Column(Integer,primary_key=True)
    zipcode = Column(Integer)
    state = Column(String)
    city = Column(String)
    metro = Column(String)
    county = Column(String)
    rent = Column(Float)

with open("schema.sql","w") as file:
    file.write(Base.metadata.create_all(bind=create_engine(DB_URL)))

OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: 致命错误:  数据库 "NYC_database" 不存在

(Background on this error at: https://sqlalche.me/e/14/e3q8)

SQLAlchemy

In [None]:
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [4]:
for row in geodf_zipcode_data.iterrows():
    zipcode = Zipcode(zipcode=row[1]["zipcode"],
                      po_name=row[1]["po_name"],
                      population=row[1]["population"],
                      area=row[1]["area"],
                      state=row[1]["state"],
                      county=row[1]["county"],
                      st_fips=row[1]["st_fips"],
                      cty_fips=row[1]["cty_fips"],
                      geometry=row[1]["geometry"]
                      )
    session.add(zipcode)
for row in geodf_311_data.iterrows():
    complaint = Complaint(created_date=row[1]["created_date"],
                          descriptor=row[1]["descriptor"],
                          zipcode=row[1]["zipcode"],
                          geometry=row[1]["geometry"]
                          )
    session.add(complaint)
for row in geodf_tree_data.iterrows():
    tree = Tree(zipcode=row[1]["zipcode"],
                geometry=row[1]["geometry"]
                )
    session.add(tree)
for row in df_zillow_data.iterrows():
    house = Zillow(zipcode=row[1]["zipcode"],
                   state=row[1]["State"],
                   city=row[1]["City"],
                   metro=row[1]["Metro"],
                   county=row[1]["CountyName"],
                   rent = row[1]["rent"]
                   )
    session.add(house)

NameError: name 'geodf_zipcode_data' is not defined

In [None]:
session.commit()

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    with open (outfile,"w") as file:
        file.write(query)

In [None]:
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
SELECT
    z.zipcode,
    COUNT(c.id) AS number_of_complaints
FROM
    zipcodes z
LEFT JOIN
    complaints c ON z.zipcode = c.zipcode
WHERE
    c.created_date >= "2022-10-01" AND c.created_date <= "2023-09-30"
GROUP BY
    z.zipcodes
ORDER BY
    number_of_complaints DESC;
"""

In [None]:
QUERY_2_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
SELECT
    zipcode,
    COUNT(*) AS total_trees
FROM
    trees
GROUP BY
    zipcode
ORDER BY
    total_trees DESC
LIMIT 10;
"""

In [None]:
QUERY_3_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_3 = """
SELECT
    t.zipcode,
    FORMAT(AVG(r.rent),2) AS average_rent
FROM
    trees t
JOIN
    rents r ON t.zipcode = r.zipcode
WHERE
    EXTRACT(MONTH FROM r.date) = 8 AND EXTRACT(YEAR FROM r.date) = 2023
GROUP BY
    t.zipcode
ORDER BY
    COUNT(*) DESC
LIMIT 10;
"""

In [None]:
QUERY_4_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_4 = """
WITH RentTreeCounts AS (
    SELECT
        r.zipcode,
        FORMAT(AVG(r.rent), 2) AS average_rent,
        COUNT(t.zipcode) AS tree_count
    FROM
        rents r
    LEFT JOIN
        trees t ON r.zipcode = t.zipcode
    WHERE
        EXTRACT(MONTH FROM r.date) = 1 AND EXTRACT(YEAR FROM r.date) = 2023
    GROUP BY
        r.zipcode
),
ComplaintCounts AS (
    SELECT
        c.zipcode,
        COUNT(*) AS complaint_count
    FROM
        complaints c
    WHERE
        EXTRACT(MONTH FROM c.created_date) = 1 AND EXTRACT(YEAR FROM c.created_date) = 2023
    GROUP BY
        c.zipcode
)

SELECT
    rtc.zipcode,
    rtc.average_rent,
    rtc.tree_count,
    COALESCE(cc.complaint_count, 0) AS complaint_count
FROM
    RentTreeCounts rtc
LEFT JOIN
    ComplaintCounts cc ON rtc.zipcode = cc.zipcode

ORDER BY
    CAST(rtc.average_rent AS DECIMAL(10,2)) ASC, -- Sorting by rent in ascending order
    rtc.zipcode
LIMIT 5

UNION ALL

SELECT
    rtc.zipcode,
    rtc.average_rent,
    rtc.tree_count,
    COALESCE(cc.complaint_count, 0) AS complaint_count
FROM
    RentTreeCounts rtc
LEFT JOIN
    ComplaintCounts cc ON rtc.zipcode = cc.zipcode

ORDER BY
    CAST(rtc.average_rent AS DECIMAL(10,2)) DESC, -- Sorting by rent in descending order
    rtc.zipcode
LIMIT 5;
"""

In [None]:
QUERY_5_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_5 = """
SELECT
    z.zipcode,
    COUNT(t.id) AS total_trees
FROM
    zipcodes z
JOIN
    trees t ON ST_Within(t.geometry, z.geometry)
GROUP BY
    z.zipcode
ORDER BY
    total_trees DESC
LIMIT 10;
"""

In [None]:
QUERY_6_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_6 = """
WITH Point AS (
    SELECT ST_SetSRID(ST_MakePoint(-73.96253174434912, 40.80737875669467), 4326) AS geom
)

SELECT
    id,
    species,
    health,
    status,
    ST_AsText(geometry) AS coordinate_location
FROM
    trees
WHERE
    ST_DWithin(geometry, (SELECT geom FROM Point), 804.672);
"""

In [None]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_1))
    for row in result:
        print(row)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query your database for the data needed.
    # You can put the data queried into a pandas/geopandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)