# Tools for Analytics Final Project ###

Jaeseop Shin / js6364 & Hyunjin Jun / hj2642

## Environment Setup

In [1]:
# Importing all libraries used for the project

import math
import os
import pathlib
import subprocess

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import requests
import seaborn as sns
import sqlalchemy as db
from bs4 import BeautifulSoup
from shapely.geometry import Point
from sodapy import Socrata
from sqlalchemy import create_engine

# SQL Data
db_username = "user_name"
db_name = "name"
# SQLAlchemy 1.4+ only recognises the "postgresql" dialect name; the legacy
# "postgres" spelling makes create_engine() fail to load the dialect.
db_url = f"postgresql+psycopg2://{db_username}@localhost/{db_name}"
db_schema = "schema.sql"
# Directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

# Make sure QUERY_DIR exists. exist_ok=True makes the cell safe to re-run
# without a separate existence check; parents=True keeps it working if
# QUERY_DIR is ever changed to a nested path.
QUERY_DIR.mkdir(parents=True, exist_ok=True)

## Part 1. Data Processing

### Data Setup

In [3]:
# Socrata app token, read from the environment so the credential is not
# stored in the notebook. When the variable is unset this is None and the
# client is created without a token.
app_token = os.environ.get("NYC_OPEN_DATA_APP_TOKEN")

# Maximum number of rows requested per dataset.
# NOTE(review): 20 looks like a development setting - raise it for the full analysis.
row_limit = 20

# Downloading NYC 311 data
url_311 = "data.cityofnewyork.us"
set_311 = "erm2-nwe9"
client_311 = Socrata(url_311, app_token)
client_311.timeout = 60
# Keep only complaints created in 2015 or later (filtered server-side)
where_311 = "date_extract_y(created_date)>=2015"
# Data to data frame
data_311 = client_311.get(set_311, where=where_311, limit=row_limit)
df_311 = pd.DataFrame.from_records(data_311)

# Downloading NYC Tree data
url_tree = "data.cityofnewyork.us"
set_tree = "uvpi-gqnh"
client_tree = Socrata(url_tree, app_token)
client_tree.timeout = 60
# Data to data frame
data_tree = client_tree.get(set_tree, limit=row_limit)
df_tree = pd.DataFrame.from_records(data_tree)

# Local inputs: the Zillow rent CSV and the NYC zipcode shapefile
data_dir = pathlib.Path("data")
rent_path = data_dir.joinpath("zillow_rent_data.csv")
nyc_zipcodes_shp_path = data_dir.joinpath("nyc_zipcodes.shp")

df_rent = pd.read_csv(rent_path)
# Read the shapefile and reproject it to EPSG:4326 in one step
gdf_zipcode = gpd.read_file(nyc_zipcodes_shp_path).to_crs("EPSG:4326")

In [None]:
# Preview the first rows of the raw 311 download
df_311.head()

### Data Cleaning & Filtering 

In [4]:
def clean_filter(data_frame, column_use, column_rename, column_type):
    """Select, rename and type-cast columns, discarding rows with missing data.

    Parameters
    ----------
    data_frame : pandas.DataFrame or geopandas.GeoDataFrame
        Raw input. It is not modified; a new frame is returned.
    column_use : list
        Column labels to keep, using the names found in ``data_frame``.
    column_rename : dict
        Mapping of original column name -> new column name.
    column_type : dict
        Mapping of (renamed) column name -> target dtype. The special value
        ``'datetime'`` parses the column with ``pd.to_datetime``; values that
        cannot be parsed become missing and their rows are dropped. Any other
        value is passed to ``Series.astype``.

    Returns
    -------
    A new frame holding only the selected columns, renamed and cast, with
    every row that had a missing value removed.

    Raises
    ------
    KeyError
        If a label in ``column_use`` or ``column_type`` is not present.
    ValueError, TypeError
        If a non-missing value cannot be cast to the requested dtype.
    """
    # Missing values are dropped BEFORE casting: astype('int') raises on them,
    # and astype('str') can turn them into the literal text 'nan', which a
    # later dropna() no longer recognises as missing.
    cleaned = data_frame[column_use].dropna().rename(columns=column_rename)
    for column, dtype in column_type.items():
        if dtype == 'datetime':
            cleaned[column] = pd.to_datetime(cleaned[column], errors='coerce')
        else:
            cleaned[column] = cleaned[column].astype(dtype, errors='raise')
    # Second pass removes rows whose dates could not be parsed (coerced to NaT).
    return cleaned.dropna()

In [11]:
# NYC 311 Cleaning & Filtering
# Columns to keep (every other column of the raw download is discarded)
use_311 = [
    'unique_key',
    'created_date',
    'complaint_type',
    'incident_zip',
    'city',
    'x_coordinate_state_plane',
    'y_coordinate_state_plane',
    'latitude',
    'longitude',
]
# Columns to rename
rename_311 = {
    'unique_key': 'complaint_id',
    'created_date': 'date',
    'complaint_type': 'complaint_type',
    'incident_zip': 'zipcode',
    'city': 'city',
    'x_coordinate_state_plane': 'x_coord',
    'y_coordinate_state_plane': 'y_coord',
    'latitude': 'latitude',
    'longitude': 'longitude',
}
# Column types
type_311 = {
    'complaint_id': 'int',
    'date': 'datetime',
    'complaint_type': 'str',
    'zipcode': 'int',
    'city': 'str',
    'x_coord': 'int',
    'y_coord': 'int',
    'latitude': 'float',
    'longitude': 'float',
}

df_311_f = clean_filter(df_311, use_311, rename_311, type_311)

# Build the point geometries from longitude/latitude in one vectorised call
# instead of constructing a shapely Point row by row.
df_311_f['geometry'] = gpd.points_from_xy(df_311_f['longitude'], df_311_f['latitude'])

# Converting DataFrame to GeoDataFrame and declaring the coordinate
# reference system (EPSG:4326) at construction time
gdf_311_f = gpd.GeoDataFrame(df_311_f, geometry='geometry', crs="EPSG:4326")
gdf_311_f.head()

Unnamed: 0,complaint_id,date,complaint_type,zipcode,city,x_coord,y_coord,latitude,longitude,geometry
0,59489092,2023-11-19 12:00:00,Derelict Vehicles,10467,BRONX,1019674,259069,40.877688,-73.871904,POINT (-73.87190 40.87769)
1,59481977,2023-11-19 12:00:00,Derelict Vehicles,11429,QUEENS VILLAGE,1056274,199662,40.71441,-73.740192,POINT (-73.74019 40.71441)
2,59486111,2023-11-19 12:00:00,Derelict Vehicles,11213,BROOKLYN,1003596,181620,40.665161,-73.930266,POINT (-73.93027 40.66516)
3,59487110,2023-11-19 01:22:30,Encampment,10002,NEW YORK,987657,201406,40.719489,-73.987709,POINT (-73.98771 40.71949)
4,59481953,2023-11-19 01:22:20,Noise - Residential,11368,CORONA,1024433,212932,40.751034,-73.854971,POINT (-73.85497 40.75103)


In [12]:
# NYC Tree Cleaning & Filtering
# Columns to keep (every other column of the raw download is discarded)
use_tree = [
    'tree_id',
    'status',
    'health',
    'spc_common',
    'zipcode',
    'zip_city',
    'latitude',
    'longitude',
    'x_sp',
    'y_sp',
]
# Columns to rename
rename_tree = {
    'tree_id': 'tree_id',
    'status': 'status',
    'health': 'health',
    'spc_common': 'species',
    'zipcode': 'zipcode',
    'zip_city': 'city',
    'latitude': 'latitude',
    'longitude': 'longitude',
    'x_sp': 'x_coord',
    'y_sp': 'y_coord',
}
# Column types
type_tree = {
    'tree_id': 'int',
    'status': 'str',
    'health': 'str',
    'species': 'str',
    'zipcode': 'int',
    'city': 'str',
    'latitude': 'float',
    'longitude': 'float',
    'x_coord': 'float',
    'y_coord': 'float',
}

df_tree_f = clean_filter(df_tree, use_tree, rename_tree, type_tree)

# Build the point geometries from longitude/latitude in one vectorised call
# instead of constructing a shapely Point row by row.
df_tree_f['geometry'] = gpd.points_from_xy(df_tree_f['longitude'], df_tree_f['latitude'])

# Converting DataFrame to GeoDataFrame and declaring the coordinate
# reference system (EPSG:4326) at construction time
gdf_tree_f = gpd.GeoDataFrame(df_tree_f, geometry='geometry', crs="EPSG:4326")
gdf_tree_f.head()

Unnamed: 0,tree_id,status,health,species,zipcode,city,latitude,longitude,x_coord,y_coord,geometry
0,180683,Alive,Fair,red maple,11375,Forest Hills,40.723092,-73.844215,1027431.148,202756.7687,POINT (-73.84422 40.72309)
1,200540,Alive,Fair,pin oak,11357,Whitestone,40.794111,-73.818679,1034455.701,228644.8374,POINT (-73.81868 40.79411)
2,204026,Alive,Good,honeylocust,11211,Brooklyn,40.717581,-73.936608,1001822.831,200716.8913,POINT (-73.93661 40.71758)
3,204337,Alive,Good,honeylocust,11211,Brooklyn,40.713537,-73.934456,1002420.358,199244.2531,POINT (-73.93446 40.71354)
4,189565,Alive,Good,American linden,11215,Brooklyn,40.666778,-73.975979,990913.775,182202.426,POINT (-73.97598 40.66678)


In [13]:
# Zillow Rent Cleaning & Filtering
# Columns dropped from the raw Zillow file
remove_rent = [
    'RegionID',
    'SizeRank',
    'RegionType',
    'StateName',
    'Metro',
    'CountyName',
]
# Original column name -> cleaned column name
rename_rent = {
    'RegionName': 'zipcode',
    'State': 'state',
    'City': 'city',
}
# Target dtypes of the three identifying columns
type_rent = {
    'zipcode': 'int',
    'state': 'str',
    'city': 'str'
}

# Drop the unused columns, rename the rest, then cast the identifying columns
df_rent_f = (
    df_rent
    .drop(columns=remove_rent)
    .rename(columns=rename_rent)
    .astype(type_rent)
)
# Every column from the fourth onwards is cast to float
value_columns = df_rent_f.columns[3:]
df_rent_f = df_rent_f.astype({label: 'float' for label in value_columns})
# Keep only rows without any missing value
df_rent_f = df_rent_f.dropna()
df_rent_f.head()

Unnamed: 0,zipcode,state,city,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,2015-06-30,2015-07-31,...,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31,2023-09-30
0,77494,TX,Katy,1606.206406,1612.779844,1622.201575,1630.392427,1632.4115,1636.206864,1644.894632,...,1994.653463,2027.438438,2042.237444,2049.325559,2016.531345,2023.438976,2031.558202,2046.144009,2053.486247,2055.771355
1,77449,TX,Katy,1257.81466,1255.268025,1262.170452,1274.955754,1285.526052,1295.665673,1296.650395,...,1749.6979,1738.217986,1747.30584,1758.407295,1758.891075,1762.980879,1771.751591,1779.338402,1795.384582,1799.63114
8,77433,TX,Cypress,1332.384333,1328.107408,1334.32529,1342.507107,1343.204774,1349.345048,1357.258039,...,1881.20455,1885.695935,1884.894986,1880.532012,1870.035369,1863.111029,1892.511066,1922.759295,1945.581823,1975.672556
15,11226,NY,New York,1944.609891,1971.608676,2044.189151,2061.734126,2076.492259,2057.344849,2075.751863,...,2695.164727,2680.6837,2676.791284,2697.414702,2706.080701,2726.381017,2753.966159,2770.403005,2785.320137,2762.435713
17,30044,GA,Lawrenceville,1157.569878,1168.554459,1173.937503,1176.423247,1180.259047,1183.618022,1193.861072,...,2125.698496,2113.96425,2170.556028,2160.008767,2191.894968,2181.78056,2161.711011,2117.99072,2119.319392,2055.476496


In [14]:
# Zipcode Cleaning & Filtering
# Original shapefile column name -> cleaned column name
rename_gdf = {
    'ZIPCODE': 'zipcode',
    'PO_NAME': 'city',
    'STATE': 'state',
    'COUNTY': 'county',
    'geometry': 'geometry',
}
# Columns to keep: exactly the columns being renamed, in the same order
use_gdf = list(rename_gdf)
# Target dtype of each cleaned column
type_gdf = {
    'zipcode': 'int',
    'city': 'str',
    'state': 'str',
    'county': 'str',
    'geometry': 'geometry',
}

gdf_zipcode_f = clean_filter(gdf_zipcode, use_gdf, rename_gdf, type_gdf)
gdf_zipcode_f.head()

Unnamed: 0,zipcode,city,state,county,geometry
0,11436,Jamaica,NY,Queens,"POLYGON ((-73.80585 40.68291, -73.80569 40.682..."
1,11213,Brooklyn,NY,Kings,"POLYGON ((-73.93740 40.67973, -73.93487 40.679..."
2,11212,Brooklyn,NY,Kings,"POLYGON ((-73.90294 40.67084, -73.90223 40.668..."
3,11225,Brooklyn,NY,Kings,"POLYGON ((-73.95797 40.67066, -73.95576 40.670..."
4,11218,Brooklyn,NY,Kings,"POLYGON ((-73.97208 40.65060, -73.97192 40.650..."


## Part 2. Storing Data

In [None]:
def setup_new_postgis_database(user_name, db_name):
    """Create a PostgreSQL database and enable the PostGIS extension in it.

    Runs the ``createdb`` and ``psql`` command-line tools. Arguments are
    passed as a list without going through a shell, so the user and database
    names are never interpreted as shell syntax and no shell-specific quoting
    is needed.

    Parameters
    ----------
    user_name : str
        PostgreSQL role used for both commands (``-U``).
    db_name : str
        Name of the database to create and extend.

    Returns
    -------
    None. A command that is missing or exits with a non-zero status is
    reported with a printed warning instead of an exception, so re-running
    the notebook when the database already exists does not abort it.
    """
    commands = [
        ["createdb", "-U", user_name, db_name],
        # IF NOT EXISTS keeps this step safe to re-run.
        ["psql", "-U", user_name, "--dbname", db_name,
         "-c", "CREATE EXTENSION IF NOT EXISTS postgis;"],
    ]
    for command in commands:
        try:
            completed = subprocess.run(command, check=False)
        except FileNotFoundError:
            print(f"Warning: command not found: {command[0]}")
            continue
        if completed.returncode != 0:
            print(f"Warning: `{' '.join(command)}` exited with status {completed.returncode}")

In [None]:
# Create the project database using the user and database names set in the configuration cell
setup_new_postgis_database(db_username, db_name)

### Creating Tables

In [None]:
# SQLAlchemy engine built from db_url; the cells below open connections from it
engine = create_engine(db_url)

In [None]:
# Defining the SQL statements to create the 311, tree, rent, and zipcode tables.
# Table names match the names used when the dataframes are written to the
# database, and the columns mirror the cleaned dataframes from Part 1.
# "311_complaints" starts with a digit, so it has to be double-quoted in SQL.
nyc_311_schema = """
CREATE TABLE IF NOT EXISTS "311_complaints" (
    complaint_id INTEGER PRIMARY KEY,
    date TIMESTAMP,
    complaint_type TEXT,
    zipcode INTEGER,
    city TEXT,
    x_coord INTEGER,
    y_coord INTEGER,
    latitude DOUBLE PRECISION,
    longitude DOUBLE PRECISION,
    geometry GEOMETRY(Point, 4326)
);
CREATE INDEX IF NOT EXISTS idx_311_complaints_geometry ON "311_complaints" USING GIST (geometry);
"""

# NOTE(review): assumes tree_id is unique per row - confirm against the data.
nyc_tree_schema = """
CREATE TABLE IF NOT EXISTS nyc_trees (
    tree_id INTEGER PRIMARY KEY,
    status TEXT,
    health TEXT,
    species TEXT,
    zipcode INTEGER,
    city TEXT,
    latitude DOUBLE PRECISION,
    longitude DOUBLE PRECISION,
    x_coord DOUBLE PRECISION,
    y_coord DOUBLE PRECISION,
    geometry GEOMETRY(Point, 4326)
);
CREATE INDEX IF NOT EXISTS idx_nyc_trees_geometry ON nyc_trees USING GIST (geometry);
"""

# The rent dataframe is wide (zipcode, state, city plus one column per month),
# so its DDL is left open; the placeholder is a SQL comment so that the
# generated schema file stays valid SQL.
nyc_rent_schema = """
-- TODO nyc_rents - table definition still to be written
"""

# No primary key is declared here because the uniqueness of zipcode in the
# shapefile has not been verified.
nyc_zipcode_schema = """
CREATE TABLE IF NOT EXISTS nyc_zipcodes (
    zipcode INTEGER,
    city TEXT,
    state TEXT,
    county TEXT,
    geometry GEOMETRY(Geometry, 4326)
);
CREATE INDEX IF NOT EXISTS idx_nyc_zipcodes_geometry ON nyc_zipcodes USING GIST (geometry);
"""

In [None]:
# Write the four table definitions, in order, to the schema.sql file
schema_parts = (
    nyc_311_schema,
    nyc_tree_schema,
    nyc_rent_schema,
    nyc_zipcode_schema,
)
with open(db_schema, "w") as f:
    f.write("".join(schema_parts))

In [None]:
# Executing the schema file to create the tables
with open(db_schema) as schema_file:
    schema_sql = schema_file.read()

# engine.begin() commits when the block succeeds and rolls back on error,
# so either all of the schema is applied or none of it.
with engine.begin() as connection:
    connection.execute(db.text(schema_sql))

### Adding Data to Database

In [None]:
def write_dataframes_to_table(tablename_to_dataframe, if_exists='replace'):
    """Insert each dataframe into the SQL table it is mapped to.

    Parameters
    ----------
    tablename_to_dataframe : dict
        Mapping of table name -> DataFrame or GeoDataFrame to store.
    if_exists : str, default 'replace'
        Passed through to the writer; what to do when the table already
        exists ('fail', 'replace' or 'append'). The default keeps the
        previous behaviour of replacing the table.

    Notes
    -----
    GeoDataFrames are written with ``to_postgis`` so that the geometry column
    is stored as a PostGIS geometry; plain ``to_sql`` cannot serialise shapely
    objects. ``to_postgis`` needs the GeoAlchemy2 package to be installed.
    All tables are written inside one transaction, committed only if every
    write succeeds.
    """
    with engine.begin() as connection:
        for table_name, df in tablename_to_dataframe.items():
            if isinstance(df, gpd.GeoDataFrame):
                df.to_postgis(table_name, con=connection, if_exists=if_exists, index=False)
            else:
                df.to_sql(table_name, con=connection, if_exists=if_exists, index=False)


In [None]:
# Dataframe to Table mapping: each key is the SQL table name under which the
# cleaned frame on the right is stored.
# NOTE(review): "311_complaints" starts with a digit, so hand-written SQL
# presumably has to double-quote it - confirm when writing the queries.
tablename_to_dataframe = {
    "nyc_zipcodes": gdf_zipcode_f,
    "311_complaints": gdf_311_f,
    "nyc_trees": gdf_tree_f,
    "nyc_rents": df_rent_f,
}

In [None]:
# Store every cleaned dataframe in its database table
write_dataframes_to_table(tablename_to_dataframe)

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Write the text of a SQL query to a file.

    Parameters
    ----------
    query : str
        SQL text to save.
    outfile : str or pathlib.Path
        Destination file. Missing parent directories are created and an
        existing file is overwritten.
    """
    destination = pathlib.Path(outfile)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(query, encoding="utf-8")

In [None]:
# NOTE(review): both values below are still template placeholders - the
# output file name and the SQL text need to be replaced with the real query.
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
FILL_ME_IN
"""

In [None]:
# Run Query 1 and print every returned row
with engine.connect() as conn:
    statement = db.text(QUERY_1)
    result = conn.execute(statement)
    for row in result:
        print(row)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# NOTE(review): template placeholder - rename the function to describe the
# visualization once it is implemented.
def plot_visual_1(dataframe):
    """Draw Visualization 1 on a single 20x10 figure and show it.

    NOTE(review): the body is still the template; ``dataframe`` is not read
    yet and the plotted values and title are placeholders.
    """
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # placeholder: pull the values to plot out of the dataframe
    
    # Placeholder call: both arguments still have to be replaced; a plot type
    # other than axes.plot may suit the data better.
    axes.plot(values, "...")
    # Placeholder title; axis labels and styling are not set yet.
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    """Return the data needed for Visualization 1.

    Not implemented yet: calling this raises NotImplementedError.
    """
    # Intended to query the database for the data needed; the result can be
    # put into a pandas/geopandas dataframe.
    raise NotImplementedError()

In [None]:
# Fetch the data for Visualization 1 and plot it.
# NOTE(review): this raises NotImplementedError until get_data_for_visual_1 is implemented.
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)