# Tools for Analytics Final Project ###

Jaeseop Shin / js6364  &. Hyunjin Jun / hj2642

## Environment Setup

In [1]:
# Importing all libraries used for the project

import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import geopandas as gpd
import requests
import psycopg2



from bs4 import BeautifulSoup
from sodapy import Socrata
from sqlalchemy import create_engine

# SQL Data
db_username = ""

## Part 1. Data Processing

### Data Setup

In [2]:
app_token = "2Hn2wwabCLXVYhGN4b9tEtJ11"

# Downloading NYC 311 data
url_311 = "data.cityofnewyork.us"
set_311 = "erm2-nwe9"
client_311 = Socrata(url_311, app_token)
client_311.timeout = 60
# Filter
where_311 = "date_extract_y(created_date)>=2015"
# Data to CSV
data_311 = client_311.get(set_311, where=where_311, limit=2000)
df_311 = pd.DataFrame.from_records(data_311)
#df_311.to_csv("nyc_311.csv")

# Downloading NYC Tree data
url_tree = "data.cityofnewyork.us"
set_tree = "uvpi-gqnh"
client_tree = Socrata(url_tree, app_token)
client_tree.timeout = 60
# Filter
where_tree = "filter_conditions"
select_tree = "filter_conditions"
# Data to CSV
data_tree = client_tree.get(set_tree, where=where_tree, select=select_tree, limit=2000)
df_tree = pd.DataFrame.from_records(data_tree)
#df_tree.to_csv("nyc_tree.csv")


rent_path = "/Users/jin/data/zillow_rent_data.csv"          # Should be altered by users
df_rent = pd.read_csv(rent_path)
nyc_zipcodes_shp_path = "/Users/jin/data/nyc_zipcodes.shp"       # Should be altered by users
gdf_zipcode = gpd.read_file(nyc_zipcodes_shp_path)

# Coordiante Reference System
CRS = 4326

HTTPError: 400 Client Error: Bad Request.
	Query coordinator error: query.soql.no-such-column; No such column: filter_conditions; position: Map(row -> 1, column -> 8, line -> "SELECT `filter_conditions` WHERE `filter_conditions` LIMIT 2000\n       ^")

### Data Cleaning & Filtering 

In [None]:
def clean_filter(data_frame, column_remove, column_rename, column_type):
    """Removes unnecessary columns, rename columns, confirm types of columns"""
    data_frame.drop("column to remove", axis=1, inplace=True)
    data_frame.rename(columns=column_rename, inplace=True)
    for column, dtype in column_type.items():
        if dtype == 'datetime64':
            data_frame[column] = pd.to_datetime(data_frame[column], errors='coerce')
        else:
            data_frame[column] = data_frame[column].astype(dtype, errors='coerce')

In [None]:
# NYC 311 Cleaning & Filtering
# Columns to remove
remove_311 = [
    'Closed Date',
    'Agency',
    'Agency Name',
    'Descriptor',
    'Location Type',
    'Incident Address',
    'Street Name',
    'Cross Street 1',
    'Cross Street 2',
    'Intersection Street 1',
    'Intersection Street 2',
    'Address Type',
    'Landmark',
    'Facility Type',
    'Status',
    'Due Date',
    'Resolution Description',
    'Resolution Action Updated Date',
    'Community Board',
    'BBL',
    'Borough',
    'Open Data Channel Type',
    'Park Facility Name',
    'Park Borough',
    'Vehicle Type',
    'Taxi Company Borough',
    'Taxi Pick Up Location',
    'Bridge Highway Name',
    'Bridge Highway Direction',
    'Road Ramp',
    'Bridge Highway Segment'
]
# Columns to rename
rename_311 = {
    'Unique Key': 'complaint_id',
    'Created Date': 'date',
    'Complaint Type': 'complaint_type',
    'Incident Zip': 'zipcode',
    'City': 'city',
    'X Coordinate (State Plane)': 'x_coord',
    'Y Coordinate (State Plane)': 'y_coord',
    'Latitude': 'latitude',
    'Longitude': 'longitude',
    'Location': 'geometry'
}
# Column types
type_311 = {
    'Unique Key': 'INTEGER',
    'Created Date': 'DATETIME',
    'Complaint Type': 'TEXT',
    'Incident Zip': 'INTEGER',
    'City': 'TEXT',
    'X Coordinate (State Plane)': 'INTEGER',
    'Y Coordinate (State Plane)': 'INTEGER',
    'Latitude': 'FLOAT',
    'Longitude': 'FLOAT',
    'Location': 'TEXT'  #float...?
}

clean_filter(df_311, remove_311, rename_311, type_311)

In [None]:
# NYC Tree Cleaning & Filtering
# Columns to remove
remove_tree = [
    'created_at',
    'block_id',
    'tree_dbh',
    'stump_diam',
    'curb_loc',
    'spc_latin',
    'steward',
    'guards',
    'sidewalk',
    'user_type',
    'problems',
    'root_stone',
    'root_grate',
    'root_other',
    'trnk_wire',
    'trnk_light',
    'trnk_other',
    'brnch_ligh',
    'brnch_shoe',
    'brnch_othe',
    'address',   
    'cb_num',
    'borocode',
    'boroname',
    'cncldist',
    'st_assem',
    'st_senate',
    'nta',
    'nta_name',
    'boro_ct',
    'state',   
]
# Columns to rename
rename_tree = {
    'tree_id': 'tree_id',
    'the_geom': 'geometry',
    'status': 'status',
    'health': 'health',
    'spc_common': 'species',
    'zipcode': 'zipcode',
    'zip_city': 'city',
    'Latitude': 'latitude',
    'longitude': 'longitude'
    'x_sp': 'x_coord',
    'y_sp': 'y_coord',
}
# Column types
type_tree = {
    'tree_id': 'INTEGER',
    'the_geom': 'TEXT',
    'status': 'TEXT',
    'health': 'TEXT',
    'spc_common': 'TEXT',
    'zipcode': 'INTEGER',
    'zip_city': 'TEXT',
    'Latitude': 'FLOAT',
    'longitude': 'FLOAT',
    'x_sp': 'FLOAT',
    'y_sp': 'FLOAT',
}

clean_filter(df_tree, remove_tree, rename_tree, type_tree)

In [None]:
# Zillow Rent Cleaning & Filtering
# Columns to remove
remove_rent = [
    'RegionID',
    'SizeRank',
    'RegionType',
    'StateName',
    'Metro',
    'CountyName',
]
# Columns to rename
rename_rent = {
    'RegionName': 'zipcode',
    'State': 'state',
    'City': 'city',
}
# Column types
type_rent = {
    'RegionName': 'INTEGER',
    'State': 'TEXT',
    'city': 'TEXT'
}

# Convert date columns to datetime format
date_cols = pd.date_range('1/31/15', '9/30/23', freq='M')
df_rent[date_cols] = df_rent[date_cols].apply(pd.to_datetime, errors='coerce')

# Filter out columns between '09/31/2015' and '12/31/2022'
date_range_to_remove = pd.date_range('1/31/15', '9/30/22', freq='M')
df_rent = df_rent.drop(columns=date_range_to_remove, errors='ignore')

clean_filter(df_rent, remove_rent, rename_rent, type_rent)

In [None]:
# Zipcode Cleaning & Filtering
# Columns to remove
remove_gdf = [
    'BLDGZIP',
    'POPULATION',
    'AREA',
    'ST_FIPS',
    'CTY_FIPS',
    'URL',
    'SHAPE_AREA',
    'SHAPE_LEN',
]
# Columns to rename
rename_gdf = {
    'ZIPCODE': 'zipcode',
    'PO_NAME': 'city',
    'STATE': 'state',
    'COUNTY': 'county',
    'geometry': 'geometry',
}
# Column types
type_gdf = {
    'ZIPCODE': 'INTEGER',
    'PO_NAME': 'TEXT',
    'STATE': 'TEXT',
    'COUNTY': 'TEXT',
    'geometry': 'TEXT',
}

clean_filter(gdf_zipcode, remove_gdf, rename_gdf, type_gdf)