# Tools for Analytics Final Project

Jaeseop Shin / js6364 & Hyunjin Jun / hj2642

## Environment Setup

In [None]:
# Importing all libraries used for the project

import math
import pathlib
import subprocess

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from shapely.geometry import Point
from sodapy import Socrata
from sqlalchemy import create_engine, text

# SQL Data
db_username = "user_name"
db_name = "name"
# SQLAlchemy's dialect name is "postgresql"; the legacy "postgres" alias was
# removed in SQLAlchemy 1.4 and makes create_engine() fail to load a dialect.
db_url = f"postgresql+psycopg2://{db_username}@localhost/{db_name}"
db_schema = "schema.sql"
# Directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

# Make sure QUERY_DIR exists (a no-op when the directory is already there)
QUERY_DIR.mkdir(exist_ok=True)

## Part 1. Data Processing

### Data Setup

In [None]:
# NOTE(review): the Socrata app token is hard-coded in the notebook; consider
# loading it from local configuration instead of committing it.
app_token = "2Hn2wwabCLXVYhGN4b9tEtJ11"


# Downloading NYC 311 data
url_311 = "data.cityofnewyork.us"
set_311 = "erm2-nwe9"
client_311 = Socrata(url_311, app_token)
client_311.timeout = 60
# Keep only records whose created_date falls in 2015 or later
where_311 = "date_extract_y(created_date)>=2015"
# Fetch the records and load them into a DataFrame
# NOTE(review): limit=20 caps the request at 20 rows — presumably a
# development setting; confirm before running the full analysis.
data_311 = client_311.get(set_311, where=where_311, limit=20)
df_311 = pd.DataFrame.from_records(data_311)
#df_311.to_csv("nyc_311.csv")

# Downloading NYC Tree data
url_tree = "data.cityofnewyork.us"
set_tree = "uvpi-gqnh"
client_tree = Socrata(url_tree, app_token)
client_tree.timeout = 60
# Fetch the records and load them into a DataFrame
# NOTE(review): limit=20 caps the request at 20 rows — presumably a
# development setting; confirm before running the full analysis.
data_tree = client_tree.get(set_tree, limit=20)
df_tree = pd.DataFrame.from_records(data_tree)
#df_tree.to_csv("nyc_tree.csv")

# Loading local data files from the "data" directory
data_dir = pathlib.Path("data")
# Zillow rent data (CSV)
rent_path = data_dir / "zillow_rent_data.csv"          
df_rent = pd.read_csv(rent_path)
# NYC zipcode boundaries (shapefile)
nyc_zipcodes_shp_path = data_dir / "nyc_zipcodes.shp"
gdf_zipcode = gpd.read_file(nyc_zipcodes_shp_path)
# Reproject to EPSG:4326, the same CRS assigned to the 311 and tree points
gdf_zipcode = gdf_zipcode.to_crs("EPSG:4326")

In [None]:
# Preview the first rows of the raw 311 download
df_311.head()

### Data Cleaning & Filtering 

In [None]:
def clean_filter(data_frame, column_use, column_rename, column_type):
    """Select, rename, and type-convert columns, then drop incomplete rows.

    Args:
        data_frame: Source DataFrame (or GeoDataFrame). It is not modified.
        column_use: List of column names to keep.
        column_rename: Mapping of original column name -> new column name.
        column_type: Mapping of (renamed) column name -> target dtype.
            'datetime' is parsed with ``pd.to_datetime``; 'int' and 'float'
            are parsed with ``pd.to_numeric``; any other value is passed
            straight to ``Series.astype``.

    Returns:
        A new frame with only the requested columns, where rows holding
        missing or unparseable date/numeric values have been removed.
    """
    # Work on an explicit copy: assigning into a column slice of the caller's
    # frame triggers SettingWithCopyWarning and may silently not take effect.
    cleaned = data_frame[column_use].rename(columns=column_rename).copy()

    numeric_casts = {}
    for column, dtype in column_type.items():
        if dtype == 'datetime':
            cleaned[column] = pd.to_datetime(cleaned[column], errors='coerce')
        elif dtype in ('int', 'float'):
            # Coerce first so blanks and garbage become NaN and are dropped
            # below, instead of aborting the whole conversion with an error.
            cleaned[column] = pd.to_numeric(cleaned[column], errors='coerce')
            numeric_casts[column] = dtype
        else:
            cleaned[column] = cleaned[column].astype(dtype, errors='raise')

    cleaned = cleaned.dropna()
    # With no NaN left, the final integer/float cast cannot fail on blanks.
    return cleaned.astype(numeric_casts)

In [None]:
# NYC 311 Cleaning & Filtering
# Columns to keep (every other column is discarded by clean_filter)
use_311 = [
    'unique_key',
    'created_date',
    'complaint_type',
    'incident_zip',
    'city',
    'x_coordinate_state_plane',
    'y_coordinate_state_plane',
    'latitude',
    'longitude',
]
# Columns to rename
rename_311 = {
    'unique_key': 'complaint_id',
    'created_date': 'date',
    'complaint_type': 'complaint_type',
    'incident_zip': 'zipcode',
    'city': 'city',
    'x_coordinate_state_plane': 'x_coord',
    'y_coordinate_state_plane': 'y_coord',
    'latitude': 'latitude',
    'longitude': 'longitude',
}
# Column types (keyed by the renamed column names)
type_311 = {
    'complaint_id': 'int',
    'date': 'datetime',
    'complaint_type': 'str',
    'zipcode': 'int',
    'city': 'str',
    'x_coord': 'int',
    'y_coord': 'int',
    'latitude': 'float',
    'longitude': 'float',
}

df_311_f = clean_filter(df_311, use_311, rename_311, type_311)

# Build point geometries in one vectorized call (x = longitude, y = latitude).
# Unlike a row-wise apply, this also works when the frame has no rows.
df_311_f['geometry'] = gpd.points_from_xy(df_311_f['longitude'], df_311_f['latitude'])

# Converting DataFrame to GeoDataFrame and setting the coordinate reference
# system (EPSG:4326) in the same step
gdf_311_f = gpd.GeoDataFrame(df_311_f, geometry='geometry', crs="EPSG:4326")
gdf_311_f.head()

In [None]:
# NYC Tree Cleaning & Filtering
# Columns to keep (every other column is discarded by clean_filter)
use_tree = [
    'tree_id',
    'status',
    'health',
    'spc_common',
    'zipcode',
    'zip_city',
    'latitude',
    'longitude',
    'x_sp',
    'y_sp',
]
# Columns to rename
rename_tree = {
    'tree_id': 'tree_id',
    'status': 'status',
    'health': 'health',
    'spc_common': 'species',
    'zipcode': 'zipcode',
    'zip_city': 'city',
    'latitude': 'latitude',
    'longitude': 'longitude',
    'x_sp': 'x_coord',
    'y_sp': 'y_coord',
}
# Column types (keyed by the renamed column names)
type_tree = {
    'tree_id': 'int',
    'status': 'str',
    'health': 'str',
    'species': 'str',
    'zipcode': 'int',
    'city': 'str',
    'latitude': 'float',
    'longitude': 'float',
    'x_coord': 'float',
    'y_coord': 'float',
}

df_tree_f = clean_filter(df_tree, use_tree, rename_tree, type_tree)

# Build point geometries in one vectorized call (x = longitude, y = latitude).
# Unlike a row-wise apply, this also works when the frame has no rows.
df_tree_f['geometry'] = gpd.points_from_xy(df_tree_f['longitude'], df_tree_f['latitude'])

# Converting DataFrame to GeoDataFrame and setting the coordinate reference
# system (EPSG:4326) in the same step
gdf_tree_f = gpd.GeoDataFrame(df_tree_f, geometry='geometry', crs="EPSG:4326")
gdf_tree_f.head()

In [None]:
# Zillow Rent Cleaning & Filtering
# Columns dropped from the raw Zillow table
remove_rent = [
    'RegionID',
    'SizeRank',
    'RegionType',
    'StateName',
    'Metro',
    'CountyName',
]
# Old name -> new name for the columns that are kept
rename_rent = {
    'RegionName': 'zipcode',
    'State': 'state',
    'City': 'city',
}
# Target dtypes for the three leading descriptive columns
type_rent = {
    'zipcode': 'int',
    'state': 'str',
    'city': 'str'
}

# Drop the unused columns and apply the new names in one chained expression
df_rent_f = df_rent.drop(columns=remove_rent).rename(columns=rename_rent)
# Enforce dtypes in a single pass: the descriptive columns as listed above,
# and every column from the fourth position onward as float
float_types = dict.fromkeys(df_rent_f.columns[3:], 'float')
df_rent_f = df_rent_f.astype({**type_rent, **float_types}, errors='raise')
# Discard every row that still contains a missing value
df_rent_f = df_rent_f.dropna()
df_rent_f.head()

In [None]:
# Zipcode Cleaning & Filtering
# Columns to keep (clean_filter selects exactly these)
use_gdf = [
    'ZIPCODE',
    'PO_NAME',
    'STATE',
    'COUNTY',
    'geometry',
]
# Columns to rename
rename_gdf = {
    'ZIPCODE': 'zipcode',
    'PO_NAME': 'city',
    'STATE': 'state',
    'COUNTY': 'county',
    'geometry': 'geometry',
}
# Column types (keyed by the renamed column names)
type_gdf = {
    'zipcode': 'int',
    'city': 'str',
    'state': 'str',
    'county': 'str',
    'geometry': 'geometry',
}

# Select, rename, and type-convert the zipcode columns, dropping incomplete rows
gdf_zipcode_f = clean_filter(gdf_zipcode, use_gdf, rename_gdf, type_gdf)
gdf_zipcode_f.head()

## Part 2. Storing Data

In [None]:
def setup_new_postgis_database(user_name, db_name):
    """Create the database and enable the PostGIS extension on it.

    Runs the ``createdb`` and ``psql`` command-line tools. A non-zero exit
    status (for example because the database already exists) is not treated
    as an error, so the function can be re-run.

    Args:
        user_name: PostgreSQL role passed to ``-U``.
        db_name: Name of the database to create.

    Raises:
        FileNotFoundError: If ``createdb`` or ``psql`` is not on PATH.
    """
    # Argument lists (no shell) keep names containing spaces or quotes intact.
    subprocess.run(["createdb", "-U", user_name, db_name], check=False)
    subprocess.run(
        [
            "psql", "-U", user_name, "--dbname", db_name,
            # IF NOT EXISTS keeps the command quiet on repeated runs.
            "-c", "CREATE EXTENSION IF NOT EXISTS postgis;",
        ],
        check=False,
    )

In [None]:
# Create the project database and enable PostGIS using the configured user and database name
setup_new_postgis_database(db_username, db_name)

### Creating Tables

In [None]:
# SQLAlchemy engine shared by the cells below, built from db_url
engine = create_engine(db_url)

In [None]:
# Defining the SQL statements to create 311, tree, rent, and zipcode tables
# PostgreSQL has no DATETIME type (TIMESTAMP is used instead), and the bare
# POINT type is PostgreSQL's native point, not a PostGIS geometry; the column
# is declared as a PostGIS point in EPSG:4326 to match the GeoDataFrame CRS.
nyc_311_schema = """
CREATE TABLE IF NOT EXISTS nyc_311
(
    complaint_id INTEGER PRIMARY KEY,
    date TIMESTAMP,
    complaint_type TEXT,
    zipcode INTEGER,
    city TEXT,
    x_coord INTEGER,
    y_coord INTEGER,
    latitude FLOAT,
    longitude FLOAT,
    geometry GEOMETRY(Point, 4326)
);

CREATE INDEX IF NOT EXISTS idx_nyc_311_geom ON nyc_311 USING GIST (geometry);
"""

# The bare POINT type is PostgreSQL's native point, not a PostGIS geometry;
# the column is declared as a PostGIS point in EPSG:4326 to match the
# GeoDataFrame CRS.
nyc_tree_schema = """
CREATE TABLE IF NOT EXISTS nyc_tree
(
    tree_id INTEGER PRIMARY KEY,
    status TEXT,
    health TEXT,
    species TEXT,
    zipcode INTEGER,
    city TEXT,
    latitude FLOAT,
    longitude FLOAT,
    x_coord FLOAT,
    y_coord FLOAT,
    geometry GEOMETRY(Point, 4326)
);

CREATE INDEX IF NOT EXISTS idx_nyc_tree_geom ON nyc_tree USING GIST (geometry);
"""

# STRING is not a PostgreSQL type (TEXT is used instead), and the statement
# needs its terminating semicolon because all schema strings are written
# back-to-back into one file.
# NOTE(review): only the three descriptive columns are declared here; the
# rent value columns of the dataframe are not part of this schema — confirm
# the intended table layout.
nyc_rent_schema = """
CREATE TABLE IF NOT EXISTS nyc_rent
(
    zipcode INTEGER PRIMARY KEY,
    state TEXT,
    city TEXT
);
"""

# The bare POLYGON type is PostgreSQL's native polygon, not a PostGIS
# geometry; the column is declared as a PostGIS polygon in EPSG:4326, the CRS
# the zipcode shapes are reprojected to.
# NOTE(review): assumes every zipcode shape is a single Polygon — confirm the
# shapefile contains no MultiPolygon features.
nyc_zipcode_schema = """
CREATE TABLE IF NOT EXISTS nyc_zipcode
(
    zipcode INTEGER PRIMARY KEY,
    city TEXT,
    state TEXT,
    county TEXT,
    geometry GEOMETRY(Polygon, 4326)
);

CREATE INDEX IF NOT EXISTS idx_nyc_zipcode_geom ON nyc_zipcode USING GIST (geometry);
"""

In [None]:
# Write the four CREATE TABLE scripts, in order, into the schema.sql file
schema_parts = (
    nyc_311_schema,
    nyc_tree_schema,
    nyc_rent_schema,
    nyc_zipcode_schema,
)
with open(db_schema, "w") as schema_file:
    schema_file.writelines(schema_parts)

In [None]:
# Executing the schema files to create tables
# NOTE(review): this block only opens and closes a connection; nothing from
# the schema file is executed here yet.
with engine.connect() as connection:
    pass

### Adding Data to Database

In [None]:
def write_dataframes_to_table(tablename_to_dataframe):
    """Write each dataframe to the SQL table named by its key.

    Existing tables with the same name are replaced. GeoDataFrames are
    written with ``to_postgis`` so that their geometry column is stored as a
    PostGIS geometry (this needs the GeoAlchemy2 package at runtime); plain
    DataFrames are written with ``to_sql``.

    Args:
        tablename_to_dataframe: Mapping of table name -> DataFrame or
            GeoDataFrame to store.
    """
    # engine.begin() commits on success and rolls back if any write fails.
    with engine.begin() as connection:
        for table_name, df in tablename_to_dataframe.items():
            if isinstance(df, gpd.GeoDataFrame):
                # pandas' to_sql cannot serialize shapely geometry objects.
                df.to_postgis(table_name, con=connection, if_exists='replace', index=False)
            else:
                df.to_sql(table_name, con=connection, if_exists='replace', index=False)


In [None]:
# Dataframe to Table mapping (table name -> cleaned dataframe)
# NOTE(review): these table names differ from the ones declared in the schema
# strings (nyc_zipcode, nyc_311, nyc_tree, nyc_rent) — confirm which set is
# intended.
tablename_to_dataframe = {
    "nyc_zipcodes": gdf_zipcode_f,
    "311_complaints": gdf_311_f,
    "nyc_trees": gdf_tree_f,
    "nyc_rents": df_rent_f,
}

In [None]:
# Store every cleaned dataframe in its database table
write_dataframes_to_table(tablename_to_dataframe)

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Write a SQL query string to ``outfile``.

    Args:
        query: SQL text to save.
        outfile: Destination path (``str`` or ``pathlib.Path``). Missing
            parent directories are created; an existing file is overwritten.
    """
    target = pathlib.Path(outfile)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(query, encoding="utf-8")

In [None]:
# TODO: replace both FILL_ME_IN placeholders with the real file name and SQL text.
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
FILL_ME_IN
"""

In [None]:
# Run Query 1 and print every returned row.
with engine.connect() as conn:
    # Wrap the raw SQL string in text() so SQLAlchemy can execute it.
    result = conn.execute(text(QUERY_1))
    for row in result:
        print(row)

In [None]:
# Save the text of Query 1 to its file under QUERY_DIR
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """Template for the first visualization; the plotted values are placeholders.

    Creates a 20x10 figure, plots the placeholder ``values``, sets a title,
    and shows the figure.

    Args:
        dataframe: Data intended for plotting. It is not read yet; the body
            still uses the literal placeholder "...".
    """
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    """Placeholder for fetching the data behind the first visualization.

    Raises:
        NotImplementedError: Always; the query has not been written yet.
    """
    # Query your database for the data needed.
    # You can put the data queried into a pandas/geopandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
# Fetch the data for the first visualization and plot it.
# NOTE: get_data_for_visual_1 currently raises NotImplementedError, so this
# cell fails until that function is implemented.
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)