# NYC Rodent Sightings - 311 2018-2023

#### _Overview_ 
This notebook parses the 311 rodent sighting complaints for the 2018 to 2023 time span and aggregates the sightings to the census block group spatial unit. One-hot encoding is used to create a boolean column for each value of a categorical variable; these columns are then summed to give the total count of sightings per category. 

- **Location Type** is simplified to a new column called 'loc_type' with the following classifications: Residential, Residential-Mixed, Commercial, Outdoor, Vacant_Space, and Other. In the final census block dataset these are prefixed with `l_` indicating location.

- **Temporal** derivatives are calculated including year, month, day of week, and a simple time range. In the final census block dataset the day of week and time range columns are prefixed with `d_` and `t_`, respectively. Month is derived for each sighting, but the encoding step below does not currently produce `m_` columns for it.

Caveats: 

    The final dataset has duplicate polygons: 1 per year 
    Duplicate reports on the same day from the same address are dropped. 


<u>Citations</u>
    
    MLA Format:
    NYC OpenData, Rat Sightings, https://data.cityofnewyork.us/Social-Services/Rat-Sightings/3q43-55fe.

    LaTeX Format:
    \bibitem{RatSightings}
    NYC OpenData. \emph{Rat Sightings}. https://data.cityofnewyork.us/Social-Services/Rat-Sightings/3q43-55fe.



In [None]:
from datetime import datetime, timedelta
import geopandas as gpd
import json
import pandas as pd
import mapclassify
import matplotlib.pyplot as plt
import numpy as np
import os
import requests
from io import StringIO
import warnings

# Silence every warning for the rest of the session. Note that this also hides
# library deprecation warnings, so API changes will not be announced in cell output.
warnings.filterwarnings('ignore')

# Display settings: never truncate columns, rows, or numpy arrays in cell output
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", None)
np.set_printoptions(threshold=np.inf)

In [None]:
# 311 Rat Sightings
# Query parameters sent with the request: keep only records whose created_date
# falls between 2018-01-01 and 2023-12-31 (inclusive).
# NOTE(review): presumably the response is cut off at $limit rows if more records match —
# confirm by comparing the printed row count against the limit.
params_sightings = {
    '$where': "created_date between '2018-01-01T00:00:00' and '2023-12-31T23:59:59'",
    '$limit': 500000  # Limit the number of records, adjust as needed
}
# GeoJSON endpoint of the dataset
sightings_url = 'https://data.cityofnewyork.us/resource/3q43-55fe.geojson'
# Dataset landing page:
# https://data.cityofnewyork.us/Social-Services/Rat-Sightings/3q43-55fe

In [None]:
# Download the filtered sightings as GeoJSON. The timeout keeps the cell from hanging
# indefinitely, and raise_for_status() reports an HTTP error here rather than as a
# confusing KeyError on 'features' further down.
sightings_response = requests.get(sightings_url, params=params_sightings, timeout=120)
sightings_response.raise_for_status()
sightings_json = sightings_response.json()

# GeoJSON coordinates are WGS 84 longitude/latitude. Declare the CRS explicitly because
# from_features() leaves it unset by default, and this frame is spatially joined later.
sightings_gdf = gpd.GeoDataFrame.from_features(sightings_json['features'], crs='EPSG:4326')
print(len(sightings_gdf))
sightings_gdf.head()

In [None]:
# Quick visual sanity check: plot every sighting geometry
sightings_gdf.plot(figsize=(15,15))

# EDA on 311 Rodent sightings

Fields of interest:

    created_date
    location_type
    
Fields checked (but do not seem relevant):

    facility_type - empty
    location_state - empty
    agency - ALL DOHMH
    park_facility_name - ALL Unspecified
    agency_name - ALL Department of Health and Mental Hygiene
    descriptor - all Rat Sighting
    complaint_type - all Rodent
    status - 7% in progress. doesn't seem necessary to include
    

### Location Type explore

In [None]:
# Number of sightings per raw location_type value (value_counts omits missing values by default)
sightings_gdf.location_type.value_counts()

In [None]:
# Create a function to simplify location type

# Simplified class -> raw 311 location_type values that belong to it
_LOCATION_TYPE_GROUPS = {
    # Residential
    "Residential": (
        'Residential Building',
        'Residential Property',
        'Residence',
        'Apartment',
        'Private House',
        '3+ Family Apt. Building',
        '3+ Family Apt',
        '3+ Family Apt.',
        '3+ Family Apartment Building',
        '3+Family Apt.',
        '1-3 Family Dwelling',
        '1-2 FamilyDwelling',
        '1-2 Family Dwelling',
    ),
    # Residential - Mixed Use
    "Residential-Mixed": (
        '3+ Family Mixed Use Building',
        '1-3 Family Mixed Use Building',
        '1-2 Family Mixed Use Building',
        'Single Room Occupancy (SRO)',
    ),
    # Commercial/Public Facility
    "Commercial": (
        'Commercial Building',
        'Commercial Property',
        'Retail Store',
        'Grocery Store',
        'Store',
        'Restaurant',
        'Restaurant/Bar/Deli/Bakery',
        'Building (Non-Residential)',
        'Day Care/Nursery',
        'Office Building',
        'Government Building',
        'Hospital',
        'Summer Camp',
        'Cafeteria - Public School',
        'School',
        'School/Pre-School',
    ),
    # Public Outdoor
    "Outdoor": (
        'Street Area',
        'Street Fair Vendor',
        'Ground',
        'Beach',
        'Public Garden',
        'Catch Basin/Sewer',
        'Public Stairs',
    ),
    # Buildings - Vacant
    "Vacant_Space": (
        'Construction Site',
        'Vacant Lot',
        'Vacant Lot/Property',
        'Abandoned Building',
        'Vacant Building',
        'Parking Lot/Garage',
    ),
    # Other
    "Other": (
        'Other (Explain Below)',
        'N/A',
        'Other',
        'None',
        'Catering Service',
    ),
}

# Inverted view (raw value -> simplified class), built once so each call is a single lookup
_LOC_TYPE_BY_RAW_VALUE = {
    raw_value: simplified
    for simplified, raw_values in _LOCATION_TYPE_GROUPS.items()
    for raw_value in raw_values
}


def simplify_location_type(location_type):
    """Collapse a raw 311 ``location_type`` value into a simplified class.

    Parameters
    ----------
    location_type : str, None or float
        Raw value from the ``location_type`` field. Missing values may arrive
        as ``None`` or as NaN.

    Returns
    -------
    str
        One of "Residential", "Residential-Mixed", "Commercial", "Outdoor",
        "Vacant_Space" or "Other". A value that is not in the mapping is printed,
        so it can be spotted and added, and "" is returned for it.
    """
    # Treat both spellings of "missing" alike: None and NaN (NaN is the only float
    # that is not equal to itself).
    is_nan = isinstance(location_type, float) and location_type != location_type
    if location_type is None or is_nan:
        return "Other"

    # Only strings can match the mapping; anything else is reported as unmapped.
    loc_type = ""
    if isinstance(location_type, str):
        loc_type = _LOC_TYPE_BY_RAW_VALUE.get(location_type, "")

    if loc_type == "":
        print(location_type)

    return loc_type

       
# Map every raw location_type to its simplified class. Series.map hands the function one
# value at a time (missing values included), which avoids building a Series per row the
# way DataFrame.apply(axis=1) does.
sightings_gdf['loc_type'] = sightings_gdf['location_type'].map(simplify_location_type)

sightings_gdf.loc_type.value_counts()


### Time of Day explore

In [None]:
# Create new column per range of hours

# Morning (12am-8am): hours 0-7
# Midday  (8am-4pm):  hours 8-15
# Evening (4pm-12am): hours 16-23

def create_time_range(hour_of_day):
    """Return the time-of-day bucket for an hour of the day.

    Parameters
    ----------
    hour_of_day : int or float
        Hour in 24-hour form.

    Returns
    -------
    str
        "Morning" for hours 0-7, "Midday" for hours 8-15, and "Evening" for
        everything else. A value that satisfies neither comparison, such as
        NaN, also ends up in "Evening".
    """
    # Half-open ranges so every hour 0-23 belongs to exactly the bucket named in the
    # comments above (previously hours 0, 7 and 8 fell through to "Evening").
    if 0 <= hour_of_day < 8:
        return "Morning"
    if 8 <= hour_of_day < 16:
        return "Midday"
    return "Evening"

In [None]:
def create_date_cols(df):
    """Add calendar and time-of-day columns derived from ``created_date``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``created_date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    DataFrame
        The same frame with ``created_date`` converted to datetime and the columns
        ``year``, ``month``, ``day_of_week``, ``hour_of_day`` and ``time_range`` added.
    """
    df['created_date'] = pd.to_datetime(df['created_date'])
    created = df['created_date'].dt

    # Create new columns for year, month, day of the week, and hour of the day
    df['year'] = created.strftime('%Y')  # Year as a string, e.g. '2021'
    df['month'] = created.strftime('%B')  # Full month name as a string
    df['day_of_week'] = created.strftime('%A')  # Full weekday name as a string
    df['hour_of_day'] = created.hour  # Hour of the day as a number (0-23)

    # Bucket the hour value by value; Series.map avoids the per-row Series that
    # DataFrame.apply(axis=1) would construct.
    df['time_range'] = df['hour_of_day'].map(create_time_range)
    return df

# Add the date-derived columns, then show how the sightings split across time ranges
sightings_gdf = create_date_cols(sightings_gdf)
sightings_gdf.time_range.value_counts()

### Combine loc_type and hour of day

Do we want to add this in?

In [None]:
# Combined category "<loc_type>_<time_range>", followed by its frequency table
sightings_gdf['loc_type_per_time'] = sightings_gdf['loc_type'] + '_' + sightings_gdf['time_range']
sightings_gdf.loc_type_per_time.value_counts()

In [None]:
# Combined category "<loc_type>_<day_of_week>", followed by its frequency table
sightings_gdf['loc_type_per_dow'] = sightings_gdf['loc_type'] + '_' + sightings_gdf['day_of_week']
sightings_gdf.loc_type_per_dow.value_counts()

In [None]:
# Drop duplicates: keep the first report per address, location class, calendar date and
# time range. The calendar date is used instead of year/month/day-of-week so that
# reports filed on different dates that merely share a weekday within the same month
# are no longer collapsed into a single row.
print(len(sightings_gdf))
dedupe_keys = sightings_gdf[['incident_address', 'loc_type', 'time_range']].assign(
    report_date=sightings_gdf['created_date'].dt.date
)
sightings_gdf = sightings_gdf[~dedupe_keys.duplicated(keep='first')]
print(len(sightings_gdf))

# Summarize per Census Block Group

This geography seems very granular. Let's see how it works for our data

In [None]:
# Locate the block group file under ../Data, where '..' is resolved against the
# current working directory
parent_dir = os.path.abspath('..')  # get the absolute path of the parent directory
file_path = os.path.join(parent_dir, 'Data', 'nyc_bgrp.geojson')  # construct the file path
cb_gdf = gpd.read_file(file_path)  # load the GeoJSON file into a GeoDataFrame

In [None]:
# Peek at the first two rows to see which attribute columns the file carries
cb_gdf.head(2)

In [None]:
# Plot the loaded polygons as a visual sanity check
cb_gdf.plot(figsize=(30,30))

In [None]:
# Number of polygon rows loaded
print(len(cb_gdf))

In [None]:
# Slim down the dataset
# Columns carried into the encoding step: the geometry plus the categorical fields
sighting_cols = ['geometry', 'loc_type', 'year', 'month', 'day_of_week', 'time_range']
sightings_gdf[sighting_cols].head()

In [None]:
# One hot encode the variables to columns

def encode_complaints(df, cols):
    """One-hot encode the categorical sighting fields and attach them to ``df[cols]``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``loc_type``, ``day_of_week`` and ``time_range`` columns.
    cols : list of str
        Columns of ``df`` to carry over; must include ``geometry``.

    Returns
    -------
    GeoDataFrame
        ``cols`` followed by indicator columns prefixed ``l_`` (location class),
        ``d_`` (day of week) and ``t_`` (time range), in that order.
    """
    # (source column, prefix) pairs, in the order their indicator columns are appended
    encodings = (('loc_type', 'l'), ('day_of_week', 'd'), ('time_range', 't'))

    combined = df[cols]
    for source_column, prefix in encodings:
        indicators = pd.get_dummies(df[source_column], prefix=prefix)
        # Index-on-index merge lines every indicator row up with its sighting
        combined = pd.merge(combined, indicators, left_index=True, right_index=True)

    return gpd.GeoDataFrame(combined, geometry='geometry')

# Encode the sightings and preview the resulting indicator columns
sightings_encoded_gdf = encode_complaints(sightings_gdf, sighting_cols)
sightings_encoded_gdf.head(5)

In [None]:
# Perform spatial join: tag each sighting with the spatial_id of the polygon that
# contains it. how='left' keeps sightings that fall inside no polygon.
# `predicate` replaces the `op` keyword, which was deprecated in geopandas 0.10 and
# is no longer accepted by current releases.
sightings_joined_tracts_gdf = gpd.sjoin(sightings_encoded_gdf, cb_gdf[['geometry','spatial_id']], predicate='within', how='left')

def aggregate_sightings(df):
    """Sum the indicator columns and count the rows per (spatial_id, year).

    Parameters
    ----------
    df : DataFrame
        Must contain ``spatial_id`` and ``year`` plus the indicator columns
        whose names start with ``l_``, ``d_`` or ``t_``.

    Returns
    -------
    DataFrame
        One row per (spatial_id, year) with ``spatial_id``, ``year``, one
        ``<indicator>_sum`` column per indicator, and ``num_sightings``.
    """
    # Keep the indicator columns grouped by prefix: all l_*, then d_*, then t_*
    agg_spec = {
        name: ['sum']
        for prefix in ('l_', 'd_', 't_')
        for name in df.columns
        if name.startswith(prefix)
    }
    # Counting the (non-null) year values gives the number of sightings in each group
    agg_spec['year'] = 'count'

    grouped = df.groupby(['spatial_id', 'year']).agg(agg_spec).reset_index()

    # The list-valued spec yields two-level column labels; flatten them to
    # '<column>_<aggregation>'. The grouping keys end up with a trailing underscore,
    # which the rename below removes.
    grouped.columns = ['_'.join(levels) for levels in grouped.columns]
    flattened_names = {
        'year_count': 'num_sightings',
        'year_': 'year',
        'spatial_id_': 'spatial_id',
    }
    return grouped.rename(columns=flattened_names)
    

# Aggregate the joined sightings to one row per (spatial_id, year) and preview the result
sightings_agg_df = aggregate_sightings(sightings_joined_tracts_gdf)
sightings_agg_df.head()


In [None]:
# Duplicate the census block groups once per year present in the aggregated sightings
years = sightings_agg_df.year.unique()

# Build every yearly copy first and concatenate once. This avoids re-copying the growing
# result on each iteration, and the result type no longer depends on how concat treats
# an empty plain DataFrame used as the starting value.
years_cb_gdf = pd.concat([cb_gdf.assign(year=year) for year in years])

print(len(years_cb_gdf))

years_cb_gdf.head(5)


In [None]:
# Merge the aggregated counts back onto the census block groups (one row per block
# group per year); how='left' keeps block groups that had no sightings in a year.
sightings_gdf = pd.merge(years_cb_gdf, sightings_agg_df, on=['spatial_id','year'], how='left')

# Aggregated value columns, selected by name rather than by position so the list does
# not depend on how many attribute columns the block group file carries.
column_list = [col for col in sightings_agg_df.columns if col not in ('spatial_id', 'year')]
# Block group / year combinations without sightings come out of the left merge as NaN
sightings_gdf[column_list] = sightings_gdf[column_list].fillna(0)
sightings_gdf.head()

## Visualize Metrics 

In [None]:
# One choropleth of num_sightings per year, in chronological order
years = sorted(years)
for year in years:
    sightings_annual_gdf = sightings_gdf[sightings_gdf['year'] == year]

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    # Natural Breaks (Jenks) classification with 8 classes
    sightings_annual_gdf.plot(
        column='num_sightings',
        ax=ax,
        cmap='YlOrRd',
        scheme='natural_breaks',
        k=8,
        legend=True,
        missing_kwds={'color': 'lightgray'},
        legend_kwds={'loc': 'upper left', 'title': 'Number of Sightings'},
    )
    ax.set_title(f'Rodent Sighting Complaints (311): {year}')
    plt.show()

## Plot variables for one year (2021)

In [None]:
# Keep only the 2021 rows; year is compared as a string because it was created with
# strftime('%Y'). Then show the five rows with the most sightings.
sightings_slim_gdf = sightings_gdf[(sightings_gdf['year']=='2021')]
sightings_slim_gdf.sort_values(by='num_sightings', ascending=False).head(5)

In [None]:
def plot_map(column_name, title, gdf=None):
    """Draw a natural-breaks choropleth of one column.

    Parameters
    ----------
    column_name : str
        Column whose values colour the polygons.
    title : str
        Text used as the legend title.
    gdf : GeoDataFrame, optional
        Frame to plot. Defaults to the notebook-level ``sightings_slim_gdf``, so
        existing two-argument calls behave as before.
    """
    if gdf is None:
        gdf = sightings_slim_gdf

    fig, ax = plt.subplots(1, 1, figsize=(15, 15))
    gdf.plot(column=column_name,
             ax=ax,
             cmap='YlOrRd',
             scheme='natural_breaks',
             k=8,
             legend=True,
             missing_kwds={'color': 'lightgray'},
             legend_kwds={'loc': 'upper left', 'title': title})
    plt.show()

# Draw one map for every column in column_list
for column in column_list:
    plot_title = f'Number of Complaints - {column}'
    plot_map(column, plot_title)

# Export to csv 

Not exporting to GeoJson because the dataset would be very large. Helper code below to load.

In [None]:
# Export spatial_id plus every column from position 7 onward (selected by position).
# NOTE(review): presumably position 7 is 'year' followed by the aggregated columns —
# confirm against the printed list, since this depends on the block group file's layout.
export_columns = ['spatial_id'] + sightings_gdf.columns[7:].to_list()
print(export_columns)
parent_dir = os.path.abspath('..')  # get the absolute path of the parent directory
file_path = os.path.join(parent_dir, 'Data', 'sightings_per_year.csv')  # construct the file path
# Written without the index and without geometry (only export_columns are kept)
sightings_gdf[export_columns].to_csv(file_path, index=False)

### EXAMPLE 

In [None]:
# Base directory for the example cells below; '..' is resolved against the current working directory
parent_dir = os.path.abspath('..')  # get the absolute path of the parent directory

In [None]:
# Read Census Block Groups
cb_file_path = os.path.join(parent_dir, 'Data', 'nyc_bgrp.geojson')  # construct the file path
cb_gdf = gpd.read_file(cb_file_path)  # load the GeoJSON file into a GeoDataFrame
# Inspect column dtypes (spatial_id is the merge key used below) and the row count
print(cb_gdf.dtypes)
print(len(cb_gdf))
cb_gdf.head(2)

In [None]:
# Read Sightings
sightings_file_path = os.path.join(parent_dir, 'Data', 'sightings_per_year.csv')  # construct the file path
# Parse spatial_id as text straight away. Casting after read_csv has already inferred a
# numeric dtype would lose any leading zeros and render float-inferred ids in float notation.
sightings_df = pd.read_csv(sightings_file_path, dtype={'spatial_id': str})
print(len(sightings_df))
print(sightings_df.dtypes)
sightings_df.head(2)

In [None]:
# Attach the per-year sighting counts to the block group geometries. how='left' keeps
# every block group, and a block group is repeated for each matching row in the CSV.
sightings_gdf = pd.merge(cb_gdf, sightings_df, on=['spatial_id'], how='left')

print(len(sightings_gdf))
sightings_gdf.head(2)