In [1]:
import pandas as pd 
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, DateTime
from sqlalchemy.dialects.sqlite import DATETIME

In [2]:
# Load the collision CSV (file name suggests 2022 data); the path is relative
# to the notebook's working directory.
df = pd.read_csv('../Resources/collision_2022.csv')

In [3]:
# Keep one row per collision_id (first occurrence wins) and promote that id
# to the row index.
df = (
    df
    .drop_duplicates(subset='collision_id')  # keep='first' is the default
    .set_index('collision_id')
)

In [4]:
# Handle ZIP codes as text; the column is later used as a merge and group key.
df = df.astype({'zip_code': str})

In [5]:
# Parse the crash timestamp column into pandas datetimes.
df = df.assign(crash_datetime=pd.to_datetime(df['crash_datetime']))

In [6]:
# Remove the 'location' column from the frame.
df = df.drop(columns=['location'])

In [7]:
# Give the first two vehicle-type columns the same '_<n>' suffix style as the
# other numbered vehicle_type_code columns written to the database.
df = df.rename(
    columns={
        'vehicle_type_code1': 'vehicle_type_code_1',
        'vehicle_type_code2': 'vehicle_type_code_2',
    }
)

In [8]:
# Summarise column dtypes and non-null counts after the clean-up steps above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103789 entries, 4513547 to 4648224
Data columns (total 25 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   crash_datetime                 103789 non-null  datetime64[ns]
 1   crash_day_of_week              103789 non-null  int64         
 2   street_address                 103787 non-null  object        
 3   number_of_persons_injured      103789 non-null  int64         
 4   number_of_persons_killed       103789 non-null  int64         
 5   number_of_pedestrians_injured  103789 non-null  int64         
 6   number_of_pedestrians_killed   103789 non-null  int64         
 7   number_of_cyclist_injured      103789 non-null  int64         
 8   number_of_cyclist_killed       103789 non-null  int64         
 9   number_of_motorist_injured     103789 non-null  int64         
 10  number_of_motorist_killed      103789 non-null  int64         
 11

In [9]:
# Load the ZIP-level demographics workbook and discard its first six rows.
# NOTE(review): presumably those rows are not per-ZIP records; confirm against the sheet.
demo_df = pd.read_excel('../Resources/NewYork_DemographicsByZipCode.xlsx').iloc[6:]

# Rename the columns used downstream in a single pass. The density column is
# addressed by position (index 7) rather than by its original header.
demo_df = demo_df.rename(
    columns={
        'Geography': 'zip_code',
        'Best Population Estimate': 'Population',
        demo_df.columns[7]: 'Population Density',
    }
)
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1825 entries, 6 to 1830
Data columns (total 67 columns):
 #   Column                                                                       Non-Null Count  Dtype 
---  ------                                                                       --------------  ----- 
 0   zip_code                                                                     1825 non-null   object
 1   GEOID                                                                        1825 non-null   object
 2   State                                                                        1825 non-null   object
 3   County                                                                       4 non-null      object
 4   City                                                                         4 non-null      object
 5   Best Population Year                                                         1825 non-null   object
 6   Population                                      

In [10]:
# Keep only the join key and the two population measures, renumbering rows from 0.
demo_df = (
    demo_df
    .loc[:, ['zip_code', 'Population Density', 'Population']]
    .reset_index(drop=True)
)
demo_df.head()

Unnamed: 0,zip_code,Population Density,Population
0,10001,43227.1,26966
1,10002,93406.1,76807
2,10003,95235.0,54447
3,10004,10525.1,4795
4,10005,,8637


In [11]:
# Attach the population figures to every collision by ZIP code (left join, so
# collisions without a demographics match are kept with missing values).
#
# The collision-side key was cast to str earlier in this notebook, while the
# demographics key is read from Excel as an object column. Cast it to str as
# well so both sides compare equal; a str/int mismatch would not raise, it
# would just leave every population value empty.
zip_demographics = demo_df.assign(zip_code=demo_df['zip_code'].astype(str))
df = (
    df.reset_index()
      .merge(zip_demographics, on='zip_code', how='left')
      .set_index('collision_id')
)

In [12]:
# Re-inspect the frame after the merge to confirm the added columns and row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103789 entries, 4513547 to 4648224
Data columns (total 27 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   crash_datetime                 103789 non-null  datetime64[ns]
 1   crash_day_of_week              103789 non-null  int64         
 2   street_address                 103787 non-null  object        
 3   number_of_persons_injured      103789 non-null  int64         
 4   number_of_persons_killed       103789 non-null  int64         
 5   number_of_pedestrians_injured  103789 non-null  int64         
 6   number_of_pedestrians_killed   103789 non-null  int64         
 7   number_of_cyclist_injured      103789 non-null  int64         
 8   number_of_cyclist_killed       103789 non-null  int64         
 9   number_of_motorist_injured     103789 non-null  int64         
 10  number_of_motorist_killed      103789 non-null  int64         
 11

In [13]:
# Count collisions per ZIP code and shape the result as a one-column frame
# indexed by 'zipcodes'.
collision_by_zip_df = (
    df.groupby('zip_code')
      .size()
      .reset_index(name='# of Collisions')
      .rename(columns={'zip_code': 'zipcodes'})
      .astype({'# of Collisions': 'int', 'zipcodes': 'str'})
      .set_index('zipcodes')
)
collision_by_zip_df.head()

Unnamed: 0_level_0,# of Collisions
zipcodes,Unnamed: 1_level_1
10000,30
10001,590
10002,880
10003,415
10004,89


In [14]:
# Engine for the SQLite database file that receives the tables written below
# (the path is relative to the notebook's working directory).
engine = create_engine('sqlite:///../Resources/collision_db.sqlite')

In [15]:
# Shared MetaData registry for the Table definitions that follow.
metadata = MetaData()

In [16]:
# Schema for the per-ZIP collision counts; 'zipcodes' is the primary key.
collision_by_zip = Table(
    'collision_by_zip', metadata,
    Column('zipcodes', String, primary_key=True),
    Column('# of Collisions', Integer),
)

# The SQLite file persists between kernel sessions. Drop any copy of this table
# left by an earlier run so that the rows appended afterwards do not collide
# with already-stored primary keys; checkfirst makes this a no-op on a fresh
# database.
collision_by_zip.drop(engine, checkfirst=True)
metadata.create_all(engine)

In [17]:
# Append the per-ZIP counts to the existing table; the 'zipcodes' index is
# written as a column.
collision_by_zip_df.to_sql(
    name='collision_by_zip',
    con=engine,
    index=True,
    if_exists='append',
)

245

In [18]:
import json

# Path of the GeoJSON file with the NYC ZIP-code features
file_path = '../Resources/nyc_geojson_by_zip.json'

# Load the GeoJSON data. The encoding is given explicitly so that reading does
# not depend on the platform's default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    geojson_data = json.load(file)

In [19]:
# Add the collision count to each GeoJSON feature, keyed by its postal code.
# A plain dict of native ints is built once, which avoids a chained
# .loc[...][...] lookup per feature and keeps NumPy scalars out of the
# structure that is serialised to JSON afterwards.
collisions_per_zip = {
    zip_code: int(count)
    for zip_code, count in collision_by_zip_df['# of Collisions'].items()
}
for feature in geojson_data['features']:
    properties = feature['properties']
    # Postal codes without any recorded collision get an explicit 0.
    properties['collision_count'] = collisions_per_zip.get(properties['postalCode'], 0)

In [20]:
import numpy as np
# Custom function to handle non-serializable data types
def convert(o):
    """Fallback serializer for ``json.dump(..., default=convert)``.

    Converts NumPy values that the ``json`` module cannot encode on its own
    into their built-in Python equivalents.

    Args:
        o: The object the JSON encoder could not serialise.

    Returns:
        ``bool``, ``int`` or ``float`` for NumPy scalars, or a (nested) list
        for a NumPy array.

    Raises:
        TypeError: If ``o`` is not a supported NumPy type. Raising TypeError is
            what the ``default`` hook is expected to do for unknown objects.
    """
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    # Name the offending type so a failed dump is easy to diagnose.
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')

# Write the enriched GeoJSON; values the encoder cannot handle natively are
# passed through convert().
output_path = '../static/data/nyc_geojson_by_zip_with_counts.json'
with open(output_path, 'w') as file:
    json.dump(geojson_data, file, default=convert)

In [21]:
# Schema for the cleaned collision records; collision_id is the primary key.
# NOTE(review): 'Population' and 'Population Density' are declared as String
# although the demographics preview shows numeric values; confirm whether text
# storage is intended before changing the types.
motor_collisions = Table(
    'motor_collisions', metadata,
    Column('collision_id', Integer, primary_key=True),
    Column('crash_datetime', DateTime),
    Column('crash_day_of_week', Integer),
    Column('street_address', String),
    Column('borough', String),
    Column('county', String),
    Column('zip_code', String),
    Column('latitude', Float),
    Column('longitude', Float),
    Column('number_of_persons_injured', Integer),
    Column('number_of_persons_killed', Integer),
    Column('number_of_pedestrians_injured', Integer),
    Column('number_of_pedestrians_killed', Integer),
    Column('number_of_cyclist_injured', Integer),
    Column('number_of_cyclist_killed', Integer),
    Column('number_of_motorist_injured', Integer),
    Column('number_of_motorist_killed', Integer),
    Column('contributing_factor_vehicle_1', String),
    Column('contributing_factor_vehicle_2', String),
    Column('contributing_factor_vehicle_3', String),
    Column('contributing_factor_vehicle_4', String),
    Column('contributing_factor_vehicle_5', String),
    Column('vehicle_type_code_1', String),
    Column('vehicle_type_code_2', String),
    Column('vehicle_type_code_3', String),
    Column('vehicle_type_code_4', String),
    Column('vehicle_type_code_5', String),
    Column('Population', String),
    Column('Population Density', String),
)

# The SQLite file persists between kernel sessions. Drop any copy of this table
# left by an earlier run so that the rows appended afterwards do not collide
# with already-stored collision_id values; checkfirst makes this a no-op on a
# fresh database. create_all only creates tables that are missing, so other
# tables registered on this MetaData are left untouched.
motor_collisions.drop(engine, checkfirst=True)
metadata.create_all(engine)

In [22]:
# Append the collision records to the existing table; the collision_id index
# is written as a column.
df.to_sql(
    name='motor_collisions',
    con=engine,
    index=True,
    if_exists='append',
)

103789

In [23]:
from sqlalchemy import inspect

# Sanity-check the database contents by reflecting them back through the engine.
inspector = inspect(engine)
print(inspector.get_table_names())  # Tables currently present in the database
print(inspector.get_columns('motor_collisions'))  # Column definitions of motor_collisions


['collision_by_zip', 'motor_collisions']
[{'name': 'collision_id', 'type': INTEGER(), 'nullable': False, 'default': None, 'autoincrement': 'auto', 'primary_key': 1}, {'name': 'crash_datetime', 'type': DATETIME(), 'nullable': True, 'default': None, 'autoincrement': 'auto', 'primary_key': 0}, {'name': 'crash_day_of_week', 'type': INTEGER(), 'nullable': True, 'default': None, 'autoincrement': 'auto', 'primary_key': 0}, {'name': 'street_address', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': 'auto', 'primary_key': 0}, {'name': 'borough', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': 'auto', 'primary_key': 0}, {'name': 'county', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': 'auto', 'primary_key': 0}, {'name': 'zip_code', 'type': VARCHAR(), 'nullable': True, 'default': None, 'autoincrement': 'auto', 'primary_key': 0}, {'name': 'latitude', 'type': FLOAT(), 'nullable': True, 'default': None, 'autoincrement': 'auto', 'pr

In [24]:
from sqlalchemy.ext.automap import automap_base
# Reflect the database into automapped classes and list the mapped table names.
# prepare(engine, reflect=True) is deprecated in SQLAlchemy 1.4 and rejected by
# 2.0; autoload_with is the supported spelling in both.
Base = automap_base()
Base.prepare(autoload_with=engine)
Base.classes.keys()

['collision_by_zip', 'motor_collisions']