In [1]:
import pandas as pd
from datetime import datetime
import geopandas as gpd
import warnings
warnings.filterwarnings('ignore')
import datetime as dt
import numpy as np

In [2]:
# https://data.cityofnewyork.us/download/i8iw-xf4u/application%2Fzip
# https://data.cityofnewyork.us/api/views/qgea-i56i/rows.csv?accessType=DOWNLOAD

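# Download both files from the URLs above and place them in the current folder.
# The cells below expect them to be named `ZIP_CODE_040114.zip` and `NYPD_Complaint_Data_Historic.csv`.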

In [3]:
from zipfile import ZipFile

# Unpack the ZIP code shapefile archive into ./zips.
# The context manager closes the archive even if extraction fails part-way.
with ZipFile('ZIP_CODE_040114.zip', 'r') as zf:
    zf.extractall('zips')

In [4]:
# Load the raw NYPD complaint export, skipping rows that cannot be parsed.
# `on_bad_lines='skip'` is the replacement for `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0 (where it raises a TypeError).
data = pd.read_csv('NYPD_Complaint_Data_Historic.csv', on_bad_lines='skip')

# Columns discarded up front; none of them is read by the cells below.
unused_columns = [
    'PARKS_NM', 'STATION_NAME', 'TRANSIT_DISTRICT', 'HADEVELOPT', 'HOUSING_PSA',
    'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_SEX', 'SUSP_RACE', 'JURISDICTION_CODE',
    'ADDR_PCT_CD', 'PD_CD', 'PD_DESC', 'PATROL_BORO', 'VIC_AGE_GROUP', 'VIC_SEX',
    'JURIS_DESC', 'CRM_ATPT_CPTD_CD', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM',
    'LOC_OF_OCCUR_DESC', 'VIC_RACE', 'Lat_Lon', 'RPT_DT', 'KY_CD',
]
data = data.drop(columns=unused_columns)

In [5]:
# Keep one row per complaint number and require the fields used for dating and locating it.
required_columns = ['CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'Latitude', 'Longitude']
data = data.drop_duplicates(subset=['CMPLNT_NUM'], keep='first', ignore_index=False)
data = data.dropna(subset=required_columns)

# Parse the complaint start date; unparseable values become NaT and fail the year filter below.
data = data.assign(CMPLNT_FR_DT=pd.to_datetime(data['CMPLNT_FR_DT'], errors='coerce'))
occurred_since_2016 = data['CMPLNT_FR_DT'].dt.year >= 2016
data = data.loc[occurred_since_2016]

In [6]:
# Load the ZIP code polygons extracted from the shapefile archive.
zips = gpd.read_file('zips/ZIP_CODE_040114.shp')

# Build one point per complaint from its X/Y coordinates.
# NOTE(review): tagging the points with the polygons' CRS assumes X_COORD_CD / Y_COORD_CD
# are expressed in the same coordinate system as the shapefile. The join below already
# relies on that assumption; confirm it against the dataset documentation.
data_geo = gpd.GeoDataFrame(
    data,
    geometry=gpd.points_from_xy(data.X_COORD_CD, data.Y_COORD_CD),
    crs=zips.crs,
)

# Attach ZIP code attributes to every complaint whose point lies within a polygon.
# `predicate` is the current name of the keyword formerly called `op`
# (deprecated in geopandas 0.10 and later removed).
data_spa = gpd.sjoin(data_geo, zips, how='inner', predicate='within')
data_pd = pd.DataFrame(data_spa)

In [7]:
# Renumber the joined rows and discard shapefile attributes that are not used afterwards.
shapefile_columns_to_drop = [
    'BLDGZIP', 'index_right', 'SHAPE_AREA', 'SHAPE_LEN', 'URL',
    'CTY_FIPS', 'STATE', 'ST_FIPS', 'COUNTY',
]
data_pd = data_pd.reset_index(drop=True).drop(columns=shapefile_columns_to_drop)

In [8]:
# One row of ZIP-level attributes per ZIP code (taken from the first complaint seen in it).
first_row_per_zip = data_pd.drop_duplicates(subset='ZIPCODE', keep='first', ignore_index=True)
df1 = first_row_per_zip.loc[:, ['ZIPCODE', 'POPULATION', 'AREA']]

# Number of complaints recorded in each ZIP code.
complaints_per_zip = (
    data_pd
    .groupby('ZIPCODE', as_index=False)
    .agg(Numbers_complaints=('CMPLNT_NUM', 'count'))
)

# Combine the counts with the population and area of each ZIP code.
data_temp = complaints_per_zip.merge(df1, how='left', on='ZIPCODE')


In [9]:
# Complaints per resident; non-finite results (e.g. division by a zero population) become 0.
complaint_counts = data_temp['Numbers_complaints']
complaints_per_capita = complaint_counts / data_temp['POPULATION']
data_temp['crime_norm_population'] = complaints_per_capita.where(np.isfinite(complaints_per_capita), 0)

# Complaints per unit of ZIP code area.
data_temp['crime_norm_area'] = complaint_counts / data_temp['AREA']

In [10]:
def normalize(data, columns=None):
    """Min-max scale selected columns of a DataFrame to the range [0, 1].

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    columns : list of str, optional
        Columns to rescale. Defaults to the three crime metrics
        ``crime_norm_population``, ``crime_norm_area`` and ``Numbers_complaints``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which every selected column is replaced by
        ``(value - min) / (max - min)``. A column whose values are all equal
        is set to 0.0 instead of dividing by zero.
    """
    if columns is None:
        columns = ['crime_norm_population', 'crime_norm_area', 'Numbers_complaints']

    out = data.copy()
    for feature_name in columns:
        min_value = data[feature_name].min()
        value_range = data[feature_name].max() - min_value
        if value_range == 0:
            # Constant column: 0/0 would fill it with NaN and poison any score built on it.
            out[feature_name] = 0.0
        else:
            out[feature_name] = (data[feature_name] - min_value) / value_range
    return out

In [11]:
# Rescale the three crime metrics, then add them up into a single score per ZIP code.
data_normed = normalize(data_temp)
data_normed['score'] = (
    data_normed['crime_norm_population']
    .add(data_normed['crime_norm_area'])
    .add(data_normed['Numbers_complaints'])
)

# Highest score first, with a fresh 0..n-1 index.
data_sorted = data_normed.sort_values('score', ascending=False, ignore_index=True)

In [12]:
# Split the score-ranked ZIP codes into five consecutive groups labelled 1..5.
# Level 1 holds the rows at the top of `data_sorted` (highest score), level 5 the bottom.
# Group size is derived from the row count instead of hard-coded offsets, so the cell
# works for any number of ZIP codes; with 196 rows it yields the same 39/39/39/39/40 split.
# A single whole-column assignment also avoids chained `.loc` assignment on an attribute,
# which does not reliably write back to the frame.
N_SAFETY_LEVELS = 5
n_zipcodes = len(data_sorted)
group_size = max(n_zipcodes // N_SAFETY_LEVELS, 1)  # at least 1 so tiny inputs don't divide by zero
# Rows left over after the even split fall into the last level.
data_sorted['safety_level'] = np.minimum(np.arange(n_zipcodes) // group_size, N_SAFETY_LEVELS - 1) + 1

In [13]:
# Export the ZIP code -> safety level lookup table without the row index.
output_columns = ['ZIPCODE', 'safety_level']
data_final = data_sorted.loc[:, output_columns]
data_final.to_csv('zipcode_saftylevel.csv', index=False)