# NYPD_CALL_FOR_SERVICE:

a. To get the required columns, use this module: 


1.   get_area_of_interest(df_spark, interested_columns)


b. Preprocessing pipeline: Pass your data through these functions. (if your columns fall in those categories)

1.   valid_date_check(date, format)
2.   valid_time_check(time)
3.   reverse_geo_code_boros(df_spark, Latitude, Longitude, Boro, lat_index, long_index)
4.   refine_age_group_race(df_spark, victim_age_group=None, suspect_age_group=None, suspect_race=None, victim_race=None)
5.   refine_sex_gender_impute(df_spark, suspect_age=None, suspect_gender=None, victim_age=None, victim_gender=None)
6.   refine_precinct_jur(df_spark, precinct=None, Jur_code=None)



In [None]:
!pip install pyspark
!pip install openclean

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 45.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=515252bcf9ed61485142c835d504b499301c0a2d4be35e00ffa3edbc9eac1cbd
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0
Collecting openclean
  Downloading openclean-0.2.1-py3-none-any.whl (5.2 kB)
Collecting openclean-core==0.4.1
  Downloading openclean_core-0.4.1-py3-none-any.whl (267 kB)
[K     |████████████████████████████████| 267 

In [None]:
#importing packages required
from pyspark import SparkContext, SparkConf
import os
import requests
from six.moves import urllib
import sys 
import pandas as pd
import matplotlib 
import matplotlib as plt
import numpy as np
import scipy as sp
import IPython
from IPython import display
import sklearn
import random
import time
import warnings
import re
import matplotlib.pyplot as plt
%matplotlib inline
from openclean.pipeline import stream
from openclean.profiling.column import DefaultColumnProfiler
from openclean.data.source.socrata import Socrata
from openclean.pipeline import stream
from openclean.function.eval.datatype import IsDatetime
import datetime
import pandas as pd
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import StringType

In [None]:
from geopy.geocoders import ArcGIS
# Module-level geocoder instance; reverse_geo_code_boros calls geocoder.reverse()
# from inside its UDF, so this cell must run before that function is used.
geocoder=ArcGIS()
#example:
# Reverse geocode a single "latitude, longitude" string and display the result.
geocoder.reverse('40.61157006600007, -73.74736517199995')

Location(11-64 Redfern Ave, Far Rockaway, New York 11691, USA, (40.61161616586613, -73.74738361194636, 0.0))

In [None]:
# Reuse the running SparkContext when there is one (otherwise start it),
# then wrap it in a SparkSession to get the DataFrame API.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [None]:
# NYPD Calls for Service extract (originally from NYC Open Data), mirrored on
# Google Drive so that everyone can fetch the same file:
# https://drive.google.com/uc?export=download&id=1tGcOWZ_eHLYYa5tH7ZavALeTgtdmRxlL

fn_src = 'https://drive.google.com/uc?export=download&id=1tGcOWZ_eHLYYa5tH7ZavALeTgtdmRxlL'
fn_dst = '/content/NYPD_CALL_FOR_SERVICE.csv'

from six.moves import urllib

# Download only when the file is not already on disk.
if not os.path.isfile(fn_dst):
    print('Fetching file. This may take a while...', fn_dst)
    urllib.request.urlretrieve(fn_src, fn_dst)
    print('File %s has been downloaded' % fn_dst)
else:
    print('File has already been downloaded', fn_dst)

File has already been downloaded /content/NYPD_CALL_FOR_SERVICE.csv


In [None]:
# Master dataset mapping NYC zip codes to boroughs; read later through `dst`.
src = 'https://data.beta.nyc/dataset/0ff93d2d-90ba-457c-9f7e-39e47bf2ac5f/resource/7caac650-d082-4aea-9f9b-3681d568e8a5/download/nyc_zip_borough_neighborhoods_pop.csv'
dst = 'nyc_zip_borough_neighborhoods_pop.csv'

#https://data.cityofnewyork.us/resource/h9gi-nx95.csv

from six.moves import urllib

# Download only when the file is not already on disk.
if not os.path.isfile(dst):
    urllib.request.urlretrieve(src, dst)
    print('File %s has been downloaded' % dst)
else:
    print('File %s has already been downloaded' % dst)

File nyc_zip_borough_neighborhoods_pop.csv has already been downloaded


In [None]:
#similarly, lets get them into pyspark rdd
def get_area_of_interest(df_spark, interested_columns):
  """Return `df_spark` narrowed to the columns listed in `interested_columns`.

  The list is handed unchanged to the DataFrame's `select` method.
  """
  return df_spark.select(interested_columns)

# 2. Module for date related columns

The dataset is meant to cover 2006 to 2020, but it contains dates as far back as the malformed "1010-05-14". We need to clean this: here we drop the rows whose date is null or empty, is not a real calendar date, or falls outside 2006–2020.

In [None]:
import datetime
def valid_date_check(date, format, min_year=2006, max_year=2020):
  """Return True when `date` is a real calendar date inside the accepted years.

  Parameters
  ----------
  date : str, datetime.date, datetime.datetime or None
      Value to validate. Date/datetime objects are already real calendar
      dates, so only their year is checked and `format` is not consulted.
      Any other non-string value is rejected instead of raising.
  format : str
      Layout of string dates, built from the tokens 'yyyy', 'mm' and 'dd'
      separated by "-" or "/" (for example "yyyy-mm-dd" or "mm/dd/yyyy").
      The date string must use the same separator.
  min_year, max_year : int
      Inclusive bounds for the year; the defaults match the 2006-2020 span
      the dataset is expected to cover.

  Returns
  -------
  bool
      False for None, blank strings, unparsable values, impossible dates
      (such as February 30th) and years outside the bounds; True otherwise.
  """
  if date is None:
    return False
  # datetime.datetime is a subclass of datetime.date, so this covers both.
  if isinstance(date, datetime.date):
    return min_year <= date.year <= max_year
  if not isinstance(date, str) or date.strip() == "":
    return False

  # Find the separator under which both the value and the layout have three parts.
  for separator in ("-", "/"):
    date_parts = date.split(separator)
    format_parts = format.split(separator)
    if len(date_parts) == 3 and len(format_parts) == 3:
      break
  else:
    return False

  try:
    # list.index raises ValueError for a missing token, int() for a bad number.
    year = int(date_parts[format_parts.index('yyyy')])
    month = int(date_parts[format_parts.index('mm')])
    day = int(date_parts[format_parts.index('dd')])
  except ValueError:
    return False

  if not (min_year <= year <= max_year):
    return False
  try:
    # Constructing the datetime is what rejects impossible month/day combinations.
    datetime.datetime(year, month, day)
  except (ValueError, OverflowError):
    return False
  return True

# 3. Module for time related columns

Similarly, let's validate the time as well. A valid time must fall within
the standard 24-hour day.

In [None]:
#Deleting invalid time
def valid_time_check(time):
  """Return True when `time` is a valid "HH:MM:SS" time of day.

  Parameters
  ----------
  time : str, datetime.time, datetime.datetime or None
      Value to validate. time/datetime objects always carry a valid time of
      day and are accepted as is; any other non-string value is rejected
      instead of raising.

  Returns
  -------
  bool
      False for None, blank strings, values that do not have exactly three
      integer components and out-of-range components; True otherwise.
      "24:00:00" is accepted and treated as midnight.
  """
  if time is None:
    return False
  if isinstance(time, (datetime.time, datetime.datetime)):
    return True
  if not isinstance(time, str) or time.strip() == "":
    return False

  parts = time.split(":")
  if len(parts) != 3:
    return False
  try:
    hour, mins, secs = (int(part) for part in parts)
  except ValueError:
    return False

  # Some sources write midnight as 24:00:00; normalise it to hour 0.
  if hour == 24 and mins == 0 and secs == 0:
    hour = 0
  try:
    # Constructing the time object is what enforces the hour/minute/second ranges.
    datetime.time(hour, mins, secs)
  except (ValueError, OverflowError):
    return False
  return True

# 4. Module for Age Group and Race columns
The module works for only those columns whose column names are passed

In [None]:
def refine_age_group_race(df_spark, victim_age_group=None, suspect_age_group=None, suspect_race=None, victim_race=None):
  """Replace nulls with "UNKNOWN" in the age-group and race columns that are named.

  Each keyword argument is a column name; arguments left as None (or empty)
  are skipped. Returns the resulting DataFrame.
  """
  # Same order as the arguments are handled: victim age group, suspect age
  # group, suspect race, victim race.
  requested_columns = (victim_age_group, suspect_age_group, suspect_race, victim_race)
  for column_name in requested_columns:
    if column_name:
      df_spark = df_spark.na.fill("UNKNOWN", subset=[column_name])
  return df_spark

# 5. Module for Gender and Age columns for suspects and victims

The module works for only those columns whose column names are passed

In [None]:
def refine_sex_gender_impute(df_spark, suspect_age=None, suspect_gender=None, victim_age=None, victim_gender=None):
  """Replace nulls in the age and gender columns that are named.

  Each keyword argument is a column name; arguments left as None (or empty)
  are skipped. Age columns are filled with "U" and gender columns with
  "UNKNOWN". Returns the resulting DataFrame.
  """
  # NOTE(review): refine_age_group_race fills age groups with "UNKNOWN", so
  # the "U" / "UNKNOWN" placeholders here may be swapped -- confirm against
  # the values actually present in the data before changing them.
  fill_plan = (
    (suspect_age, "U"),
    (victim_age, "U"),
    (suspect_gender, "UNKNOWN"),
    (victim_gender, "UNKNOWN"),
  )
  for column_name, placeholder in fill_plan:
    if column_name:
      df_spark = df_spark.na.fill(placeholder, subset=[column_name])
  return df_spark

# 6.a: Module for Precinct, Jurisdiction Code:
  dropping the null values

  The module works for only those columns whose column names are passed along with the df

In [None]:
def valid_precinct_check(precinct):
  """Return False when the precinct is missing (None, " " or ""), True otherwise."""
  return precinct not in (None, " ", "")

def valid_jur_check(jur):
  """Return False when the jurisdiction code is missing (None, " " or ""), True otherwise."""
  return jur not in (None, " ", "")

# 6.b Module for Reverse Geocoding the boroughs using latitudes and longitudes.

1. First we remove the rows whose latitude or longitude is null, since they cannot be geocoded (around 450 tuples removed).
2. Then, where the borough is empty (or is the ambiguous 'NEW YORK'), we take the latitude and longitude values and reverse geocode them using the helper "reverseGeoCoder".
3. Finally, we impute the retrieved borough name into the empty field.


### USING MASTER DATASET
When reverse geocoding, the geocoder gives us a zip code for each latitude/longitude pair. In turn, we use the master dataset of zip codes to retrieve the borough names.



NOTE: The dataset can be downloaded from : https://data.beta.nyc/en/dataset/pediacities-nyc-neighborhoods/resource/7caac650-d082-4aea-9f9b-3681d568e8a5

In [None]:
def reverse_geo_code_boros(df_spark, Latitude, Longitude, Boro, lat_index=None, long_index=None):
  """Fill in missing borough names by reverse geocoding each row's coordinates.

  Rows with a null latitude or longitude are dropped. Among the remaining
  rows, those whose borough is null or 'NEW YORK' get the borough column
  overwritten with the borough of the zip code that the module-level
  `geocoder` returns for their coordinates; every other row is kept as is.
  Zip codes are mapped to boroughs with the CSV at the module-level path
  `dst` (columns 'zip' and 'borough').

  Parameters
  ----------
  df_spark : pyspark.sql.DataFrame
      Input data.
  Latitude, Longitude, Boro : str
      Names of the latitude, longitude and borough columns.
  lat_index, long_index : optional
      Ignored. Kept so existing calls keep working; the coordinate columns
      are now addressed by name, because a positional index can silently
      point at a different column than the one named by `Latitude` /
      `Longitude`.

  Returns
  -------
  pyspark.sql.DataFrame
      Union of the untouched rows and the imputed rows.
  """
  has_coordinates = df_spark[Latitude].isNotNull() & df_spark[Longitude].isNotNull()
  needs_borough = df_spark[Boro].isNull() | (df_spark[Boro] == 'NEW YORK')

  #select data where we have to impute
  boro_cleaner = df_spark.filter(has_coordinates & needs_borough)

  print("___intializing Zip Code Look up ____")
  print("____ imputing the points ____")

  #use your path for master dataset here.
  df_zips = pd.read_csv(dst)
  zip_master = dict(zip(df_zips['zip'], df_zips['borough']))
  # Two zip codes are added by hand on top of the master dataset.
  zip_master[10020] = 'Manhattan'
  zip_master[11249] = 'Brooklyn'

  def reverseGeoCoder(latitude, longitude):
    """Return the upper-cased borough for a coordinate pair, or "UNKNOWN"."""
    loc = geocoder.reverse(str(latitude)+', '+str(longitude), timeout=1000)
    if loc is None:
      return "UNKNOWN"
    try:
      # The zip code is read as the last five characters of the third
      # comma-separated component of the address text.
      zip_code = int(str(loc).split(",")[2][-5:])
    except (IndexError, ValueError):
      # Address without that component or without a numeric zip code: one
      # odd address must not abort the whole Spark job.
      return "UNKNOWN"
    return zip_master.get(zip_code, "UNKNOWN").upper()

  #creating UD function
  ud_func = udf(reverseGeoCoder, StringType())
  boro_cleaned_dataframe = boro_cleaner.withColumn(
    Boro, ud_func(boro_cleaner[Latitude], boro_cleaner[Longitude]))

  #joining the imputed dataset to the maindataset and returning
  joiner_dataset = df_spark.filter(
    has_coordinates & df_spark[Boro].isNotNull() & (df_spark[Boro] != 'NEW YORK'))
  return joiner_dataset.union(boro_cleaned_dataframe)

NYPD Call for service is of 20M Rows of data.

The size of dataset ~ 20M tuples. 
So, we need around 500 data points for 95% confidence level 
with 10% interval. 

The size of data is almost 1% of the data. So we can get it into our df now

In [None]:
# Read the CSV with a header row and let Spark infer the column types.
df_spark=spark.read.option("header",True).csv(fn_dst,inferSchema=True)
# Keep roughly half of the rows; the fixed seed makes a fresh run draw the
# same sample instead of a different one each time.
df_spark=df_spark.sample(fraction=0.5, seed=42)
df_spark.count()

467

# PROFILING TO CHECK FOR NULL VALUES IN ALL THE COLUMNS

In [None]:
# Pull the sampled Spark DataFrame into pandas so it can be wrapped in an
# openclean stream for profiling.
pandasDF = df_spark.toPandas()
ds=stream(pandasDF)
#Creating profile of our dataset
profiles = ds.profile(default_profiler=DefaultColumnProfiler)
# Per-column summary; the "empty" column is what shows missing values.
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
objectid,467,0,467,1.0,8.867279
cad_evnt_id,467,0,467,1.0,8.867279
create_date,467,0,7,0.014989,2.315489
incident_date,467,0,7,0.014989,2.315489
incident_time,467,0,1,0.002141,0.0
nypd_pct_cd,467,0,3,0.006424,0.477639
boro_nm,467,0,2,0.004283,0.399457
patrl_boro_nm,467,0,2,0.004283,0.399457
geo_cd_x,467,0,236,0.505353,7.393382
geo_cd_y,467,0,236,0.505353,7.393382


## a. Select the columns that are common with the original dataset:
'cad_evnt_id', 

'incident_date', 

'incident_time', 

'boro_nm',

'latitude', 

'longitude',

'patrl_boro_nm'

**We can consider the primary key along with this**
**"cad_evnt_id"**


In [None]:
# Columns kept for cleaning.
interested_columns_1=['cad_evnt_id', 'incident_date', 'boro_nm', 'latitude', 'longitude', 'patrl_boro_nm']
# Note: this rebinds df_spark to the narrowed DataFrame; the full-width frame
# is no longer reachable under this name afterwards.
df_spark=get_area_of_interest(df_spark, interested_columns_1)

## b. Lets pass the dataset through the preprocessing pipeline

In [None]:
# Switch to the DataFrame's underlying RDD so the plain-Python validators can
# be applied with map/filter.
df_temp=df_spark.rdd

1. Date and Time

In [None]:
# pandasDF was created before the column selection, so this preview still
# shows every original column.
pandasDF.head()

Unnamed: 0,objectid,cad_evnt_id,create_date,incident_date,incident_time,nypd_pct_cd,boro_nm,patrl_boro_nm,geo_cd_x,geo_cd_y,radio_code,typ_desc,cip_jobs,add_ts,disp_ts,arrivd_ts,closng_ts,latitude,longitude
0,1168025,66231134,2020-02-17,2020-02-17,1899-12-30,75,BROOKLYN,PATROL BORO BKLYN NORTH,1019935,186241,75D,VISIBILITY PATROL: DIRECTED,Non CIP,2020-02-17 03:29:46,2020-02-17 03:29:46,2020-02-17 03:29:47,2020-02-17 08:06:06,40.677802,-73.871348
1,1168032,66245081,2020-02-17,2020-02-17,1899-12-30,75,BROOKLYN,PATROL BORO BKLYN NORTH,1020080,184471,75D,VISIBILITY PATROL: DIRECTED,Non CIP,2020-02-17 15:55:38,2020-02-17 15:55:38,2020-02-17 15:55:39,2020-02-17 16:56:56,40.672943,-73.870835
2,1168036,66237379,2020-02-17,2020-02-17,1899-12-30,75,BROOKLYN,PATROL BORO BKLYN NORTH,1020214,183053,75D,VISIBILITY PATROL: DIRECTED,Non CIP,2020-02-17 10:30:54,2020-02-17 10:30:55,2020-02-17 10:30:56,2020-02-17 11:12:55,40.66905,-73.870359
3,1168037,66235273,2020-02-17,2020-02-17,1899-12-30,75,BROOKLYN,PATROL BORO BKLYN NORTH,1020386,188964,75D,VISIBILITY PATROL: DIRECTED,Non CIP,2020-02-17 08:56:49,2020-02-17 08:56:50,2020-02-17 08:56:51,2020-02-17 09:24:52,40.685274,-73.869707
4,1168046,66228375,2020-02-17,2020-02-17,1899-12-30,75,BROOKLYN,PATROL BORO BKLYN NORTH,1020963,177905,75D,VISIBILITY PATROL: DIRECTED,Non CIP,2020-02-17 00:26:47,2020-02-17 00:26:47,2020-02-17 00:26:48,2020-02-17 01:49:56,40.654917,-73.867687


### From data profiling we found that the date and time has no NULL Values

In [None]:
# Keep only the rows whose incident_date (position 1 of each Row) passes
# valid_date_check; only the date is validated here.

df_temp_ = (
    df_temp
    .map(lambda row: (row, valid_date_check(row[1], "yyyy-mm-dd")))
    .filter(lambda checked: checked[1] == True)
)
# Drop the validity flag again and keep the original Row.
df_temp = df_temp_.map(lambda checked: checked[0])

In [None]:
# Peek at one surviving Row to confirm the filter kept the Row structure.
df_temp.take(1)

[Row(cad_evnt_id=66231134, incident_date=datetime.datetime(2020, 2, 17, 0, 0), boro_nm='BROOKLYN', latitude=40.67780157, longitude=-73.87134797, patrl_boro_nm='PATROL BORO BKLYN NORTH')]

3. Geocoding

In [None]:
# Geospatial attribute imputation.

df_temp=df_temp.toDF(schema=df_spark.schema)
# Look the coordinate positions up by name: with the selected column order,
# the previously hard-coded -2 / -1 pointed at 'longitude' and
# 'patrl_boro_nm' rather than at 'latitude' and 'longitude'.
lat_position = df_temp.columns.index('latitude')
long_position = df_temp.columns.index('longitude')
df_spk=reverse_geo_code_boros(df_temp, 'latitude', 'longitude', 'boro_nm', lat_position, long_position)

___intializing Zip Code Look up ____
____ imputing the points ____


Lets profile the data now.

In [None]:
# Profile the result of the date filter and borough imputation.
# Note: this rebinds pandasDF, ds and profiles from the earlier profiling cell.
pandasDF = df_spk.toPandas()
ds=stream(pandasDF)

#Creating profile of our dataset
profiles = ds.profile(default_profiler=DefaultColumnProfiler)
# Per-column summary; the "empty" column is what shows missing values.
profiles.stats()

Unnamed: 0,total,empty,distinct,uniqueness,entropy
cad_evnt_id,467,0,467,1.0,8.867279
incident_date,467,0,7,0.014989,2.315489
boro_nm,467,0,2,0.004283,0.399457
latitude,467,0,236,0.505353,7.393382
longitude,467,0,236,0.505353,7.393382
patrl_boro_nm,467,0,2,0.004283,0.399457
