In [1]:
import dask.dataframe as dd
from haversine import haversine
from rtree.index import Index
import pandas as pd

In [2]:
# `tickets` is a dask DataFrame read from parquet; `middle_schools` is a plain
# pandas DataFrame read from CSV.
tickets = dd.read_parquet(
    path='../data/parking_tickets/parquet/full_data_cleaned.parquet',
)
middle_schools = pd.read_csv(
    filepath_or_buffer='../data/additional_data/schools/middle_schools_NYC_2021_processed.csv',
)

In [3]:
# Small sample of the tickets frame used to prototype the lookup below
# (n=5 is the default size of head()).
tmp = tickets.head(n=5)

In [4]:
def create_rtree_index(df, name):
    """Build a spatial index with one point entry per row of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``"Latitude"`` and ``"Longitude"`` columns plus the
        column named by ``name``.
    name : str
        Column whose value is stored as the entry's ``"name"``.

    Returns
    -------
    Index
        Each row is inserted under its running position (0, 1, 2, ...) as a
        degenerate box ``(lat, long, lat, long)``; the attached object is a
        dict with the keys ``"name"``, ``"lat"`` and ``"long"``.
    """
    spatial_index = Index()
    for position, (_, record) in enumerate(df.iterrows()):
        latitude = record["Latitude"]
        longitude = record["Longitude"]
        payload = {"name": record[name], "lat": latitude, "long": longitude}
        # Coordinates go in as (Latitude, Longitude); queries must use the
        # same order.
        point_box = (latitude, longitude, latitude, longitude)
        spatial_index.insert(position, point_box, obj=payload)
    return spatial_index


# Spatial index over the middle schools; each entry carries the school's
# "name" value and its coordinates.
middle_schools_idx = create_rtree_index(df=middle_schools, name="name")

In [5]:
def get_nearest_location(idx, lat, lang):
    """Return the indexed location closest to a query point.

    Parameters
    ----------
    idx
        Spatial index exposing ``nearest(bounds, num_results, objects=True)``
        whose hits carry an ``object`` dict with ``"name"``, ``"lat"`` and
        ``"long"`` keys (the shape produced by ``create_rtree_index``).
    lat
        Latitude of the query point.
    lang
        Longitude of the query point (the parameter name is kept unchanged
        for existing callers).

    Returns
    -------
    tuple
        ``(name, distance)`` where ``distance`` is
        ``haversine((lat, lang), (hit["lat"], hit["long"]))``.

    Raises
    ------
    IndexError
        If the index yields no hit (for example, it is empty).
    """
    # NOTE(review): the index ranks candidates in raw coordinate space, which
    # may not match haversine ordering exactly -- confirm this is acceptable.
    hits = idx.nearest((lat, lang, lat, lang), 1, objects=True)
    # Take only the first hit instead of materialising the whole iterator.
    nearest = next(iter(hits), None)
    if nearest is None:
        # Same exception type the old `list(...)[0]` raised, but with a
        # message that says what actually went wrong.
        raise IndexError("spatial index is empty; no nearest location available")
    hit = nearest.object
    return (hit["name"], haversine((lat, lang), (hit["lat"], hit["long"])))

def computational_wrapper(row, lat_i, long_i, idx):
    """Look up the nearest indexed location for one row and record it.

    Parameters
    ----------
    row
        Indexable record; ``row[0]`` is used as the result key and
        ``row[lat_i]`` / ``row[long_i]`` as the query coordinates.
    lat_i, long_i
        Positions of the latitude and longitude values inside ``row``.
    idx
        Spatial index passed through to ``get_nearest_location``.

    Returns
    -------
    tuple
        The ``(name, distance)`` pair from ``get_nearest_location``. The same
        pair is also stored in the module-level ``res`` mapping under
        ``row[0]``, so callers that rely on the side effect keep working.
    """
    nearest = get_nearest_location(idx, row[lat_i], row[long_i])
    # `res` is a notebook-level dict that must exist before this is called.
    res[row[0]] = nearest
    # Returning the pair lets callers collect results directly instead of
    # receiving None.
    return nearest

In [6]:
# itertuples() (used in the parallel lookup cell) yields the row index at
# position 0, so every column position is shifted by one.
ticket_columns = tmp.columns.tolist()
lat_i = 1 + ticket_columns.index("Latitude")
long_i = 1 + ticket_columns.index("Longitude")

In [7]:
import joblib as jl
from tqdm_joblib import tqdm_joblib as tjl

# Shared result store: computational_wrapper writes res[row[0]] for every row.
res = {}

# Threads with shared memory are requested because each task writes into the
# notebook-level `res` dict rather than returning its result.
parallel_runner = jl.Parallel(n_jobs=32, require="sharedmem", prefer="threads")

with tjl(desc="Calculating violation coordinates", total=len(tmp)) as progress_bar:
    parallel_runner(
        jl.delayed(computational_wrapper)(row, lat_i, long_i, middle_schools_idx)
        for row in tmp.itertuples()
    )

  from tqdm.autonotebook import tqdm


Calculating violation coordinates:   0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
# Convert the raw {row index: (school name, distance)} dict into a table.
# The isinstance guard keeps this cell idempotent: re-running it after `res`
# has already become a DataFrame no longer transposes the finished table.
if isinstance(res, dict):
    # orient="index" builds one row per key directly, so each column gets its
    # own inferred dtype (a transpose would leave the distances as object).
    res = pd.DataFrame.from_dict(res, orient="index")

In [9]:
# Display the nearest-school lookup results (one row per sampled ticket).
res

Unnamed: 0,0,1
7443,Spruce Street School,0.473964
7733,Lower Manhattan Community Middle School,0.316734
8325,Hudson River Middle School,0.435904
7285,Spruce Street School,0.473964
8322,M.S. 936,0.616084


In [24]:
# Row-wise lookup on a small sample. get_nearest_location is called directly
# because computational_wrapper needs (row, lat_i, long_i, idx) and indexes the
# row by position, which does not fit the label-indexed rows that apply passes.
# result_type="expand" spreads the returned (name, distance) tuple over two
# columns.
small_sample = tickets.head().apply(
    lambda row: get_nearest_location(
        middle_schools_idx, row["Latitude"], row["Longitude"]
    ),
    axis=1,
    result_type="expand",
)

In [23]:
# Attach the lookup results to the sampled tickets by matching row indexes.
pd.merge(tmp, res, left_index=True, right_index=True)

Unnamed: 0,Summons Number,Plate ID,Registration State,Plate Type,Issue Date,Violation Code,Vehicle Body Type,Vehicle Make,Issuing Agency,Street Code1,...,To Hours In Effect,Vehicle Color,Vehicle Year,Feet From Curb,Violation Post Code,Violation Description,Latitude,Longitude,school,distance
7285,1351801788,41666JM,NY,COM,1372723200000,14,VAN,,P,45440,...,ALL,,0,0,,,40.707333,-74.007828,,
7443,1351801790,99208MC,NY,COM,1372723200000,14,VAN,,P,45440,...,ALL,,0,0,,,40.707333,-74.007828,,
7733,1351801806,49965JG,NY,COM,1372723200000,14,VAN,,P,13590,...,ALL,,0,0,,,40.70362,-74.010706,,
8322,1291521665,YANKYJAM,NY,PAS,1372723200000,46,SDN,BMW,F,5430,...,ALL,BLK,0,0,,,40.645503,-74.013282,,
8325,1335816963,XX865Y,NJ,PAS,1372723200000,78,DELV,CHEVR,P,24790,...,ALL,WHITE,0,0,,,40.71965,-74.010185,,
