In [1]:
import dask.dataframe as dd
from haversine import haversine

high_schools = dd.read_csv("../data/additional_data/schools/high_schools_NYC_2021_processed.csv")
middle_schools = dd.read_csv("../data/additional_data/schools/middle_schools_NYC_2021_processed.csv").compute()

In [2]:
high_schools = high_schools.repartition(npartitions=1)

In [3]:
import numpy as np
import dask

test = None

results = []

for index, row in high_schools.iterrows():
    tmp = row.to_frame().T.assign(key=0).set_index("key")
    tmp = tmp.merge(middle_schools.assign(key=0).set_index("key"), how="inner", left_index=True, right_index=True)
    tmp = tmp.assign(distance=tmp.apply(lambda row: haversine((row['Latitude_x'], row['Longitude_x']), (row['Latitude_y'], row['Longitude_y'])), axis=1))
    tmp = tmp.sort_values("distance").groupby(['school_name']).first().reset_index()
    results.append((tmp["name"].values[0], tmp["distance"].values[0]))

results = np.array(results)

high_schools = high_schools.assign(Closest_Middle_School=dask.array.from_array(results[:, 0]))
high_schools = high_schools.assign(Distance_to_CMS=dask.array.from_array(results[:, 1]))
high_schools = high_schools.reset_index()

In [4]:
high_schools.compute()

Unnamed: 0,index,school_name,Latitude,Longitude,Closest_Middle_School,Distance_to_CMS
0,0,Orchard Collegiate Academy,40.713684,-73.986336,University Neighborhood Middle School,0.0
1,1,University Neighborhood High School,40.712399,-73.984497,P.S. 184m Shuang Wen,0.19657732222615024
2,2,East Side Community School,40.729589,-73.982555,East Side Community School,0.0
3,3,Forsyth Satellite Academy,40.722520,-73.990728,School for Global Leaders,0.47956312756173775
4,4,"New Explorations into Science, Technology and ...",40.718895,-73.979308,"New Explorations into Science, Technology & Math",0.0
...,...,...,...,...,...,...
437,437,EBC High School for Public Service - Bushwick,40.694507,-73.929107,Madiba Prep Middle School,0.34501750725612484
438,438,"Brooklyn School for Social Justice, The",40.697185,-73.911170,All City Leadership Secondary School,0.16994244445222528
439,439,"Academy of Urban Planning and Engineering, The",40.697185,-73.911170,All City Leadership Secondary School,0.16994244445222528
440,440,All City Leadership Secondary School,40.697370,-73.913171,All City Leadership Secondary School,0.0


In [3]:
merged = dd.map_partitions(
    lambda df: df.assign(key=0).set_index("key").merge(middle_schools.assign(key=0).set_index("key"), how="left", left_index=True, right_index=True),
    high_schools,
)

merged = merged.map_partitions(
    lambda df: df.assign(distance=df.apply(lambda row: haversine((row['Latitude_x'], row['Longitude_x']), (row['Latitude_y'], row['Longitude_y'])), axis=1))
)

merged = merged.map_partitions(
    lambda df: df.sort_values("distance").groupby(['school_name']).first()
)

merged.compute()

Unnamed: 0_level_0,Latitude_x,Longitude_x,name,Latitude_y,Longitude_y,distance
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Orchard Collegiate Academy,40.713684,-73.986336,University Neighborhood Middle School,40.713684,-73.986336,0.000000
University Neighborhood High School,40.712399,-73.984497,P.S. 184m Shuang Wen,40.711549,-73.986542,0.196577
East Side Community School,40.729589,-73.982555,East Side Community School,40.729589,-73.982555,0.000000
Forsyth Satellite Academy,40.722520,-73.990728,School for Global Leaders,40.720581,-73.985645,0.479563
"New Explorations into Science, Technology and Math High School (NEST+m)",40.718895,-73.979308,"New Explorations into Science, Technology & Math",40.718895,-73.979308,0.000000
...,...,...,...,...,...,...
EBC High School for Public Service - Bushwick,40.694507,-73.929107,Madiba Prep Middle School,40.692126,-73.931731,0.345018
"Brooklyn School for Social Justice, The",40.697185,-73.911170,All City Leadership Secondary School,40.697370,-73.913171,0.169942
"Academy of Urban Planning and Engineering, The",40.697185,-73.911170,All City Leadership Secondary School,40.697370,-73.913171,0.169942
All City Leadership Secondary School,40.697370,-73.913171,All City Leadership Secondary School,40.697370,-73.913171,0.000000


In [5]:
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=1, threads_per_worker=16, memory_target_fraction=0.95, memory_limit='64GB')
client = Client(cluster)

tickets = dd.read_parquet("../data/parking_tickets/parquet/full_data_cleaned.parquet").repartition(npartitions=1)

In [9]:
from tqdm.notebook import tqdm
import numpy as np
import dask

test = None

results = []

for index, row in tqdm(tickets.iterrows()):
    tmp = row.to_frame().T.assign(key=0).set_index("key")
    tmp = tmp.merge(middle_schools.assign(key=0).set_index("key"), how="inner", left_index=True, right_index=True)
    tmp = tmp.assign(distance=tmp.apply(lambda row: haversine((row['Latitude_x'], row['Longitude_x']), (row['Latitude_y'], row['Longitude_y'])), axis=1))
    tmp = tmp.sort_values("distance").groupby(['school_name']).first().reset_index()
    results.append((tmp["name"].values[0], tmp["distance"].values[0]))

results = np.array(results)

tickets = tickets.assign(Closest_Middle_School=dask.array.from_array(results[:, 0]))
tickets = tickets.assign(Distance_to_CMS=dask.array.from_array(results[:, 1]))
tickets = tickets.reset_index()

0it [00:00, ?it/s]

In [None]:
tickets.compute()

In [5]:
merged = dd.map_partitions(
    lambda df: df.assign(key=0).set_index("key").merge(middle_schools.assign(key=0).set_index("key"), how="left", left_index=True, right_index=True),
    tickets,
)

In [6]:
merged = merged.map_partitions(
    lambda df: df.assign(distance=df.apply(lambda row: haversine((row['Latitude_x'], row['Longitude_x']), (row['Latitude_y'], row['Longitude_y'])), axis=1))
)

In [7]:
merged = merged.map_partitions(
    lambda df: df.sort_values("distance").groupby(['Summons Number']).first()
)