In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
data = pd.read_parquet("../data/parking_tickets/parquet/filtered/2015_filtered.parquet")

In [3]:
data.shape

(8299895, 39)

In [4]:
weather_data = pd.read_csv("../data/additional_data/weather/weather_NYC_2013_2024_processed.csv", sep=";")
ms_data = pd.read_csv("../data/additional_data/schools/middle_schools_NYC_2021_processed.csv")
hs_data = pd.read_csv("../data/additional_data/schools/high_schools_NYC_2021_processed.csv")
li_data = pd.read_csv("../data/additional_data/landmarks/landmarks_NYC_individual_processed.csv")
ls_data = pd.read_csv("../data/additional_data/landmarks/landmarks_NYC_scenic_processed.csv")

In [5]:
weather_data["datetime"] = pd.to_datetime(weather_data["datetime"]).astype(np.int64) // 10**6

In [6]:
from rtree.index import Index
from tqdm import tqdm


from haversine import haversine


def get_nearest_location(idx, lat, lang):
    hit = list(idx.nearest((lat, lang, lat, lang), 1, objects=True))[0].object
    return (hit["name"], haversine((lat, lang), (hit["lat"], hit["long"])))


def computational_wrapper(row, lat_i, long_i, idx):
    return get_nearest_location(idx, row[lat_i], row[long_i])


def process_merge(data, augment_data, new_name, distance_name, name="name", save_location=None):
    if os.path.exists(save_location):
        return pd.read_parquet(save_location)
    idx = Index()
    for i, row in tqdm(
        enumerate(augment_data.iterrows()),
        desc="Builing location index",
        total=augment_data.shape[0],
    ):
        row = row[1]
        idx.insert(
            i,
            (row["Latitude"], row["Longitude"], row["Latitude"], row["Longitude"]),
            obj={"name": row[name], "lat": row["Latitude"], "long": row["Longitude"]},
        )

    lat_i = data.columns.tolist().index("Latitude") + 1
    long_i = data.columns.tolist().index("Longitude") + 1
    res = []

    for row in tqdm(
        data.itertuples(), total=data.shape[0], desc="Generating reference dataframe"
    ):
        res.append(computational_wrapper(row, lat_i, long_i, idx))

    res = pd.DataFrame(
        res,
        columns=[new_name, distance_name],
        index=data.index,
    )

    data = data.merge(res, how="left", left_index=True, right_index=True)
    data.to_parquet(
        save_location
    )
    return data

In [7]:
%%time
if os.path.exists("../data/augmented_data/tickets_weather_2015.parquet"):
    data_w_weather = pd.read_parquet("../data/augmented_data/tickets_weather_2015.parquet")
else:
    data_w_weather = data.merge(weather_data, how="left", left_on="Issue Date", right_on="datetime")
    data_w_weather.to_parquet("../data/augmented_data/tickets_weather_2015.parquet")

CPU times: total: 24 s
Wall time: 28.6 s


In [8]:
data_w_weather.shape

(8299895, 47)

In [9]:
%%time
data_w_w_ms = process_merge(data_w_weather, ms_data, "Closest Middle School", "Distance to CMS", "name", "../data/augmented_data/tickets_w_ms_2015.parquet")

CPU times: total: 35.9 s
Wall time: 9.67 s


In [10]:
data_w_w_ms.shape

(8299895, 49)

In [11]:
%%time
data_w_w_ms_hs = process_merge(data_w_w_ms, hs_data, "Closest High School", "Distance to CHS", "school_name", "../data/augmented_data/tickets_w_ms_hs_2015.parquet")

KeyboardInterrupt: 

In [None]:
data_w_w_ms_hs.shape

(8299895, 51)

In [None]:
%%time
data_w_w_ms_hs_li = process_merge(data_w_w_ms_hs, li_data, "Closest Individual Landmark", "Distance to CIL", "LPC_NAME", "../data/augmented_data/tickets_w_ms_hs_li_2015.parquet")

Builing location index: 100%|██████████| 1531/1531 [00:00<00:00, 10372.48it/s]
Generating reference dataframe: 100%|██████████| 8299895/8299895 [14:10<00:00, 9757.97it/s] 


CPU times: total: 14min 44s
Wall time: 14min 50s


In [None]:
data_w_w_ms_hs_li.shape

(8299895, 53)

In [None]:
%%time
data_w_w_ms_hs_li_ls = process_merge(data_w_w_ms_hs_li, ls_data, "Closest Scenic Landmark", "Distance to CIS", "SCEN_LM_NA", "../data/augmented_data/tickets_w_ms_hs_li_ls_2015.parquet")

Builing location index: 100%|██████████| 11/11 [00:00<00:00, 5170.61it/s]
Generating reference dataframe: 100%|██████████| 8299895/8299895 [07:51<00:00, 17603.12it/s]


CPU times: total: 8min 26s
Wall time: 8min 33s


In [None]:
data_w_w_ms_hs_li_ls.shape

(8299895, 55)

In [None]:
def get_nearest_locations(idx, lat, lang, n):
    return [
        item.object
        for item in list(idx.nearest((lat, lang, lat, lang), n, objects=True))
    ]


def computational_wrapper(row, lat_i, long_i, idx, time_i, n=1):
    locations = pd.DataFrame(get_nearest_locations(idx, row[lat_i], row[long_i], n))
    locations = locations[
        (locations["active_from"] <= row[time_i])
        & (locations["active_to"] >= row[time_i])
    ]
    if locations.empty:
        return computational_wrapper(row, lat_i, long_i, idx, time_i, n * 2)
    return (
        locations.iloc[0]["name"],
        locations.iloc[0]["industry"],
        haversine(
            (row[lat_i], row[long_i]),
            (locations.iloc[0]["lat"], locations.iloc[0]["long"]),
        ),
    )


def process_merge_2(
    data, augment_data, new_name, distance_name, name="name", save_location=None
):
    if os.path.exists(save_location):
        return pd.read_parquet(save_location)
    idx = Index()
    for i, row in tqdm(enumerate(augment_data.iterrows()), total=augment_data.shape[0]):
        row = row[1]
        idx.insert(
            i,
            (row["Latitude"], row["Longitude"], row["Latitude"], row["Longitude"]),
            obj={
                "name": row[name],
                "industry": row["Industry"],
                "lat": row["Latitude"],
                "long": row["Longitude"],
                "active_from": pd.Timestamp(
                    row["License Creation Date"], unit="ms"
                ).timestamp()
                * 1000,
                "active_to": pd.Timestamp(
                    row["License Expiration Date"], unit="ms"
                ).timestamp()
                * 1000,
            },
        )

    lat_i = data.columns.tolist().index("Latitude") + 1
    long_i = data.columns.tolist().index("Longitude") + 1
    time_i = data.columns.tolist().index("Issue Date") + 1
    res = []

    for row in tqdm(
        data.itertuples(), total=data.shape[0], desc="Generating reference dataframe"
    ):
        res.append(computational_wrapper(row, lat_i, long_i, idx, time_i))

    res = pd.DataFrame(
        res,
        columns=[new_name, "Industry of CB", distance_name],
        index=data.index,
    )

    data = data.merge(res, how="left", left_index=True, right_index=True)
    data.to_parquet(
        save_location
    )
    return data

In [None]:
b_data = pd.read_csv("../data/additional_data/businesses/businesses_NYC_2023_processed.csv")

In [None]:
%%time
data_w_ms_hs_li_ls_b= process_merge_2(data_w_w_ms_hs_li_ls, b_data, "Closest Business", "Distance to CB", "Business Name", "../data/augmented_data/tickets_w_ms_hs_li_ls_b_2015.parquet")

100%|██████████| 172332/172332 [00:25<00:00, 6711.34it/s]
Generating reference dataframe:   1%|          | 44706/8299895 [01:21<4:09:48, 550.78it/s] 


KeyboardInterrupt: 

In [None]:
data_w_ms_hs_li_ls_b.shape