In [1]:
import pandas as pd
import numpy as np
import os

year = 2024

os.chdir("../data_scripts/")

from to_hdf5 import read_hdf5, save_to_hdf5

os.chdir("../notebooks/")

In [2]:
# tmp = pd.read_parquet("/home/rjutr/big-data-project/data/parking_tickets/parquet/filtered/2024_filtered.parquet")

In [3]:
# save_to_hdf5(tmp, "/home/rjutr/big-data-project/data/parking_tickets/hdf5/filtered", "2024_filtered")

In [4]:
data = read_hdf5(f"../data/parking_tickets/hdf5/filtered/{year}_filtered.h5")
data.shape

(6984398, 39)

In [5]:
weather_data = pd.read_csv("../data/additional_data/weather/weather_NYC_2013_2024_processed.csv", sep=";")
weather_data["datetime"] = pd.to_datetime(weather_data["datetime"]).astype(np.int64) // 10**6

ms_data = pd.read_csv("../data/additional_data/schools/middle_schools_NYC_2021_processed.csv")
hs_data = pd.read_csv("../data/additional_data/schools/high_schools_NYC_2021_processed.csv")
li_data = pd.read_csv("../data/additional_data/landmarks/landmarks_NYC_individual_processed.csv")
ls_data = pd.read_csv("../data/additional_data/landmarks/landmarks_NYC_scenic_processed.csv")

In [10]:
import h5py
import numpy as np

def process_type(type):
    if type == np.int64 or type == np.int32:
        return "<i8"
    if type == np.dtype("O") or type == "string":
        return h5py.string_dtype(encoding="utf-8")
    if type == np.float64:
        return "float"
    raise ValueError(f"Unknown type {type}")

def save_to_hdf5_test(data_processed, dropoff, filename):
    data_types = [
        (name, process_type(type)) for name, type in data_processed.dtypes.items()
    ]
    array = np.empty(len(data_processed), dtype=data_types)
    for i, column in enumerate(data_processed.columns):
        array[column] = data_processed[column]
    print(array)
    print(data_types)
    # with h5py.File(os.path.join(dropoff, f"{filename}.h5"), "w") as h5df:
    #     h5df.create_dataset("data", data=array, compression="gzip", compression_opts=9)

In [7]:
from rtree.index import Index
from tqdm import tqdm
from haversine import haversine


def get_nearest_location(idx, lat, lang):
    hit = list(idx.nearest((lat, lang, lat, lang), 1, objects=True))[0].object
    return (hit["name"], haversine((lat, lang), (hit["lat"], hit["long"])))


def computational_wrapper(row, lat_i, long_i, idx):
    return get_nearest_location(idx, row[lat_i], row[long_i])


def process_merge(
    data, augment_data, new_name, distance_name, name="name", save_location=None
):
    if os.path.exists(save_location):
        return read_hdf5(save_location)
    idx = Index()
    for i, row in tqdm(
        enumerate(augment_data.iterrows()),
        desc="Builing location index",
        total=augment_data.shape[0],
    ):
        row = row[1]
        idx.insert(
            i,
            (row["Latitude"], row["Longitude"], row["Latitude"], row["Longitude"]),
            obj={"name": row[name], "lat": row["Latitude"], "long": row["Longitude"]},
        )

    lat_i = data.columns.tolist().index("Latitude") + 1
    long_i = data.columns.tolist().index("Longitude") + 1
    res = []

    for row in tqdm(
        data.itertuples(), total=data.shape[0], desc="Generating reference dataframe"
    ):
        res.append(computational_wrapper(row, lat_i, long_i, idx))

    res = pd.DataFrame(
        res,
        columns=[new_name, distance_name],
        index=data.index,
    )

    data = data.merge(res, how="left", left_index=True, right_index=True)
    file_loc_split = save_location.split("/")
    save_to_hdf5(data, "/".join(file_loc_split[:-1]), file_loc_split[-1].split(".")[0])
    return data

In [11]:
if os.path.exists(f"../data/augmented_data/tickets_weather_{year}.h5"):
    data_w_weather = read_hdf5(f"../data/augmented_data/tickets_weather_{year}.h5")
else:
    data_w_weather = data.merge(weather_data, how="left", left_on="Issue Date", right_on="datetime")
    save_to_hdf5_test(data_w_weather, f"../data/augmented_data/", f"tickets_weather_{year}")

[(1252994175, b'MBH9245', b'PA', b'PAS', 1688774400000, 40, b'SDN', b'KIA', b'M', 12690, 41700, 61090, 20231231, 108, 108, 968, 272834, b'0968', b'0000', b'1150P', b'Queens', b'F', b'39-41', b'60TH ST', b'', 0, 408, b'D', b'', b'BBBBBBB', b'ALL', b'ALL', b'WHITE',    0, 0, b'', b'', 40.7302249, -73.8594073, 1.6887744e+12, 30.7, 23.9, 25.9, 'Partially cloudy', 80.1, 17.8, 15.4)
 (1326049732, b'KXG5820', b'NY', b'PAS', 1688342400000, 14, b'SDN', b'HONDA', b'P', 59590,  9440,     0, 20240428, 109, 109, 109, 975295, b'0109', b'0000', b'0834P', b'Queens', b'', b'', b'PRINCE STREET', b'40 ROAD', 0, 408, b'E2', b'', b'BBBBBBB', b'ALL', b'ALL', b'GREY', 2007, 0, b'', b'', 40.7586145, -73.8316913, 1.6883424e+12, 31.1, 23.3, 26.9, 'Rain, Partially cloudy', 71.3, 14.7, 15.7)
 (1377539143, b'KSF8886', b'NY', b'PAS', 1688601600000, 46, b'DELV', b'JEEP', b'P', 24690, 36670, 36690, 20231017,  32,  32,  32, 972348, b'0032', b'0000', b'0940P', b'Manhattan', b'F', b'703', b'LENOX AVE', b'', 0, 408, b'E2

In [None]:
%%time
data_w_w_ms = process_merge(data_w_weather, ms_data, "Closest Middle School", "Distance to CMS", "name", f"../data/augmented_data/tickets_w_ms_{year}.h5")
print(data_w_w_ms.shape)

In [7]:
del data_w_weather

In [8]:
%%time
data_w_w_ms_hs = process_merge(data_w_w_ms, hs_data, "Closest High School", "Distance to CHS", "school_name", f"../data/augmented_data/tickets_w_ms_hs_{year}.h5")
print(data_w_w_ms_hs.shape)

NameError: name 'data_w_w_ms' is not defined

In [9]:
del data_w_w_ms

NameError: name 'data_w_w_ms' is not defined

In [None]:
%%time
data_w_w_ms_hs_li = process_merge(data_w_w_ms_hs, li_data, "Closest Individual Landmark", "Distance to CIL", "LPC_NAME", f"../data/augmented_data/tickets_w_ms_hs_li_{year}.h5")
print(data_w_w_ms_hs_li.shape)

(8096326, 53)
CPU times: user 43.1 s, sys: 7.08 s, total: 50.2 s
Wall time: 1min 3s


In [None]:
del data_w_w_ms_hs

In [None]:
%%time
data_w_w_ms_hs_li_ls = process_merge(data_w_w_ms_hs_li, ls_data, "Closest Scenic Landmark", "Distance to CIS", "SCEN_LM_NA", f"../data/augmented_data/tickets_w_ms_hs_li_ls_{year}.h5")
print(data_w_w_ms_hs_li_ls.shape)

(8096326, 55)
CPU times: user 44 s, sys: 7.62 s, total: 51.6 s
Wall time: 1min 5s


In [None]:
del data_w_w_ms_hs_li

In [None]:
def get_nearest_locations(idx, lat, lang, n):
    return [
        item.object
        for item in list(idx.nearest((lat, lang, lat, lang), n, objects=True))
    ]


def computational_wrapper(row, lat_i, long_i, idx, time_i, n=1):
    locations = pd.DataFrame(get_nearest_locations(idx, row[lat_i], row[long_i], n))
    locations = locations[
        (locations["active_from"] <= row[time_i])
        & (locations["active_to"] >= row[time_i])
    ]
    if locations.empty:
        return computational_wrapper(row, lat_i, long_i, idx, time_i, n * 2)
    return (
        locations.iloc[0]["name"],
        locations.iloc[0]["industry"],
        haversine(
            (row[lat_i], row[long_i]),
            (locations.iloc[0]["lat"], locations.iloc[0]["long"]),
        ),
    )


def process_merge_2(
    data, augment_data, new_name, distance_name, name="name", save_location=None
):
    if os.path.exists(save_location):
        return read_hdf5(save_location)
    idx = Index()
    for i, row in tqdm(enumerate(augment_data.iterrows()), total=augment_data.shape[0]):
        row = row[1]
        idx.insert(
            i,
            (row["Latitude"], row["Longitude"], row["Latitude"], row["Longitude"]),
            obj={
                "name": row[name],
                "industry": row["Industry"],
                "lat": row["Latitude"],
                "long": row["Longitude"],
                "active_from": pd.Timestamp(
                    row["License Creation Date"], unit="ms"
                ).timestamp()
                * 1000,
                "active_to": pd.Timestamp(
                    row["License Expiration Date"], unit="ms"
                ).timestamp()
                * 1000,
            },
        )

    lat_i = data.columns.tolist().index("Latitude") + 1
    long_i = data.columns.tolist().index("Longitude") + 1
    time_i = data.columns.tolist().index("Issue Date") + 1
    res = []

    for row in tqdm(
        data.itertuples(), total=data.shape[0], desc="Generating reference dataframe"
    ):
        res.append(computational_wrapper(row, lat_i, long_i, idx, time_i))

    res = pd.DataFrame(
        res,
        columns=[new_name, "Industry of CB", distance_name],
        index=data.index,
    )

    data = data.merge(res, how="left", left_index=True, right_index=True)
    file_loc_split = save_location.split("/")
    save_to_hdf5(data, "/".join(file_loc_split[:-1]), file_loc_split[-1].split(".")[0])
    return data

In [None]:
b_data = pd.read_csv("../data/additional_data/businesses/businesses_NYC_2023_processed.csv")

In [None]:
%%time
data_w_ms_hs_li_ls_b= process_merge_2(data_w_w_ms_hs_li_ls, b_data, "Closest Business", "Distance to CB", "Business Name", f"../data/augmented_data/tickets_w_ms_hs_li_ls_b_{year}.h5")
print(data_w_ms_hs_li_ls_b.shape)

100%|██████████| 162222/162222 [00:11<00:00, 13886.64it/s]
Generating reference dataframe: 100%|██████████| 8096326/8096326 [2:22:43<00:00, 945.42it/s]   


(8096326, 58)
CPU times: user 2h 39min 43s, sys: 10.5 s, total: 2h 39min 54s
Wall time: 2h 39min 48s
