In [1]:
import urllib
from pathlib import Path
from zipfile import ZipFile

import polars as pl

In [None]:
# Remote location of the NA.cnode file that the next cell loads.
url = "https://www.cs.utah.edu/~lifeifei/research/tpq/NA.cnode"

In [None]:
# Read the file directly from the URL as a space-separated table with no header row.
# NOTE(review): presumably one row per road-network node — confirm against the dataset description.
na_road_network_df = pl.read_csv(url, has_header = False, sep = ' ')

In [None]:
# Number of rows loaded from NA.cnode.
len(na_road_network_df)

In [None]:
# Summary statistics for every column.
na_road_network_df.describe()

In [None]:
# Note: these values are normalized; it is still unclear what the normalization is
# and how to convert them back into ordinary latitude/longitude values.
na_road_network_df.head()

In [None]:
# Same row count as the earlier cell (duplicate check).
len(na_road_network_df)

In [None]:
# Load the cal.cnode and cal.cedge files from the same site as two space-separated,
# header-less tables (named here as the node table and the edge table).
cal_nodes_df = pl.read_csv("https://www.cs.utah.edu/~lifeifei/research/tpq/cal.cnode", has_header = False, sep = " ")
cal_edges_df = pl.read_csv("https://www.cs.utah.edu/~lifeifei/research/tpq/cal.cedge", has_header = False, sep = " ")

In [None]:
# Row counts of the node table and the edge table.
len(cal_nodes_df), len(cal_edges_df)

In [None]:
# First rows of the node table.
cal_nodes_df.head()

In [None]:
# The unit of column 4 (the L2 distance) is still unknown.
cal_edges_df.head()

In [None]:
# Load the three local CSV exports (GPS trip, road network, ground-truth route).
# The files are looked up in the current user's Downloads folder instead of a
# hard-coded absolute home directory, so the cell also runs on other machines.
downloads_dir = Path.home() / "Downloads"
trip_df = pl.read_csv(downloads_dir / "gps_data.csv")
road_network_df = pl.read_csv(downloads_dir / "road_network.csv")
gt_df = pl.read_csv(downloads_dir / "ground_truth_route.csv")

In [None]:
# First rows of the GPS trip table.
trip_df.head()

In [None]:
# Number of rows in the GPS trip table.
len(trip_df)

In [None]:
# Summary statistics for every column of the GPS trip table.
trip_df.describe()

In [None]:
# Display the GPS trip table.
trip_df

In [None]:
# Display the road network table.
road_network_df

In [None]:
# Display the ground-truth route table.
gt_df

# TEST MAP_MATCHER AND ROAD_NETWORK DATASETS

In [2]:
def load_map_matching_tables(zip_path):
    """
    Read the map-matching archive and stack its per-folder files into four tables.

    route table: contains the 0-indexed row number referring to an arc in the arcs table.  Ordered sequence of arcs on which the vehicle traveled.
    track table: contains the longitude, latitude, and timestamp of each GPS point for a given vehicle trip.
    nodes table: contains the longitude and latitude of each node in the road network.
    arcs table: contains the 0-indexed row number of the source node and target node for each arc in the road network.

    Every table gets an "id" column holding the integer parsed from the first path
    component of the archive member it came from. The nodes and arcs tables also get
    a 0-indexed "node_id" / "arc_id" column that restarts at 0 for every file.
    Members whose first path component is not an integer, or whose name has none of
    the four extensions, are skipped.

    Parameters:
        zip_path: path of the zip archive to read.

    Returns:
        (route_df, track_df, nodes_df, arcs_df) as polars DataFrames, rows in
        archive-listing order.

    Raises:
        ValueError: if the archive holds no file at all for one of the four tables.
    """
    # extension -> (column names in file order,
    #               name of the per-file row index column to add or None,
    #               final column order)
    table_specs = {
        ".route": (['arc_id'], None, ['id', 'arc_id']),
        ".track": (['long', 'lat', 'timestamp'], None, ['id', 'long', 'lat', 'timestamp']),
        ".nodes": (['long', 'lat'], "node_id", ['id', 'node_id', 'long', 'lat']),
        ".arcs": (['source_node_id', 'target_node_id'], "arc_id", ['id', 'arc_id', 'source_node_id', 'target_node_id']),
    }
    frames = {suffix: [] for suffix in table_specs}

    # Open the archive once and close it deterministically, instead of
    # re-opening it for every member that is read.
    with ZipFile(zip_path) as archive:
        for member in archive.namelist():
            try:
                trip_id = int(member.split("/")[0])
            except ValueError:
                # Not inside a numerically named folder: not part of the dataset.
                continue
            suffix = next((s for s in table_specs if member.endswith(s)), None)
            if suffix is None:
                continue
            columns, row_id_name, _ = table_specs[suffix]
            cur_df = pl.read_csv(archive.read(member), has_header = False, sep = '\t')
            cur_df.columns = columns
            cur_df = cur_df.with_column(pl.lit(trip_id).alias("id"))
            if row_id_name is not None:
                # Row position inside this file; routes and arcs refer to rows by this index.
                cur_df = cur_df.with_column(pl.Series(row_id_name, range(len(cur_df))))
            frames[suffix].append(cur_df)

    missing = [suffix for suffix, parts in frames.items() if not parts]
    if missing:
        raise ValueError(f"no {', '.join(missing)} files found in {zip_path}")

    # Concatenate once per table rather than growing a frame file by file.
    return tuple(
        pl.concat(frames[suffix]).select(table_specs[suffix][2])
        for suffix in table_specs
    )


# Archive location, resolved relative to the current user's Downloads folder
# rather than a hard-coded absolute home directory.
zip_file = Path.home() / "Downloads" / "map-matching-dataset.zip"
route_df, track_df, nodes_df, arcs_df = load_map_matching_tables(zip_file)

In [None]:
# TODO: sanity checks

In [None]:
# Build a 0-indexed running row number over the whole stacked arcs table.
# NOTE(review): arcs_df already has an "arc_id" column from the loading cell, where it
# restarts at 0 for every file. The route table is documented as referring to arcs by
# that per-file row number, so replacing it with a global index looks like it would
# break that lookup — confirm whether these three cells are still wanted.
arc_id_series = pl.Series("arc_id", range(len(arcs_df)))

In [None]:
# Attach the series as the "arc_id" column (presumably replacing the existing column of the same name).
arcs_df = arcs_df.with_column(arc_id_series)

In [None]:
# Reorder the columns so that arc_id comes first.
arcs_df = arcs_df.select(["arc_id", "id", "source_node_id", "target_node_id"])

In [None]:
# Display the arcs table.
arcs_df

In [None]:
# Row counts of the four map-matching tables.
len(route_df), len(track_df), len(nodes_df), len(arcs_df)

In [None]:
# Display the nodes table.
nodes_df


# DELTA LAKE

In [5]:
# Location of the Delta table that the read cells below pass to polars.
data_path = "/home/therealchainman/data/bear_delta_lake"

In [None]:
from deltalake.writer import write_deltalake
import pandas as pd
df = pd.DataFrame({"x": [1, 2, 3]})
write_deltalake("/home/therealchainman/data/bear_delta_lake", df)

In [None]:
df2 = pd.DataFrame({"x": [9, 8, 10]})
write_deltalake("/home/therealchainman/data/bear_delta_lake", df2, mode = "append")

In [None]:
df3 = pd.DataFrame({"x": [55, 66, 77]})
write_deltalake("/home/therealchainman/data/bear_delta_lake", df3, mode = "overwrite")

In [6]:
# Read the table without a version argument; the output below shows the overwritten rows.
pl.read_delta(data_path)

x
i64
55
66
77


In [8]:
# Same table through the scan API: build the scan first, then materialise it with collect().
ldf = pl.scan_delta(data_path)
ldf.collect()

x
i64
55
66
77


## TIME TRAVEL IN POLARS

In [9]:
# Version 0: the table as it was after the first write.
pl.read_delta(data_path, version = 0)

x
i64
1
2
3


In [10]:
# Version 1: the table after the append, before the overwrite.
pl.read_delta(data_path, version = 1)

x
i64
1
2
3
9
8
10
