# Imports

In [1]:
import gzip

import geopandas as gpd
import pandas as pd
from shapely.geometry import LineString

# Data Loading

In [2]:
with gzip.open("data/brightkite_totalCheckins.txt.gz", "rb") as f:
    df_brightkite_checkins = pd.read_csv(
        f,
        delimiter="\t",
        names=["user_id", "check_in_time", "latitude", "longitude", "location_id"],
    )

In [3]:
with gzip.open("data/brightkite_edges.txt.gz", "rb") as f:
    df_brightkite_edges = pd.read_csv(
        f,
        delimiter="\t",
        names=["user_id", "user_friend_id"],
    )

# Data Inspection

In [4]:
df_brightkite_checkins.head()

Unnamed: 0,user_id,check_in_time,latitude,longitude,location_id
0,0,2010-10-17T01:48:53Z,39.747652,-104.99251,88c46bf20db295831bd2d1718ad7e6f5
1,0,2010-10-16T06:02:04Z,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2
2,0,2010-10-16T03:48:54Z,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79
3,0,2010-10-14T18:25:51Z,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683
4,0,2010-10-14T00:21:47Z,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc


In [5]:
df_brightkite_edges.head()

Unnamed: 0,user_id,user_friend_id
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5


# Remove Zero Location

In [6]:
df_brightkite_checkins = df_brightkite_checkins[
    (df_brightkite_checkins["latitude"] != 0)
    & (df_brightkite_checkins["longitude"] != 0)
]

# Remove NaN

In [7]:
df_brightkite_checkins.dropna(inplace=True)

# GeoDataFrame

In [8]:
gpd_brightkite_checkins = gpd.GeoDataFrame(
    df_brightkite_checkins,
    geometry=gpd.points_from_xy(
        df_brightkite_checkins["longitude"], df_brightkite_checkins["latitude"]
    ),
    crs="EPSG:4326",
)

In [9]:
gpd_brightkite_checkins.head()

Unnamed: 0,user_id,check_in_time,latitude,longitude,location_id,geometry
0,0,2010-10-17T01:48:53Z,39.747652,-104.99251,88c46bf20db295831bd2d1718ad7e6f5,POINT (-104.99251 39.74765)
1,0,2010-10-16T06:02:04Z,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2,POINT (-105.07081 39.89138)
2,0,2010-10-16T03:48:54Z,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79,POINT (-105.06853 39.89108)
3,0,2010-10-14T18:25:51Z,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683,POINT (-104.99907 39.75047)
4,0,2010-10-14T00:21:47Z,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc,POINT (-104.99634 39.75271)


# Data Aggregation by User

In [10]:
df_brightkite_checkins_aggregated = gpd_brightkite_checkins.groupby(
    by="user_id"
).aggregate(list)

In [11]:
df_brightkite_checkins_aggregated.head()

Unnamed: 0_level_0,check_in_time,latitude,longitude,location_id,geometry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[2010-10-17T01:48:53Z, 2010-10-16T06:02:04Z, 2...","[39.747652, 39.891383, 39.891077, 39.750469, 3...","[-104.99251, -105.070814, -105.068532, -104.99...","[88c46bf20db295831bd2d1718ad7e6f5, 7a0f88982aa...","[POINT (-104.99251 39.747652), POINT (-105.070..."
1,"[2010-10-15T15:49:55Z, 2010-10-14T15:30:30Z, 2...","[37.579963, 37.580304, 37.579963, 37.579963, 3...","[-122.343908, -122.343679, -122.343908, -122.3...","[6fc93c9c178be638f384e202a5e27c2c, 5e55a3bdab7...","[POINT (-122.343908 37.579963), POINT (-122.34..."
2,"[2010-10-17T01:54:45Z, 2010-09-27T02:12:45Z, 2...","[39.747652, 39.746871, 39.746871, 39.739897, 3...","[-104.99251, -105.010166, -105.010166, -104.92...","[88c46bf20db295831bd2d1718ad7e6f5, 4dfb8a75261...","[POINT (-104.99251 39.747652), POINT (-105.010..."
3,"[2010-10-17T11:44:21Z, 2010-10-17T10:17:26Z, 2...","[38.94511, 38.944444, 36.117798, 36.053911, 36...","[-77.451706, -77.455833, -115.172926, -115.172...","[200dced1036e9b5cd93511d5eb14f2f7, 1e8f97ff904...","[POINT (-77.451706 38.94511), POINT (-77.45583..."
4,"[2010-10-16T20:49:23Z, 2010-10-11T22:59:09Z, 2...","[37.824562, 37.579963, 37.579963, 37.762938, 3...","[-122.368844, -122.343908, -122.343908, -122.3...","[4626aab842645caa90ab573e357fa9b6, 6fc93c9c178...","[POINT (-122.368844 37.824562), POINT (-122.34..."


# Data Filtration
LineString cannot be initialized with one point

In [12]:
df_brightkite_checkins_filtered = df_brightkite_checkins_aggregated[
    df_brightkite_checkins_aggregated["geometry"].str.len() > 1
]

# Convert List of Points to LineString

In [13]:
df_brightkite_checkins_filtered["geometry"] = df_brightkite_checkins_filtered[
    "geometry"
].apply(lambda x: LineString(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brightkite_checkins_filtered["geometry"] = df_brightkite_checkins_filtered[


In [14]:
df_brightkite_checkins_filtered.head()

Unnamed: 0_level_0,check_in_time,latitude,longitude,location_id,geometry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[2010-10-17T01:48:53Z, 2010-10-16T06:02:04Z, 2...","[39.747652, 39.891383, 39.891077, 39.750469, 3...","[-104.99251, -105.070814, -105.068532, -104.99...","[88c46bf20db295831bd2d1718ad7e6f5, 7a0f88982aa...","LINESTRING (-104.99251 39.747652, -105.070814 ..."
1,"[2010-10-15T15:49:55Z, 2010-10-14T15:30:30Z, 2...","[37.579963, 37.580304, 37.579963, 37.579963, 3...","[-122.343908, -122.343679, -122.343908, -122.3...","[6fc93c9c178be638f384e202a5e27c2c, 5e55a3bdab7...","LINESTRING (-122.343908 37.579963, -122.343679..."
2,"[2010-10-17T01:54:45Z, 2010-09-27T02:12:45Z, 2...","[39.747652, 39.746871, 39.746871, 39.739897, 3...","[-104.99251, -105.010166, -105.010166, -104.92...","[88c46bf20db295831bd2d1718ad7e6f5, 4dfb8a75261...","LINESTRING (-104.99251 39.747652, -105.010166 ..."
3,"[2010-10-17T11:44:21Z, 2010-10-17T10:17:26Z, 2...","[38.94511, 38.944444, 36.117798, 36.053911, 36...","[-77.451706, -77.455833, -115.172926, -115.172...","[200dced1036e9b5cd93511d5eb14f2f7, 1e8f97ff904...","LINESTRING (-77.451706 38.94511, -77.455833 38..."
4,"[2010-10-16T20:49:23Z, 2010-10-11T22:59:09Z, 2...","[37.824562, 37.579963, 37.579963, 37.762938, 3...","[-122.368844, -122.343908, -122.343908, -122.3...","[4626aab842645caa90ab573e357fa9b6, 6fc93c9c178...","LINESTRING (-122.368844 37.824562, -122.343908..."


In [15]:
gpd_brightkite_checkins = gpd.GeoDataFrame(
    data=df_brightkite_checkins_filtered,
    geometry="geometry",
    crs="EPSG:4326",
)

# Remove redundant columns

In [16]:
gpd_brightkite_checkins.drop(columns=["latitude", "longitude"], axis=1, inplace=True)

# Convert LineString to List of Points (HF format)

In [17]:
gpd_brightkite_checkins["geometry"] = gpd_brightkite_checkins["geometry"].apply(
    lambda geom: list(geom.coords)
)

  gpd_brightkite_checkins["geometry"] = gpd_brightkite_checkins["geometry"].apply(lambda geom: list(geom.coords))


# Save to Parquet

In [18]:
gpd_brightkite_checkins.to_parquet("data/brightkite.parquet")