# Imports

In [1]:
import gzip

import geopandas as gpd
import pandas as pd
from shapely.geometry import LineString
from tqdm.contrib.concurrent import process_map

# Data Loading

In [2]:
with gzip.open("data/gowalla_totalCheckins.txt.gz", "rb") as f:
    df_gowalla_checkins = pd.read_csv(
        f,
        delimiter="\t",
        names=["user_id", "check_in_time", "latitude", "longitude", "location_id"],
    )

In [3]:
with gzip.open("data/gowalla_edges.txt.gz", "rb") as f:
    df_gowalla_edges = pd.read_csv(
        f,
        delimiter="\t",
        names=["user_id", "user_friend_id"],
    )

# Data Inspection

In [4]:
df_gowalla_checkins.head()

Unnamed: 0,user_id,check_in_time,latitude,longitude,location_id
0,0,2010-10-19T23:55:27Z,30.235909,-97.79514,22847
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878


In [5]:
df_gowalla_edges.head()

Unnamed: 0,user_id,user_friend_id
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5


# Remove Zero Location

In [6]:
df_gowalla_checkins = df_gowalla_checkins[
    (df_gowalla_checkins["latitude"] != 0) & (df_gowalla_checkins["longitude"] != 0)
]

# Remove NaN

In [7]:
df_gowalla_checkins.dropna(inplace=True)

# GeoDataFrame

In [8]:
gpd_gowalla_checkins = gpd.GeoDataFrame(
    df_gowalla_checkins,
    geometry=gpd.points_from_xy(
        df_gowalla_checkins["longitude"], df_gowalla_checkins["latitude"]
    ),
    crs="EPSG:4326",
)

In [9]:
gpd_gowalla_checkins.head()

Unnamed: 0,user_id,check_in_time,latitude,longitude,location_id,geometry
0,0,2010-10-19T23:55:27Z,30.235909,-97.79514,22847,POINT (-97.79514 30.23591)
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315,POINT (-97.74940 30.26910)
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637,POINT (-97.76339 30.25573)
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516,POINT (-97.75760 30.26342)
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878,POINT (-97.74052 30.27429)


# Data Aggregation by User

In [10]:
df_gowalla_checkins_aggregated = gpd_gowalla_checkins.groupby(by="user_id").aggregate(
    list
)

In [11]:
df_gowalla_checkins_aggregated.head()

Unnamed: 0_level_0,check_in_time,latitude,longitude,location_id,geometry
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[2010-10-19T23:55:27Z, 2010-10-18T22:17:43Z, 2...","[30.2359091167, 30.2691029532, 30.2557309927, ...","[-97.7951395833, -97.7493953705, -97.763385772...","[22847, 420315, 316637, 16516, 5535878, 15372,...","[POINT (-97.7951395833 30.2359091167), POINT (..."
1,"[2010-07-25T11:40:18Z, 2010-07-24T12:21:43Z, 2...","[48.86337875, 45.202483667, 44.724745941, 44.9...","[2.333328717, -0.746830983, 5.03296136, 5.7842...","[1500177, 1493267, 1441698, 1436795, 1431949, ...","[POINT (2.333328717 48.86337875), POINT (-0.74..."
2,"[2010-10-21T00:03:50Z, 2010-10-20T17:18:59Z, 2...","[34.0430230998, 34.0172734606, 34.0172734606, ...","[-118.2671570778, -118.447508812, -118.4475088...","[14637, 59838, 59838, 59838, 1474903, 335197, ...","[POINT (-118.2671570778 34.0430230998), POINT ..."
4,"[2010-10-11T20:30:23Z, 2010-10-02T18:58:55Z, 2...","[37.7826046833, 37.7868705523, 37.7826583333, ...","[-122.4076080167, -122.4549742274, -122.406320...","[14608, 640452, 166197, 26645, 102499, 1582288...","[POINT (-122.4076080167 37.7826046833), POINT ..."
5,"[2010-09-06T18:29:13Z, 2010-09-06T13:54:52Z, 2...","[40.761176868, 40.7637712634, 40.7563951, 40.7...","[-73.9868709323, -73.9777493477, -73.98525065,...","[458146, 490887, 906408, 84240, 906408, 23261,...","[POINT (-73.9868709323 40.761176868), POINT (-..."


# Data Filtration
LineString cannot be initialized with one point

In [12]:
df_gowalla_checkins_filtered = df_gowalla_checkins_aggregated[
    df_gowalla_checkins_aggregated["geometry"].str.len() > 1
]

# User Friendship

In [13]:
df_gowalla_edges_filtered = df_gowalla_edges[
    df_gowalla_edges["user_id"].isin(df_gowalla_checkins_filtered.index)
    & df_gowalla_edges["user_friend_id"].isin(df_gowalla_checkins_filtered.index)
]

In [14]:
df_gowalla_edges_aggregated = df_gowalla_edges_filtered.groupby("user_id")[
    "user_friend_id"
].aggregate(list)

In [15]:
df_gowalla_merged = pd.merge(
    df_gowalla_checkins_filtered, df_gowalla_edges_aggregated, on="user_id"
)

# Convert List of Points to LineString

In [16]:
df_gowalla_merged["geometry"] = process_map(
    LineString, df_gowalla_merged["geometry"], chunksize=1000, max_workers=20
)

  0%|          | 0/92409 [00:00<?, ?it/s]

In [17]:
df_gowalla_merged.head()

Unnamed: 0_level_0,check_in_time,latitude,longitude,location_id,geometry,user_friend_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"[2010-10-19T23:55:27Z, 2010-10-18T22:17:43Z, 2...","[30.2359091167, 30.2691029532, 30.2557309927, ...","[-97.7951395833, -97.7493953705, -97.763385772...","[22847, 420315, 316637, 16516, 5535878, 15372,...","LINESTRING (-97.7951395833 30.2359091167, -97....","[1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, ..."
1,"[2010-07-25T11:40:18Z, 2010-07-24T12:21:43Z, 2...","[48.86337875, 45.202483667, 44.724745941, 44.9...","[2.333328717, -0.746830983, 5.03296136, 5.7842...","[1500177, 1493267, 1441698, 1436795, 1431949, ...","LINESTRING (2.333328717 48.86337875, -0.746830...","[0, 2, 9, 52, 53, 55, 68, 88, 97, 111, 116, 15..."
2,"[2010-10-21T00:03:50Z, 2010-10-20T17:18:59Z, 2...","[34.0430230998, 34.0172734606, 34.0172734606, ...","[-118.2671570778, -118.447508812, -118.4475088...","[14637, 59838, 59838, 59838, 1474903, 335197, ...","LINESTRING (-118.2671570778 34.0430230998, -11...","[0, 1, 5, 22, 36, 37, 41, 44, 53, 66, 67, 82, ..."
4,"[2010-10-11T20:30:23Z, 2010-10-02T18:58:55Z, 2...","[37.7826046833, 37.7868705523, 37.7826583333, ...","[-122.4076080167, -122.4549742274, -122.406320...","[14608, 640452, 166197, 26645, 102499, 1582288...","LINESTRING (-122.4076080167 37.7826046833, -12...","[0, 154, 191, 234, 235, 267, 347, 405, 464, 55..."
5,"[2010-09-06T18:29:13Z, 2010-09-06T13:54:52Z, 2...","[40.761176868, 40.7637712634, 40.7563951, 40.7...","[-73.9868709323, -73.9777493477, -73.98525065,...","[458146, 490887, 906408, 84240, 906408, 23261,...","LINESTRING (-73.9868709323 40.761176868, -73.9...","[0, 2, 40, 41, 42, 44, 54, 67, 82, 89, 112, 11..."


In [18]:
gpd_gowalla_checkins = gpd.GeoDataFrame(
    data=df_gowalla_merged,
    geometry="geometry",
    crs="EPSG:4326",
)

# Remove redundant columns

In [19]:
gpd_gowalla_checkins.drop(columns=["latitude", "longitude"], axis=1, inplace=True)

# Convert LineString to List of Points (HF format)

In [20]:
gpd_gowalla_checkins["geometry"] = gpd_gowalla_checkins["geometry"].apply(
    lambda geom: list(geom.coords)
)

  gpd_gowalla_checkins["geometry"] = gpd_gowalla_checkins["geometry"].apply(


# Save to Parquet

In [21]:
gpd_gowalla_checkins.to_parquet("data/gowalla.parquet")