In [11]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.1-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.1


In [6]:
import requests
import pandas as pd
import os

In [7]:
# @title Download and store the dataset

# URL of the dataset text file on GitHub
text_file_url = 'https://raw.githubusercontent.com/Paco-Danes/DeepPoi/main/dataset_NYC.txt'

# The directory to store the file in the colab session
directory_path = '/content/dataset/'
file_path = os.path.join(directory_path, 'example_dataset.txt')

# Check if directory exists, if not, create it
if not os.path.exists(directory_path):
    os.makedirs(directory_path)

# Download the file
r = requests.get(text_file_url)

# Save the content to a file
with open(file_path, 'w') as f:
    f.write(r.text)

print("Text file downloaded and saved to:", file_path)


Text file downloaded and saved to: /content/dataset/example_dataset.txt


In [37]:
# Read the TSV file into a DataFrame
df = pd.read_csv(file_path, sep='\t')
# Column names
column_names = [
    'User_ID',
    'Venue_ID',
    'Venue_Category_ID',
    'Venue_Category_Name',
    'Latitude',
    'Longitude',
    'Timezone_Offset',
    'UTC_Time'
]
# Assign column names to the DataFrame
df.columns = column_names
# Shape check of the DataFrame
df.shape

(227427, 8)

In [38]:
df.head()

Unnamed: 0,User_ID,Venue_ID,Venue_Category_ID,Venue_Category_Name,Latitude,Longitude,Timezone_Offset,UTC_Time
0,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
1,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
2,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
3,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012
4,484,4b5b981bf964a520900929e3,4bf58dd8d48988d118951735,Food & Drink Shop,40.690427,-73.954687,-240,Tue Apr 03 18:04:00 +0000 2012


In [39]:
# One-hot encode the 'venue_categories' column
# Calculate the minimum and maximum values for latitude and longitude
min_latitude = df['Latitude'].min()
max_latitude = df['Latitude'].max()
min_longitude = df['Longitude'].min()
max_longitude = df['Longitude'].max()

# Min-max normalize latitude and longitude columns
df['Norm_Latitude'] = (df['Latitude'] - min_latitude) / (max_latitude - min_latitude)
df['Norm_Longitude'] = (df['Longitude'] - min_longitude) / (max_longitude - min_longitude)
df['Venue_ID_int'], _ = pd.factorize(df['Venue_ID'])

encoded_df = pd.get_dummies(df['Venue_Category_Name'])

# Drop the original categorical column
df = pd.concat([df.drop(['Venue_Category_Name','Venue_Category_ID', 'Latitude', 'Longitude', 'Venue_ID'], axis=1), encoded_df], axis=1)
print(df.shape)
df.head()

(227427, 257)


Unnamed: 0,User_ID,Timezone_Offset,UTC_Time,Norm_Latitude,Norm_Longitude,Venue_ID_int,Afghan Restaurant,African Restaurant,Airport,American Restaurant,...,Travel Lounge,Turkish Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Winery,Wings Joint,Zoo
0,979,-240,Tue Apr 03 18:00:25 +0000 2012,0.127885,0.390219,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,69,-240,Tue Apr 03 18:02:24 +0000 2012,0.377868,0.662835,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,395,-240,Tue Apr 03 18:02:41 +0000 2012,0.444161,0.494546,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,87,-240,Tue Apr 03 18:03:00 +0000 2012,0.432595,0.482464,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,484,-240,Tue Apr 03 18:04:00 +0000 2012,0.319043,0.541644,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
df['Timezone_Offset'].unique()

array([-240,  540,  120, -300, -420,  420,  480, -360, -180,  660])