# Step 1: Load NYC dataset


In [1]:
import pandas as pd

# Relative path to the tab-separated check-in file.
DATA_PATH = "dataset/dataset_TSMC2014_NYC.txt"

# Positional indices of the fields to keep; COLUMNS supplies one name per
# kept field, in the same order.
USECOLS = [0, 4, 5, 7]
COLUMNS = ["user_id", "latitude", "longitude", "utc_time"]

# The file has no header row, so names are assigned here. Lines the parser
# rejects are skipped rather than raising.
raw_df = pd.read_csv(
    DATA_PATH,
    # what to read
    usecols=USECOLS,
    names=COLUMNS,
    header=None,
    # how to parse it
    sep="\t",
    engine="python",
    on_bad_lines="skip",
)

# Step 2: Data Cleaning

In [None]:
# Build the cleaned frame as one chain so raw_df is left untouched and the
# cell can be re-run safely.
clean_df = (
    raw_df
    # Coerce types; values that fail to parse become NaN / NaT.
    # The timestamp format is: weekday, month abbreviation, day, time,
    # UTC offset, year.
    .assign(
        latitude=lambda frame: pd.to_numeric(frame["latitude"], errors="coerce"),
        longitude=lambda frame: pd.to_numeric(frame["longitude"], errors="coerce"),
        utc_time=lambda frame: pd.to_datetime(
            frame["utc_time"],
            format="%a %b %d %H:%M:%S %z %Y",
            errors="coerce",
        ),
    )
    # Drop rows where any of the coerced fields is missing.
    .dropna(subset=["latitude", "longitude", "utc_time"])
    # Keep only coordinates inside the valid latitude / longitude ranges.
    .loc[lambda frame: frame["latitude"].between(-90, 90)]
    .loc[lambda frame: frame["longitude"].between(-180, 180)]
    # Remove repeated check-ins (same user, place and instant).
    .drop_duplicates(subset=["user_id", "latitude", "longitude", "utc_time"])
    # Order chronologically and renumber the rows from zero.
    .sort_values("utc_time")
    .reset_index(drop=True)
)

# Report how many rows the cleaning removed, then preview the result.
print(f"Raw rows: {len(raw_df):,}")
print(f"Clean rows: {len(clean_df):,}")
clean_df.head()


Raw rows: 227,428
Clean rows: 227,165


Unnamed: 0,user_id,latitude,longitude,utc_time
0,470,40.71981,-74.002581,2012-04-03 18:00:09+00:00
1,979,40.6068,-74.04417,2012-04-03 18:00:25+00:00
2,69,40.716162,-73.88307,2012-04-03 18:02:24+00:00
3,395,40.745164,-73.982519,2012-04-03 18:02:41+00:00
4,87,40.740104,-73.989658,2012-04-03 18:03:00+00:00


# Step 3: Offline Process