# CalIt2 building people counts dataset

In [1]:
import pandas as pd
import os
from typing import Final
from config import data_raw_folder, data_processed_folder
from timeeval import Datasets
import matplotlib.pyplot as plt
# Default figure size (width, height) for every plot in this notebook.
plt.rcParams.update({"figure.figsize": (20, 10)})

In [2]:
from pathlib import Path

# Name of the collection this notebook's dataset is registered under.
dataset_collection_name = "CalIt2"
# Raw input files are read from here; processed files are written below target_folder.
source_folder = os.path.join(data_raw_folder, "UCI ML Repository/CalIt2")
target_folder = data_processed_folder

# Show the resolved absolute paths so a wrong working directory is noticed early.
source_abs = Path(source_folder).absolute()
target_abs = Path(target_folder).absolute()
print(f"Looking for source datasets in {source_abs} and\nsaving processed datasets in {target_abs}")

Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/UCI ML Repository/CalIt2 and
saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed


In [3]:
# Metadata of the dataset produced by this notebook; these values are passed
# on unchanged when the dataset is registered with `dm` further down.
dataset_name = "CalIt2-traffic"
dataset_type = "real"
input_type = "multivariate"
datetime_index = True
train_type = "unsupervised"
train_is_normal = False

# Processed files live in <target_folder>/<input_type>/<collection name>.
dataset_subfolder = os.path.join(input_type, dataset_collection_name)
target_subfolder = os.path.join(target_folder, dataset_subfolder)
try:
    os.makedirs(target_subfolder)
except FileExistsError:
    # Re-running the notebook is expected, so an existing folder is not an error.
    print(f"Directories {target_subfolder} already exist")
else:
    print(f"Created directories {target_subfolder}")

# Datasets handle on the processed-data folder; used below to register the
# dataset's metadata and to save it.
dm = Datasets(target_folder)

Directories /home/projects/akita/data/benchmark-data/data-processed/multivariate/CalIt2 already exist


In [4]:
# Transform the raw CalIt2 files into one labelled, wide-format test dataset,
# write it as CSV and register its metadata.

# --- load flow counts ---
# The file is read without a header row; column names are supplied here.
df_raw = pd.read_csv(
    os.path.join(source_folder, "CalIt2.data"),
    header=None,
    names=["id", "date", "time", "count"],
)
# Merge the separate date and time strings into a single timestamp column.
df_raw.insert(0, "timestamp", pd.to_datetime(df_raw["date"] + " " + df_raw["time"]))
df_raw = df_raw.drop(columns=["date", "time"])

# Rows with id 9 are used as the in-flow series, rows with id 7 as the out-flow series.
df_in = df_raw.loc[df_raw["id"] == 9, ["timestamp", "count"]].rename(columns={"count": "in_count"})
df_out = df_raw.loc[df_raw["id"] == 7, ["timestamp", "count"]].rename(columns={"count": "out_count"})

# Inner join: keep only timestamps that are present in both series.
df = pd.merge(df_in, df_out, on="timestamp", how="inner")

# --- load event windows ---
events_raw = pd.read_csv(
    os.path.join(source_folder, "CalIt2.events"),
    header=None,
    names=["date", "begin", "end", "event_type"],
)
# Only the begin/end timestamps are needed for labelling; event_type is dropped.
df_events = pd.DataFrame({
    "begin_timestamp": pd.to_datetime(events_raw["date"] + " " + events_raw["begin"]),
    "end_timestamp": pd.to_datetime(events_raw["date"] + " " + events_raw["end"]),
})

# --- labelling ---
# A row is anomalous when its timestamp lies inside any event window
# (both window boundaries inclusive).
df["is_anomaly"] = 0
for begin, end in zip(df_events["begin_timestamp"], df_events["end_timestamp"]):
    df.loc[df["timestamp"].between(begin, end), "is_anomaly"] = 1

# --- save dataset ---
filename = f"{dataset_name}.test.csv"
# `path` is relative to the processed-data folder and is what gets stored in
# the metadata; `target_filepath` is where the file is actually written.
path = os.path.join(dataset_subfolder, filename)
target_filepath = os.path.join(target_subfolder, filename)
dataset_length = len(df)
df.to_csv(target_filepath, index=False)
print(f"Processed dataset {dataset_name} -> {target_filepath}")

# --- save metadata ---
# There is no separate training file, hence train_path and split_at are None.
dm.add_dataset((dataset_collection_name, dataset_name),
    train_path = None,
    test_path = path,
    dataset_type = dataset_type,
    datetime_index = datetime_index,
    split_at = None,
    train_type = train_type,
    train_is_normal = train_is_normal,
    input_type = input_type,
    dataset_length = dataset_length
)

dm.save()

Processed dataset CalIt2-traffic -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/CalIt2/CalIt2-traffic.test.csv


In [7]:
# Refresh the dataset handle, then display only the metadata rows whose first
# index level equals this notebook's collection name (all dataset names).
dm.refresh()
collection_rows = pd.IndexSlice[dataset_collection_name:dataset_collection_name, :]
dm.df().loc[collection_rows]

Unnamed: 0_level_0,Unnamed: 1_level_0,train_path,test_path,dataset_type,datetime_index,split_at,train_type,train_is_normal,input_type,length
collection_name,dataset_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CalIt2,CalIt2-traffic,,multivariate/CalIt2/CalIt2-traffic.test.csv,real,True,,unsupervised,False,multivariate,5040


## Experimentation

In [None]:
# Load the raw counts (file is read without a header row) and name the columns.
flows = pd.read_csv(os.path.join(source_folder, "CalIt2.data"), header=None)
flows.columns = ["id", "date", "time", "count"]

# Replace the separate date/time strings by one leading timestamp column.
timestamps = pd.to_datetime(flows["date"] + " " + flows["time"])
flows = flows.drop(columns=["date", "time"])
flows.insert(0, "timestamp", timestamps)

# in flow (id 9)
df_in = (
    flows.loc[flows["id"].eq(9)]
    .drop(columns=["id"])
    .rename(columns={"count": "in_count"})
)

# out flow (id 7)
df_out = (
    flows.loc[flows["id"].eq(7)]
    .drop(columns=["id"])
    .rename(columns={"count": "out_count"})
)

# One row per timestamp that occurs in both flows.
df = df_in.merge(df_out, on="timestamp", how="inner")
df

In [None]:
# Load the event list (file is read without a header row) and name the columns.
events_raw = pd.read_csv(os.path.join(source_folder, "CalIt2.events"), header=None)
events_raw.columns = ["date", "begin", "end", "event_type"]

# Keep only the begin/end timestamps of every event; both are built from the
# event's date combined with its begin or end time.
event_date = events_raw["date"]
df_events = pd.DataFrame({
    "begin_timestamp": pd.to_datetime(event_date + " " + events_raw["begin"]),
    "end_timestamp": pd.to_datetime(event_date + " " + events_raw["end"]),
})
df_events

In [None]:
# labelling
# Start with every row marked normal, then flag each row whose timestamp lies
# inside an event window (both window boundaries inclusive). Resetting the
# column first keeps the cell safe to re-run.
df["is_anomaly"] = 0
for begin, end in zip(df_events["begin_timestamp"], df_events["end_timestamp"]):
    df.loc[df["timestamp"].between(begin, end), "is_anomaly"] = 1
df

In [None]:
# Plot an excerpt (rows 2500-2999) of both count series, with the anomaly
# label drawn against a secondary y-axis.
# set_index returns a new frame, so the excerpt is independent of `df`.
df_plot = df.iloc[2500:3000].set_index("timestamp")
df_plot.plot(y=["in_count", "out_count"])
df_plot["is_anomaly"].plot(secondary_y=True)
plt.legend()
plt.show()