In [18]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import sys

# Make the project root (the notebook's parent directory) importable so that
# the `src` package resolves. Add more parent-directory components if the
# notebook is nested deeper in the repository.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


from src.datahandlers import TrajectoryDataModule, Trajectory, PacmanDataReader
import src.utils as utils


from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last
# one. Several cells below rely on this to show multiple outputs.
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Handling
Data reading, filtering, transformation, etc. will be handled by the `PacmanDataReader` class. This is a singleton class that will be initialized once and then reused whenever needed (e.g., by visualization functions).

When initialized, the `PacmanDataReader` class will read the data from the data folder and filter out banned users.
Several dataframes are created, including:
- `game_df`: contains the game metadata
- `gamestate_df`: contains the gamestate data
- `user_df`: contains the user metadata
- `ip_df`: contains the ip metadata
- `redcap_df`: contains the redcap data
- `psychometrics_df`: contains the psychometric data


In [19]:
# Instantiate the reader on the local data folder; `data` is reused by every
# cell below.
# NOTE(review): exact effect of `read_games_only=False` is not visible here —
# presumably it also loads the non-game tables (user, ip, redcap, ...); confirm.
data = PacmanDataReader(data_folder="../data/", read_games_only=False, verbose=False)
# List the columns available in the raw gamestate log.
data.gamestate_df.columns

Index(['game_state_id', 'level_id', 'time_elapsed', 'score', 'lives',
       'pacman_attack', 'input_direction', 'movement_direction', 'Pacman_X',
       'Pacman_Y', 'Ghost1_X', 'Ghost1_Y', 'Ghost2_X', 'Ghost2_Y', 'Ghost3_X',
       'Ghost3_Y', 'Ghost4_X', 'Ghost4_Y', 'ghost1_state', 'ghost2_state',
       'ghost3_state', 'ghost4_state', 'powerPellets', 'pellets',
       'powerpelletstate_1', 'powerpelletstate_2', 'powerpelletstate_3',
       'powerpelletstate_4', 'fruitState_1', 'fruitState_2'],
      dtype='object')

The attribute `self.gamestate_df` contains the data as logged and retrieved from the SQL database.


In [20]:
# Preview the first rows of the raw gamestate log (one row per logged sample).
data.gamestate_df.head()

Unnamed: 0,game_state_id,level_id,time_elapsed,score,lives,pacman_attack,input_direction,movement_direction,Pacman_X,Pacman_Y,...,ghost3_state,ghost4_state,powerPellets,pellets,powerpelletstate_1,powerpelletstate_2,powerpelletstate_3,powerpelletstate_4,fruitState_1,fruitState_2
3955,220049,388,0.19,10,3,0,right,right,1.485321,-9.489613,...,0,0,4,243,1,1,1,1,0,0
3956,220050,388,0.23,20,3,0,none,right,1.939881,-9.489613,...,0,0,4,242,1,1,1,1,0,0
3957,220051,388,0.29,20,3,0,none,right,2.240156,-9.503768,...,0,0,4,242,1,1,1,1,0,0
3958,220052,388,0.33,20,3,0,none,right,2.694716,-9.500579,...,0,0,4,242,1,1,1,1,0,0
3959,220053,388,0.4,30,3,0,none,right,2.997756,-9.499909,...,0,0,4,241,1,1,1,1,0,0


In [21]:


# Preview the per-game metadata table.
data.game_df.head()

# Preview the per-level metadata table; each level row carries its `game_id`.
data.level_df.head()

Unnamed: 0_level_0,game_id,user_id,session_number,total_levels_played,source,date_played,game_duration,max_level,max_score,total_games_played,game_in_session,level_ids
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
389,389,64,1,2,DataGathering,2024-11-29 14:13:35,90.855,2,6340,1,1,"[388, 389]"
391,391,64,1,4,DataGathering,2024-11-29 14:15:57,104.35,2,6670,2,2,"[390, 391]"
393,393,64,1,6,DataGathering,2024-11-29 14:18:12,132.04,2,7220,3,3,"[392, 393]"
397,397,67,1,4,DataGathering,2024-12-02 14:50:13,203.454,4,12960,1,1,"[394, 395, 396, 397]"
398,398,69,1,1,DataGathering,2024-12-02 15:14:58,59.033,1,2520,1,1,[398]


Unnamed: 0_level_0,level_id,user_id,session_number,level_in_session,total_levels_played,source,date_played,duration,win,level,max_score,game_id
level_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
388,388,64,1,1,1,DataGathering,2024-11-29 14:13:35,75.874,1,1,4590,389
389,389,64,1,2,2,DataGathering,2024-11-29 14:14:58,14.981,0,2,6340,389
390,390,64,1,3,3,DataGathering,2024-11-29 14:15:57,55.516,1,1,3390,391
391,391,64,1,4,4,DataGathering,2024-11-29 14:17:00,48.834,0,2,6670,391
392,392,64,1,5,5,DataGathering,2024-11-29 14:18:12,80.657,1,1,4490,393


## The Trajectory object

To structure the analysis in different stages, we implemented a specific dataclass for the gamestate data, the `Trajectory` class. 
This class represents a Pacman trajectory with both spatial data and metadata. It behaves as an `np.ndarray` for the (x,y) coordinates of Pacman. But the class can be progressively extended to include other elements of the gamestate dataframe, such as time_elapsed, ghosts positions, input directions, etc.

The methods `self.get_trajectory()` and `self.get_partial_trajectory()` are used to get trajectories in the form of a `Trajectory` instance, suitable for mathematical analysis of player movement, but also integrating other game metadata.

In [22]:
# Fetch the complete Pacman trajectory of a single level.
trajectory = data.get_trajectory(level_id=600)

type(trajectory)

# The trajectory converts to a plain (n_steps, 2) array of (x, y) coordinates.
np.array(trajectory).shape

# Use the built-in `len()` rather than calling the dunder method directly.
len(trajectory.get_segment(0, 50))

# Both calls return the same segment. The data reader serves as a higher-level
# API to get segments/games based on level ids, game ids or users.
np.array_equal(
    trajectory.get_segment(0, 50),
    data.get_partial_trajectory(level_id=600, start_timestep=0, end_timestep=50),
)

src.datahandlers.trajectory.Trajectory

(1535, 2)

50

True

In [23]:
# Trajectories can also be requested per user; the attached metadata records
# which user the trajectory belongs to.
trajectory = data.get_trajectory(user_id=78)

trajectory.metadata["user_id"]


np.int64(78)

We can also save long lists of trajectories, if necessary.

In [24]:
## Getting the first 50 steps of every game
# NOTE(review): the ids are read from `game_df["game_id"]` but passed as
# `level_id`. In the tables shown above a game's id equals the id of its last
# level, so this likely selects only the last level of each game — confirm
# whether that is intended.
SEGMENT_STEPS = 50  # number of leading timesteps kept from each trajectory

games = data.game_df["game_id"].to_list()
subtraj_lists = []

for game in games:
    traj = data.get_trajectory(level_id=game, include_metadata=True)
    subtraj = traj.get_segment(0, SEGMENT_STEPS)
    subtraj_lists.append(subtraj)

# Use the built-in `len()` rather than calling the dunder method directly.
len(subtraj_lists)

506

In [25]:
# Save the list of sub-trajectories under the given name.
Trajectory.save_trajectories(subtraj_lists, "test_partial_trajectories")

# Later, load the trajectories back using the same name.
loaded_trajectories = Trajectory.load_trajectories("test_partial_trajectories")

# Example of using the loaded trajectories (kept commented out on purpose to
# avoid flooding the output):
# for traj in loaded_trajectories:
#    print(f"Game ID: {traj.metadata['game_id']}")
#    print(f"Shape: {traj.coordinates.shape}")

## The Datamodule (For pytorch)

A `TrajectoryDataModule(pl.LightningDataModule)` is implemented to structure the data into a properly shaped `torch.Tensor` for ML models (i.e., autoencoders) 

The methods `self.get_trajectory_dataframe()` and `self.filter_gamestate_data()` are used internally to set up the dataset for model training.

In [26]:
# Position time series for all levels in long format; the preview below shows
# the columns level_id, time_elapsed, Pacman_X and Pacman_Y.
pacman_ts = data.get_trajectory_dataframe(series_type=["position"])

pacman_ts.head()

datamodule = TrajectoryDataModule(data_folder="../data/")

# Private helper called directly here only to inspect the tensor it builds.
# NOTE(review): axes are presumably (trajectories, padded timesteps, features)
# with `mask` marking valid timesteps — confirm against the datamodule.
tensordf, mask, game_idx = datamodule._create_game_trajectory_tensor(
    trajectories_df=pacman_ts
)

tensordf.shape

Unnamed: 0,level_id,time_elapsed,Pacman_X,Pacman_Y
3955,388,0.19,1.485321,-9.489613
3956,388,0.23,1.939881,-9.489613
3957,388,0.29,2.240156,-9.503768
3958,388,0.33,2.694716,-9.500579
3959,388,0.4,2.997756,-9.499909


torch.Size([879, 2242, 3])

## Observations on data
Observations regarding trajectories; most of them can be seen in the example below:
- Trajectories don't necessarily start at the same time or position (even though they should). 
- Trajectories don't necessarily have the same length.
- Sampling time is not consistent. It should be every 50 ms, but sometimes it's shorter or longer. However, the combined duration of two consecutive sampling intervals is consistently 100 ms (e.g., if one sample has a dt of 0.03 s, the next one will have a dt of 0.07 s).
- Velocities are not the same; they vary between levels (e.g., level 2 is faster than level 1).


In [27]:
### Inspecting raw values of velocity and time
def raw_velocity_frame(level_id: int) -> pd.DataFrame:
    """Return the trajectory dataframe of one level with raw velocity columns.

    Adds three columns to the frame returned by
    `data.get_trajectory_dataframe(level_id=...)`:

    - `dt`: time difference to the previous sample (0 for the first row).
    - `dx`, `dy`: output of `utils.calculate_velocities` divided by `dt`.
      NOTE(review): `calculate_velocities` presumably returns per-step
      displacements with NaN in the first row — confirm.

    Args:
        level_id: Identifier of the level whose trajectory is inspected.

    Returns:
        The dataframe with a fresh 0-based index and the extra columns.
    """
    frame = data.get_trajectory_dataframe(level_id=level_id).reset_index(drop=True)

    frame["dx"], frame["dy"] = utils.calculate_velocities(
        np.array(frame[["Pacman_X", "Pacman_Y"]]), round=False
    )

    # Prepending the first timestamp makes the first dt exactly 0.
    frame["dt"] = np.diff(
        frame["time_elapsed"], prepend=frame["time_elapsed"].iloc[0]
    )

    # Normalise by the actual sampling interval, since it is not constant.
    frame["dx"] = frame["dx"] / frame["dt"]
    frame["dy"] = frame["dy"] / frame["dt"]
    return frame


ts1 = raw_velocity_frame(993)  # game 993 is played on level 4
ts2 = raw_velocity_frame(990)  # game 990 is played on level 1

# Side-by-side view; the shorter trajectory is padded with NaN rows. Both
# inputs already carry a 0-based index, so no further reset is needed.
ds = pd.concat([ts1, ts2], axis=1)
ds

Unnamed: 0,level_id,time_elapsed,Pacman_X,Pacman_Y,dx,dy,dt,level_id.1,time_elapsed.1,Pacman_X.1,Pacman_Y.1,dx.1,dy.1,dt.1
0,993.0,0.18,1.534140,-9.326811,,,0.00,990,0.200000,1.485321,-9.489613,,,0.000000
1,993.0,0.23,1.476429,-9.005260,-1.154215,6.431008,0.05,990,0.250000,1.788361,-9.489613,6.060801,0.000000,0.050000
2,993.0,0.28,1.499031,-8.494056,0.452032,10.224095,0.05,990,0.300000,2.240156,-9.503768,9.035907,-0.283108,0.050000
3,993.0,0.33,1.499931,-8.153136,0.018005,6.818388,0.05,990,0.350000,2.543196,-9.501216,6.060803,0.051041,0.050000
4,993.0,0.38,1.500392,-7.641757,0.009217,10.227588,0.05,990,0.390000,2.997756,-9.499909,11.364003,0.032663,0.040000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1119,,,,,,,,990,56.139999,10.500672,-10.714727,-0.011095,15.041137,0.029999
1120,,,,,,,,990,56.209999,10.500604,-10.411688,-0.000981,4.329155,0.070000
1121,,,,,,,,990,56.240002,10.500568,-9.957129,-0.001176,15.150668,0.030003
1122,,,,,,,,990,56.310001,10.502164,-9.654089,0.022793,4.329155,0.070000
