In [48]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import HDBSCAN

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.feature_selection import RFECV

In [6]:
# Cleaned 5-player feature tables, one parquet file per (view, granularity) pair.
# The order is significant: later cells address these tables by position
# (0 = camera rounds, 1 = 3D rounds, 2 = camera halves, 3 = 3D halves).
feature_paths = (
    '../Outputs/cleaned-5-player-features-camera-rounds.parquet',
    '../Outputs/cleaned-5-player-features-3d-rounds.parquet',
    '../Outputs/cleaned-5-player-features-camera-halves.parquet',
    '../Outputs/cleaned-5-player-features-3d-halves.parquet',
)
dfs = [pd.read_parquet(path) for path in feature_paths]

In [7]:
# Give the two rounds tables (dfs[0] and dfs[1]) a per-player time-alive column,
# defined as the average of that player's num_samples_d and num_samples_j.
# NOTE(review): reading this average as seconds assumes one sample per second — confirm.
for rounds_df in dfs[:2]:
    for player in range(1, 6):
        samples_d = rounds_df[f'p{player}_num_samples_d']
        samples_j = rounds_df[f'p{player}_num_samples_j']
        rounds_df[f'p{player}_time_alive'] = (samples_d + samples_j) / 2

In [8]:
# Per-player metrics aggregated for the halves tables (stored as p<k>_<metric> columns)
half_metrics = [
    'ch_area', 'ch_volume', 'ch_area_normed', 'ch_volume_normed',
    'frac_dim',
    'C1', 'C2', 'C3', 'C4', 'C5',
    'alpha_d', 'alpha_j', 'num_samples_d','num_samples_j',
]

# The rounds tables are additionally aggregated over the per-player time_alive column
round_metrics = half_metrics + ['time_alive']

# Statistic suffix -> row-wise reducer applied across the players.
# Keeping names and reducers in one mapping stops them drifting apart.
# np.std is called with its default ddof=0 (population standard deviation).
stat_funcs = {'min': np.min, 'max': np.max, 'mean': np.mean, 'std': np.std}

# Statistic suffixes
stats = list(stat_funcs)


def _aggregate_across_players(player_df, metrics, id_columns, n_players=5):
    """Collapse per-player metric columns into team-level summary columns.

    For every metric, the columns ``p1_<metric>`` .. ``p<n_players>_<metric>``
    are reduced row-wise with each reducer in ``stat_funcs`` and stored on
    ``player_df`` as ``<metric>_<stat>``.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Table with one ``p<k>_<metric>`` column per player and metric.
        It is modified in place: the summary columns are added to it.
    metrics : list of str
        Metric names to aggregate.
    id_columns : list of str
        Identifier columns to carry over unchanged into the result.
    n_players : int, default 5
        Number of players per row.

    Returns
    -------
    pandas.DataFrame
        ``id_columns`` followed by the summary columns, ordered metric by
        metric and, within a metric, in the order of ``stat_funcs``.
    """
    output_columns = []
    for metric in metrics:
        player_cols = [f'p{player}_{metric}' for player in range(1, n_players + 1)]

        # 2D array (rows x players) so each reducer works along axis 1
        data = player_df[player_cols].to_numpy()

        for stat, reducer in stat_funcs.items():
            column = f'{metric}_{stat}'
            player_df[column] = reducer(data, axis=1)
            output_columns.append(column)

    return player_df[id_columns + output_columns]


# dfs[0] and dfs[1] are the rounds tables (keyed by roundNum, with time_alive);
# dfs[2] and dfs[3] are the halves tables (keyed by team)
aggregate_dfs = []
for i, player_df in enumerate(dfs):
    if i <= 1:
        metrics_for_table = round_metrics
        id_columns = ['matchID', 'mapName', 'side', 'roundNum', 'Label']
    else:
        metrics_for_table = half_metrics
        id_columns = ['matchID', 'mapName', 'side', 'team', 'Label']
    aggregate_dfs.append(
        _aggregate_across_players(player_df, metrics_for_table, id_columns)
    )

In [9]:
# Rounds tables only: keep a round when the smallest per-player time_alive is
# at least 30, i.e. discard rounds in which any player was alive for less than that
for rounds_idx in (0, 1):
    rounds_agg = aggregate_dfs[rounds_idx]
    aggregate_dfs[rounds_idx] = rounds_agg.loc[rounds_agg['time_alive_min'] >= 30]

# Append the combined 3D + camera tables: halves first (index 4), then rounds (index 5).
# Columns shared by both inputs get the '_3d' / '_cam' suffix respectively.
halves_keys = ['matchID', 'mapName', 'side', 'team', 'Label']
rounds_keys = ['matchID', 'mapName', 'side', 'roundNum', 'Label']

merged_halves = pd.merge(
    aggregate_dfs[3], aggregate_dfs[2], on=halves_keys, suffixes=('_3d', '_cam')
)
aggregate_dfs.append(merged_halves)

merged_rounds = pd.merge(
    aggregate_dfs[1], aggregate_dfs[0], on=rounds_keys, suffixes=('_3d', '_cam')
)
aggregate_dfs.append(merged_rounds)

In [10]:
# Name each aggregate table. The names follow the positions in aggregate_dfs,
# and the insertion order matters because later cells slice list(dfs_dict.items()).
dataset_names = [
    'Camera Rounds',
    '3D Rounds',
    'Camera Halves',
    '3D Halves',
    '3D & Camera Halves',
    '3D & Camera Rounds',
]
dfs_dict = {
    dataset_name: aggregate_dfs[position]
    for position, dataset_name in enumerate(dataset_names)
}

In [25]:
# Author's reading of the counts printed by this cell:
# - Rounds are too short and too noisy to model the journey / dwell distributions accurately.
# - Halves are long enough for the power-law behaviour to emerge.
# - Camera trajectories move much faster, so stops cannot be detected reliably, which
#   often leads to non-decaying fits for the journey distribution.
# - 3D Halves agree considerably with human mobility patterns and have enough data,
#   so this makes sense.
#
# For each of the four single-view tables print, in order: its name, its shape, the
# shape of the rows with alpha_j_max > 1, of the rows with alpha_d_max > 1, and of
# the rows where both maxima are <= 1.
for dataset_name, table in list(dfs_dict.items())[:4]:
    journey_above_one = table['alpha_j_max'] > 1
    dwell_above_one = table['alpha_d_max'] > 1
    # Compared with <= directly (not by negating the masks above) so that
    # missing values are treated exactly as before
    both_at_most_one = (table['alpha_d_max'] <= 1) & (table['alpha_j_max'] <= 1)

    print(dataset_name)
    print(table.shape)
    for mask in (journey_above_one, dwell_above_one, both_at_most_one):
        print(table.loc[mask].shape)


Camera Rounds
(10875, 65)
(9255, 65)
(8248, 65)
(688, 65)
3D Rounds
(17800, 65)
(10456, 65)
(10158, 65)
(4009, 65)
Camera Halves
(1357, 61)
(183, 61)
(11, 61)
(1172, 61)
3D Halves
(1359, 61)
(2, 61)
(1, 61)
(1356, 61)


In [29]:
# Dictionary of feature sets for various ablation studies.
# Every feature is named '<metric>_<stat>'. Each set is the complete feature list
# either restricted to, or stripped of, one family of metrics, and always keeps the
# ordering of the "All" list (metric by metric, then min / max / mean / std).
_ablation_stats = ('min', 'max', 'mean', 'std')

# Metric families, listed in the order they appear in the "All" feature list
_regular_hull = ['ch_area', 'ch_volume']
_normed_hull = ['ch_area_normed', 'ch_volume_normed']
_fractal = ['frac_dim']
_entropy = ['C1', 'C2', 'C3', 'C4', 'C5']
_alpha = ['alpha_d', 'alpha_j']
_all_metrics = _regular_hull + _normed_hull + _fractal + _entropy + _alpha


def _only(*families):
    """Return the features of the metrics in `families`, in "All" order."""
    keep = {metric for family in families for metric in family}
    return [
        f'{metric}_{stat}'
        for metric in _all_metrics if metric in keep
        for stat in _ablation_stats
    ]


def _without(*families):
    """Return every feature except those of the metrics in `families`, in "All" order."""
    drop = {metric for family in families for metric in family}
    return [
        f'{metric}_{stat}'
        for metric in _all_metrics if metric not in drop
        for stat in _ablation_stats
    ]


# Key order is significant: the clustering cell picks a set by position
feature_sets = {
    "All": _without(),

    "No regular convex hull features": _without(_regular_hull),
    "Only regular convex hull features": _only(_regular_hull),

    "No convex hull features at all": _without(_regular_hull, _normed_hull),
    "Only convex hull features": _only(_regular_hull, _normed_hull),

    "No normalized convex hull features": _without(_normed_hull),
    "Only normalized convex hull features": _only(_normed_hull),

    "No entropy constant features": _without(_entropy),
    "Only entropy constant features": _only(_entropy),

    "No fractal dimension features": _without(_fractal),
    "Only fractal dimension features": _only(_fractal),

    "No alpha features": _without(_alpha),
    "Only alpha features": _only(_alpha),
}

In [77]:
# Datasets and ablation feature sets to run. They are chosen by name instead of
# by position in the dictionaries, so the selection does not silently change if
# the dictionaries are reordered or extended.
SELECTED_DATASETS = ['3D Halves']
SELECTED_FEATURE_SETS = ['No convex hull features at all']

# Datasets whose feature columns carry the '_3d' / '_cam' merge suffixes
MERGED_DATASETS = ['3D & Camera Halves', '3D & Camera Rounds']


def _cluster_and_report(data, feature_columns, min_cluster_size=10):
    """Cluster the training split of `data` with HDBSCAN and print each cluster's label mix.

    The rows are split 80/20 with a fixed seed; only the training portion is
    min-max scaled and clustered. The held-out portion is not used here, but the
    split is kept so the clustered rows stay the same as in earlier runs.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing `feature_columns` and a 'Label' column.
    feature_columns : list of str
        Columns used as clustering features.
    min_cluster_size : int, default 10
        Forwarded to HDBSCAN.

    Returns
    -------
    numpy.ndarray
        Cluster label assigned to each training row (HDBSCAN labels noise as -1).
    """
    X = data[feature_columns].to_numpy()
    le = LabelEncoder()
    y = le.fit_transform(data['Label'])

    # Train/test split (test part intentionally discarded)
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Normalize features
    X_train = MinMaxScaler().fit_transform(X_train)

    # Clustering
    hdb = HDBSCAN(min_cluster_size=min_cluster_size)
    print(f"Training shape: {X_train.shape}")
    y_pred = hdb.fit_predict(X_train)
    print(f"Cluster labels: {np.unique(y_pred)}")

    for c in np.unique(y_pred):
        # True labels of the training rows assigned to this cluster
        cluster_labels = y_train[y_pred == c]
        print(f"Cluster {c}: {cluster_labels.shape[0]} samples")

        # Share of each true label inside the cluster
        codes, counts = np.unique(cluster_labels, return_counts=True)
        for code, count in zip(codes, counts):
            label_name = le.classes_[code]
            proportion = count / np.sum(counts)
            print(f"  {label_name}: {proportion:.2f}")

    return y_pred


# Loop through selected DataFrames
for df_name in SELECTED_DATASETS:
    df = dfs_dict[df_name]

    # Loop through ablation studies (feature sets)
    for name in SELECTED_FEATURE_SETS:
        feature_columns = feature_sets[name]

        # Merged 3D & Camera tables hold every feature twice, once per suffix
        if df_name in MERGED_DATASETS:
            feature_columns = [f + '_3d' for f in feature_columns] + [f + '_cam' for f in feature_columns]

        # Keep only the '_mean' statistics
        feature_columns = [f for f in feature_columns if '_mean' in f]

        print(f"\n=== {df_name} - Feature Set: {name} ===")
        print(f"Feature count: {len(feature_columns)}")
        print(feature_columns)

        # Process each side individually
        for side in np.unique(df['side']):
            print(f"\n--- Side: {side} ---")
            _cluster_and_report(df[df['side'] == side], feature_columns)

        # No side splitting
        print("\nNO SIDE SPLITTING VERSION")
        _cluster_and_report(df, feature_columns)


=== 3D Halves - Feature Set: No convex hull features at all ===
Feature count: 8
['frac_dim_mean', 'C1_mean', 'C2_mean', 'C3_mean', 'C4_mean', 'C5_mean', 'alpha_d_mean', 'alpha_j_mean']

--- Side: CT ---
Training shape: (543, 8)
Cluster labels: [-1]
Cluster -1: 543 samples
  de_ancient_CT: 0.13
  de_dust2_CT: 0.11
  de_inferno_CT: 0.19
  de_mirage_CT: 0.19
  de_nuke_CT: 0.16
  de_overpass_CT: 0.10
  de_vertigo_CT: 0.12

--- Side: T ---
Training shape: (544, 8)
Cluster labels: [-1]
Cluster -1: 544 samples
  de_ancient_T: 0.11
  de_dust2_T: 0.10
  de_inferno_T: 0.20
  de_mirage_T: 0.20
  de_nuke_T: 0.17
  de_overpass_T: 0.11
  de_vertigo_T: 0.11

NO SIDE SPLITTING VERSION
Training shape: (1087, 8)
Cluster labels: [-1]
Cluster -1: 1087 samples
  de_ancient_CT: 0.06
  de_ancient_T: 0.06
  de_dust2_CT: 0.06
  de_dust2_T: 0.05
  de_inferno_CT: 0.09
  de_inferno_T: 0.10
  de_mirage_CT: 0.09
  de_mirage_T: 0.10
  de_nuke_CT: 0.09
  de_nuke_T: 0.08
  de_overpass_CT: 0.05
  de_overpass_T: 0.05
