In [7]:
# Import necessary libraries for data processing, clustering, and classification
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from typing import List, Tuple
from sklearn.cluster import DBSCAN, KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Set random seed for reproducibility
np.random.seed(42)

# --- Updated Data Loading Function ---
def load_trajectories(video_folder: str) -> Tuple[List[pd.DataFrame], List[int]]:
    """
    Load trajectory data from CSV files in 'normal' and 'abnormal' subfolders within each numbered directory.

    Args:
        video_folder (str): Path to the parent folder containing numbered directories (e.g., '10', '11', '12').

    Returns:
        Tuple[List[pd.DataFrame], List[int]]: List of trajectory DataFrames and their labels (0: normal, 1: abnormal).
    """
    trajectories, labels = [], []

    for root, dirs, _ in os.walk(video_folder):
        if 'normal' in dirs and 'abnormal' in dirs:
            numbered_dir = os.path.basename(root)
            print(f"Processing directory: {numbered_dir}")

            # Normal trajectories
            normal_path = os.path.join(root, 'normal')
            normal_files = [os.path.join(normal_path, f) for f in os.listdir(normal_path) if f.endswith('.csv')]
            print(f"Found {len(normal_files)} normal CSV files")
            for file in normal_files:
                df = pd.read_csv(file)[['frameNo', 'left', 'top', 'w', 'h']].sort_values(by='frameNo')
                trajectories.append(df)
                labels.append(0)

            # Abnormal trajectories
            abnormal_path = os.path.join(root, 'abnormal')
            abnormal_files = [os.path.join(abnormal_path, f) for f in os.listdir(abnormal_path) if f.endswith('.csv')]
            print(f"Found {len(abnormal_files)} abnormal CSV files")
            for file in abnormal_files:
                df = pd.read_csv(file)[['frameNo', 'left', 'top', 'w', 'h']].sort_values(by='frameNo')
                trajectories.append(df)
                labels.append(1)
    return trajectories, labels

# --- Helper Functions for Feature Extraction ---
def df_to_points(df: pd.DataFrame) -> List[Tuple[int, float, float]]:
    """Convert DataFrame to list of (frame_id, x, y) points using bounding box center."""
    return [(int(row['frameNo']), row['left'] + row['w'] / 2, row['top'] + row['h'] / 2) for _, row in df.iterrows()]

def calculate_speed(points: List[Tuple[int, float, float]], fps: float) -> np.ndarray:
    """Calculate speed between consecutive points."""
    dx = np.diff([p[1] for p in points])
    dy = np.diff([p[2] for p in points])
    dt = np.diff([p[0] for p in points]) / fps
    distances = np.sqrt(dx**2 + dy**2)
    return distances / dt

def calculate_trajectory_smoothness(points: List[Tuple[int, float, float]]) -> float:
    """Calculate smoothness as variance of curvature (angle changes)."""
    dx = np.diff([p[1] for p in points])
    dy = np.diff([p[2] for p in points])
    angles = np.arctan2(dy, dx)
    return np.var(np.diff(angles)) if len(angles) > 1 else 0.0

def calculate_direction_consistency(points: List[Tuple[int, float, float]]) -> float:
    """Measure total angle change to detect wrong-side or U-turns."""
    dx = np.diff([p[1] for p in points])
    dy = np.diff([p[2] for p in points])
    angles = np.arctan2(dy, dx)
    total_change = np.sum(np.abs(np.diff(angles)))
    return total_change if len(angles) > 1 else 0.0

def derive_roundabout_geometry(normal_trajectories: List[pd.DataFrame]) -> Tuple[float, float, float]:
    """Derive roundabout center and radius from normal trajectories."""
    all_points = [p for traj_df in normal_trajectories for p in df_to_points(traj_df)]
    if not all_points:
        return 0.0, 0.0, 100.0  # Default values
    points_array = np.array([(p[1], p[2]) for p in all_points])
    center_x, center_y = np.mean(points_array, axis=0)
    distances = np.sqrt((points_array[:, 0] - center_x)**2 + (points_array[:, 1] - center_y)**2)
    radius = np.median(distances)
    return center_x, center_y, radius

def calculate_roundabout_traversal(points: List[Tuple[int, float, float]], center_x: float, center_y: float, radius: float) -> int:
    """Binary flag: 1 if trajectory passes through roundabout, 0 otherwise."""
    distances = np.sqrt((np.array([p[1] for p in points]) - center_x)**2 + (np.array([p[2] for p in points]) - center_y)**2)
    return 1 if np.any(distances < radius) else 0

def calculate_path_efficiency(points: List[Tuple[int, float, float]]) -> float:
    """Calculate efficiency as straight-line distance divided by total path length."""
    total_distance = np.sum(np.sqrt(np.diff([p[1] for p in points])**2 + np.diff([p[2] for p in points])**2))
    start_end_distance = np.sqrt((points[-1][1] - points[0][1])**2 + (points[-1][2] - points[0][2])**2)
    return start_end_distance / total_distance if total_distance > 0 else 1.0

# --- Feature Extraction ---
def extract_features(trajectories: List[pd.DataFrame], labels: List[int], fps: float) -> pd.DataFrame:
    """Extract refined features from trajectories."""
    normal_trajectories = [traj for traj, label in zip(trajectories, labels) if label == 0]
    center_x, center_y, radius = derive_roundabout_geometry(normal_trajectories)

    results = []
    for idx, (traj_df, label) in enumerate(zip(trajectories, labels)):
        points = df_to_points(traj_df)
        speeds = calculate_speed(points, fps)

        features = {
            'track_id': idx + 1,
            'label': label,
            'speed_variance': np.var(speeds) if len(speeds) > 0 else 0.0,
            'trajectory_smoothness': calculate_trajectory_smoothness(points),
            'direction_consistency': calculate_direction_consistency(points),
            'roundabout_traversal': calculate_roundabout_traversal(points, center_x, center_y, radius),
            'path_efficiency': calculate_path_efficiency(points),
        }
        results.append(features)

    return pd.DataFrame(results)

# --- Data Preprocessing and Cleaning ---
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Clean DataFrame by replacing NaN/infinite values with 0."""
    return df.replace([np.inf, -np.inf], np.nan).fillna(0)

# --- Main Execution ---
if __name__ == "__main__":
    # Set path to dataset
    video_folder = "/home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed"
    print(f"Using video_folder: {video_folder}")

    # Load trajectories
    trajectories, labels = load_trajectories(video_folder)
    print(f"Loaded {len(trajectories)} trajectories.")

    if len(trajectories) == 0:
        print("No trajectories loaded. Check directory structure.")
    else:
        # Extract features
        fps = 1.0  # Adjust based on video frame rate
        features_df = extract_features(trajectories, labels, fps)
        print("Feature extraction complete. Sample data:")
        print(features_df.head())

        # Clean data
        cleaned_df = clean_dataframe(features_df)

        # Prepare feature matrix and labels
        numeric_features = [
            'speed_variance', 'trajectory_smoothness', 'direction_consistency',
            'roundabout_traversal', 'path_efficiency'
        ]
        X = cleaned_df[numeric_features]
        y = cleaned_df['label']
        print(f"Prepared {len(X)} samples with {len(numeric_features)} features.")

        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Step 1: Apply DBSCAN for initial anomaly detection
        db = DBSCAN(eps=0.5, min_samples=5).fit(X_scaled)
        cluster_labels = db.labels_
        initial_preds = [1 if label == -1 else 0 for label in cluster_labels]

        # Step 2: Train XGBoost on labeled data
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
        clf = xgb.XGBClassifier(random_state=42, scale_pos_weight=sum(y == 0) / sum(y == 1), use_label_encoder=False, eval_metric='logloss')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Evaluate results
        print("\nHybrid Classification Results (XGBoost):")
        print(classification_report(y_test, y_pred))
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

        # Display feature importances
        print("\nFeature Importances:")
        importances = pd.DataFrame({'Feature': numeric_features, 'Importance': clf.feature_importances_})
        print(importances.sort_values(by='Importance', ascending=False))

Using video_folder: /home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed
Processing directory: 10
Found 101 normal CSV files
Found 58 abnormal CSV files
Processing directory: 11
Found 248 normal CSV files
Found 103 abnormal CSV files
Processing directory: 12
Found 206 normal CSV files
Found 70 abnormal CSV files
Loaded 786 trajectories.


  return distances / dt
  x = asanyarray(arr - arrmean)
  return distances / dt


Feature extraction complete. Sample data:
   track_id  label  speed_variance  trajectory_smoothness  \
0         1      0        0.265442               6.992715   
1         2      0       11.990207              16.192243   
2         3      0        2.742187               0.217069   
3         4      0        7.259035              13.449146   
4         5      0       17.242700              15.200677   

   direction_consistency  roundabout_traversal  path_efficiency  
0              10.493970                     0         0.245147  
1              87.664052                     0         0.988577  
2              23.343119                     1         0.991076  
3             546.752129                     1         0.993253  
4             662.820302                     1         0.983496  
Prepared 786 samples with 5 features.


Parameters: { "use_label_encoder" } are not used.




Hybrid Classification Results (XGBoost):
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       119
           1       0.48      0.51      0.49        39

    accuracy                           0.74       158
   macro avg       0.66      0.66      0.66       158
weighted avg       0.75      0.74      0.74       158

Accuracy: 0.74

Feature Importances:
                 Feature  Importance
3   roundabout_traversal    0.363404
4        path_efficiency    0.250001
2  direction_consistency    0.136798
0         speed_variance    0.129912
1  trajectory_smoothness    0.119885
