In [3]:
# Import necessary libraries for data processing, clustering, and classification
import os
import pandas as pd
import numpy as np
from typing import List, Tuple
from sklearn.cluster import DBSCAN, KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Set random seed for reproducibility
np.random.seed(42)

# --- Updated Data Loading Function ---
def load_trajectories(video_folder: str) -> Tuple[List[pd.DataFrame], List[int]]:
    """
    Load trajectory data from CSV files in 'normal' and 'abnormal' subfolders within each numbered directory.

    Args:
        video_folder (str): Path to the parent folder containing numbered directories (e.g., '10', '11', '12').

    Returns:
        Tuple[List[pd.DataFrame], List[int]]: List of trajectory DataFrames and their labels (0: normal, 1: abnormal).
    """
    trajectories, labels = [], []

    # Walk through the video_folder to find all numbered directories
    for root, dirs, files in os.walk(video_folder):
        # Check if the current directory has 'normal' and 'abnormal' subfolders
        if 'normal' in dirs and 'abnormal' in dirs:
            numbered_dir = os.path.basename(root)
            print(f"Processing directory: {numbered_dir}")

            # Load from 'normal' folder
            normal_path = os.path.join(root, 'normal')
            normal_files = [os.path.join(normal_path, f) for f in os.listdir(normal_path) if f.endswith('.csv')]
            print(f"Found {len(normal_files)} normal CSV files in {normal_path}")
            for file in normal_files:
                df = pd.read_csv(file)[['frameNo', 'left', 'top', 'w', 'h']].sort_values(by='frameNo')
                trajectories.append(df)
                labels.append(0)  # Label 0 for normal

            # Load from 'abnormal' folder
            abnormal_path = os.path.join(root, 'abnormal')
            abnormal_files = [os.path.join(abnormal_path, f) for f in os.listdir(abnormal_path) if f.endswith('.csv')]
            print(f"Found {len(abnormal_files)} abnormal CSV files in {abnormal_path}")
            for file in abnormal_files:
                df = pd.read_csv(file)[['frameNo', 'left', 'top', 'w', 'h']].sort_values(by='frameNo')
                trajectories.append(df)
                labels.append(1)  # Label 1 for abnormal

    return trajectories, labels

# --- Helper Functions for Feature Extraction ---
def df_to_points(df: pd.DataFrame) -> List[Tuple[int, float, float]]:
    """Convert DataFrame to a list of (frame_id, x, y) points, using center of bounding box."""
    points = []
    for _, row in df.iterrows():
        frame_id = int(row['frameNo'])
        x = row['left'] + row['w'] / 2
        y = row['top'] + row['h'] / 2
        points.append((frame_id, x, y))
    return points

def calculate_speed(points: List[Tuple[int, float, float]], fps: float) -> np.ndarray:
    """Calculate speed between consecutive points."""
    dx = np.diff([p[1] for p in points])
    dy = np.diff([p[2] for p in points])
    dt = np.diff([p[0] for p in points]) / fps
    distances = np.sqrt(dx**2 + dy**2)
    return distances / dt

def calculate_acceleration(points: List[Tuple[int, float, float]], fps: float) -> np.ndarray:
    """Calculate acceleration based on speed changes."""
    speeds = calculate_speed(points, fps)
    dt = np.diff([p[0] for p in points][:-1]) / fps  # Adjust for speed array length
    return np.diff(speeds) / dt

def calculate_direction_variance(points: List[Tuple[int, float, float]]) -> float:
    """Calculate variance in direction (angle) between consecutive points."""
    dx = np.diff([p[1] for p in points])
    dy = np.diff([p[2] for p in points])
    angles = np.arctan2(dy, dx)
    return np.var(np.diff(angles)) if len(angles) > 1 else 0.0

def derive_roundabout_geometry(normal_trajectories: List[pd.DataFrame]) -> Tuple[float, float, float]:
    """Derive roundabout center and radius from normal trajectories using KMeans."""
    all_points = []
    for traj_df in normal_trajectories:
        points = df_to_points(traj_df)
        all_points.extend([(p[1], p[2]) for p in points])
    if not all_points:
        return 0.0, 0.0, 100.0  # Default values
    points_array = np.array(all_points)
    kmeans = KMeans(n_clusters=1).fit(points_array)
    center_x, center_y = kmeans.cluster_centers_[0]
    distances = np.sqrt((points_array[:, 0] - center_x)**2 + (points_array[:, 1] - center_y)**2)
    radius = np.median(distances)
    return center_x, center_y, radius

def calculate_circular_adherence(points: List[Tuple[int, float, float]], center_x: float, center_y: float, radius: float) -> float:
    """Measure how closely the trajectory adheres to a circular path."""
    distances = np.sqrt((np.array([p[1] for p in points]) - center_x)**2 + (np.array([p[2] for p in points]) - center_y)**2)
    return 1.0 - np.mean(np.abs(distances - radius)) / radius if len(distances) > 0 else 0.0

def calculate_path_efficiency(points: List[Tuple[int, float, float]]) -> float:
    """Calculate efficiency as straight-line distance divided by total path length."""
    total_distance = np.sum(np.sqrt(np.diff([p[1] for p in points])**2 + np.diff([p[2] for p in points])**2))
    start_end_distance = np.sqrt((points[-1][1] - points[0][1])**2 + (points[-1][2] - points[0][2])**2)
    return start_end_distance / total_distance if total_distance > 0 else 1.0

def calculate_number_of_sharp_turns(points: List[Tuple[int, float, float]], threshold: float = np.pi/4) -> int:
    """Count sharp turns based on angle changes exceeding a threshold."""
    dx = np.diff([p[1] for p in points])
    dy = np.diff([p[2] for p in points])
    angles = np.arctan2(dy, dx)
    angle_changes = np.abs(np.diff(angles))
    return np.sum(angle_changes > threshold) if len(angle_changes) > 0 else 0

def calculate_zone_transition_count(points: List[Tuple[int, float, float]], center_x: float, center_y: float, radius: float) -> int:
    """Count transitions across a simplified zone boundary (inside/outside circle)."""
    distances = np.sqrt((np.array([p[1] for p in points]) - center_x)**2 + (np.array([p[2] for p in points]) - center_y)**2)
    inside = distances < radius
    return np.sum(np.diff(inside.astype(int)) != 0) if len(inside) > 1 else 0

def calculate_forbidden_transitions(points: List[Tuple[int, float, float]], center_x: float, center_y: float, radius: float) -> int:
    """Count forbidden transitions (simplified as erratic jumps across zones)."""
    distances = np.sqrt((np.array([p[1] for p in points]) - center_x)**2 + (np.array([p[2] for p in points]) - center_y)**2)
    zones = (distances < radius).astype(int)  # 0: outside, 1: inside
    transitions = np.diff(zones)
    return np.sum(transitions != 0) if len(transitions) > 0 else 0

# --- Feature Extraction ---
def extract_features(trajectories: List[pd.DataFrame], labels: List[int], fps: float) -> pd.DataFrame:
    """Extract features from a list of trajectories."""
    normal_trajectories = [traj for traj, label in zip(trajectories, labels) if label == 0]
    center_x, center_y, radius = derive_roundabout_geometry(normal_trajectories)

    results = []
    for idx, (traj_df, label) in enumerate(zip(trajectories, labels)):
        points = df_to_points(traj_df)
        speeds = calculate_speed(points, fps)
        accels = calculate_acceleration(points, fps)

        features = {
            'track_id': idx + 1,
            'label': label,
            'speed_variance': np.var(speeds) if len(speeds) > 0 else 0.0,
            'max_acceleration': np.max(np.abs(accels)) if len(accels) > 0 else 0.0,
            'direction_variance': calculate_direction_variance(points),
            'circular_adherence': calculate_circular_adherence(points, center_x, center_y, radius),
            'path_efficiency': calculate_path_efficiency(points),
            'number_of_sharp_turns': calculate_number_of_sharp_turns(points),
            'zone_transition_count': calculate_zone_transition_count(points, center_x, center_y, radius),
            'forbidden_transitions': calculate_forbidden_transitions(points, center_x, center_y, radius)
        }
        results.append(features)

    return pd.DataFrame(results)

# --- Data Preprocessing and Cleaning ---
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the DataFrame by replacing NaN/infinite values with 0."""
    return df.replace([np.inf, -np.inf], np.nan).fillna(0)

# --- Main Execution ---
if __name__ == "__main__":
    # Set the path to the parent folder containing numbered directories
    video_folder = "/home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed"
    print(f"Using video_folder: {video_folder}")

    # Load trajectories from all numbered directories
    trajectories, labels = load_trajectories(video_folder)
    print(f"Loaded {len(trajectories)} trajectories.")

    if len(trajectories) == 0:
        print("No trajectories loaded. Check the directory structure and file paths.")
    else:
        # Extract features
        fps = 1.0  # Adjust based on your video's frame rate
        features_df = extract_features(trajectories, labels, fps)
        print("Feature extraction complete. Sample data:")
        print(features_df.head())

        # Clean data
        cleaned_df = clean_dataframe(features_df)

        # Prepare feature matrix and labels
        numeric_features = [
            'speed_variance', 'max_acceleration', 'direction_variance', 'circular_adherence',
            'path_efficiency', 'number_of_sharp_turns', 'zone_transition_count', 'forbidden_transitions'
        ]
        X = cleaned_df[numeric_features]
        y = cleaned_df['label']
        print(f"Prepared {len(X)} samples with {len(numeric_features)} features.")

        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Step 1: Apply DBSCAN for initial anomaly detection
        db = DBSCAN(eps=0.5, min_samples=5).fit(X_scaled)
        cluster_labels = db.labels_
        initial_preds = [1 if label == -1 else 0 for label in cluster_labels]  # -1: outlier (abnormal)

        # Step 2: Train Random Forest on labeled data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        clf = RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Evaluate results
        print("\nHybrid Classification Results (Random Forest):")
        print(classification_report(y_test, y_pred))
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

        # Display feature importances
        print("\nFeature Importances:")
        importances = pd.DataFrame({'Feature': numeric_features, 'Importance': clf.feature_importances_})
        print(importances.sort_values(by='Importance', ascending=False))

Using video_folder: /home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed
Processing directory: 10
Found 101 normal CSV files in /home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed/10/normal
Found 58 abnormal CSV files in /home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed/10/abnormal
Processing directory: 11
Found 248 normal CSV files in /home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed/11/normal
Found 103 abnormal CSV files in /home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed/11/abnormal
Processing directory: 12
Found 206 normal CSV files in /home/run/media/localdiskD/Ahmedabad University/6th SEM/ML/ML_2025_4_Cluster_555/dataset(Copy)/processed/12/normal
Found 70 abnormal CSV files in /home/run/media/localdiskD/Ahmedabad 

  return distances / dt
  x = asanyarray(arr - arrmean)
  return distances / dt
  a = op(a[slice1], a[slice2])


Feature extraction complete. Sample data:
   track_id  label  speed_variance  max_acceleration  direction_variance  \
0         1      0        0.265442          0.914214            6.992715   
1         2      0       11.990207         13.500064           16.192243   
2         3      0        2.742187          7.433034            0.217069   
3         4      0        7.259035          8.997568           13.449146   
4         5      0       17.242700         42.074009           15.200677   

   circular_adherence  path_efficiency  number_of_sharp_turns  \
0            0.818975         0.245147                      4   
1            0.216107         0.988577                     15   
2            0.704500         0.991076                      4   
3            0.581755         0.993253                     87   
4            0.471818         0.983496                    109   

   zone_transition_count  forbidden_transitions  
0                      0                      0  
1         