In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler
# For time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet

# For machine learning
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
import lightgbm as lgb

In [10]:
# Load the preprocessed data
df = pd.read_csv('data/preprocessed_tollplaza_data.csv')
df['initiated_time'] = pd.to_datetime(df['initiated_time'])
df['time_interval'] = pd.to_datetime(df['time_interval'])
print(df['initiated_time'])
print("Data loaded successfully. Shape:", df.shape)

0        2024-03-19 21:54:00
1        2024-03-19 18:59:00
2        2024-03-19 00:14:00
3        2024-03-19 18:19:00
4        2024-03-19 15:03:00
                 ...        
367069   2024-03-19 07:31:00
367070   2024-03-19 12:55:00
367071   2024-03-19 13:11:00
367072   2024-03-19 17:19:00
367073   2024-03-19 17:28:00
Name: initiated_time, Length: 367074, dtype: datetime64[ns]
Data loaded successfully. Shape: (367074, 40)


In [11]:

# 1. TRAFFIC PATTERN ANALYSIS
print("\n1. TRAFFIC PATTERN ANALYSIS")
print("-" * 50)

# Aggregate data by time intervals
def create_time_series_data(df, interval='15min'):
    """Create time series data at specified intervals."""
    ts_data = df.set_index('initiated_time').resample(interval).agg({
        'txn_amount': 'sum',
        'SlNo.': 'count',  # Count of transactions
        'inn_rr_time_sec': 'mean'  # Average processing time
    }).reset_index()
    
    ts_data.rename(columns={'SlNo.': 'transaction_count'}, inplace=True)
    return ts_data

# Create 15-minute interval data
ts_15min = create_time_series_data(df, '15min')
print("Time series data created at 15-minute intervals:")
print(ts_15min.head())

# Create hourly data
ts_hourly = create_time_series_data(df, '1H')
print("\nTime series data created at hourly intervals:")
print(ts_hourly.head())

# Visualize traffic patterns
plt.figure(figsize=(14, 7))
plt.plot(ts_15min['initiated_time'], ts_15min['transaction_count'])
plt.title('Traffic Volume Over Time (15-minute intervals)')
plt.xlabel('Time')
plt.ylabel('Number of Transactions')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('images/traffic_pattern_15min.png')
plt.close()

# Decompose time series to analyze trend, seasonality, and residuals
def analyze_time_series_components(ts_data, column='transaction_count'):
    """Decompose time series into trend, seasonality, and residual components."""
    # Set the index to datetime for decomposition
    ts = ts_data.set_index('initiated_time')[column]
    
    # Decompose the time series
    decomposition = seasonal_decompose(ts, model='additive', period=12)  # 24 periods for hourly data
    
    # Plot the decomposition
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(14, 12))
    decomposition.observed.plot(ax=ax1)
    ax1.set_title('Observed')
    decomposition.trend.plot(ax=ax2)
    ax2.set_title('Trend')
    decomposition.seasonal.plot(ax=ax3)
    ax3.set_title('Seasonality')
    decomposition.resid.plot(ax=ax4)
    ax4.set_title('Residuals')
    plt.tight_layout()
    plt.savefig('images/time_series_decomposition.png')
    plt.close()
    
    return decomposition

# Analyze hourly traffic patterns
print("\nAnalyzing hourly traffic patterns...")
decomposition = analyze_time_series_components(ts_hourly)



1. TRAFFIC PATTERN ANALYSIS
--------------------------------------------------
Time series data created at 15-minute intervals:
       initiated_time  txn_amount  transaction_count  inn_rr_time_sec
0 2024-03-19 00:00:00      266256               2469      1882.420008
1 2024-03-19 00:15:00      281348               2518       958.048848
2 2024-03-19 00:30:00      233571               2091       486.887135
3 2024-03-19 00:45:00      237070               2015      1060.671464
4 2024-03-19 01:00:00      245842               2097      2214.508822

Time series data created at hourly intervals:
       initiated_time  txn_amount  transaction_count  inn_rr_time_sec
0 2024-03-19 00:00:00     1018245               9093      1123.435170
1 2024-03-19 01:00:00      917015               7692      1478.759880
2 2024-03-19 02:00:00      790608               6569       913.394885
3 2024-03-19 03:00:00      755165               6123       417.210681
4 2024-03-19 04:00:00      805013               6792  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# For time series analysis and models
from statsmodels.tsa.seasonal import seasonal_decompose
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

# For machine learning pipelines and evaluation
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer

import logging
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')

# -------------------------------
# 2. IMPROVED PREDICTIVE MODELS DEVELOPMENT
# -------------------------------
logging.info("2. IMPROVED PREDICTIVE MODELS DEVELOPMENT")

# 2.1 Traffic Volume Prediction Model
logging.info("2.1 Traffic Volume Prediction Model")

# -------------------------------
# 2.1.1 Enhanced Feature preparation
# -------------------------------
def prepare_features_for_prediction(df):
    """
    Prepare features for traffic prediction models with enhanced feature engineering.
    Adds time-based features, cyclical encodings, and multiple lag features and rolling statistics.
    """
    features = df.copy()
    
    # Time-based features
    features['hour'] = features['initiated_time'].dt.hour
    features['day_of_week'] = features['initiated_time'].dt.dayofweek
    features['is_weekend'] = features['day_of_week'].isin([5, 6]).astype(int)
    features['quarter_of_day'] = features['hour'] // 6
    features['day_part'] = pd.cut(features['hour'], 
                                bins=[0, 6, 12, 18, 24], 
                                labels=['night', 'morning', 'afternoon', 'evening'])
    features['day_part'] = features['day_part'].astype('category').cat.codes
    
    # Create cyclical time features
    features['hour_sin'] = np.sin(2 * np.pi * features['hour'] / 24)
    features['hour_cos'] = np.cos(2 * np.pi * features['hour'] / 24)
    features['day_sin'] = np.sin(2 * np.pi * features['day_of_week'] / 7)
    features['day_cos'] = np.cos(2 * np.pi * features['day_of_week'] / 7)
    
    # Add lag features with more historical context
    for lag in range(1, 6):  # Use 5 lag periods
        features[f'lag_{lag}'] = features['transaction_count'].shift(lag)
    
    # Add rolling statistics
    features['rolling_mean_3'] = features['transaction_count'].rolling(window=3).mean()
    features['rolling_std_3'] = features['transaction_count'].rolling(window=3).std()
    features['rolling_mean_6'] = features['transaction_count'].rolling(window=6).mean()
    features['rolling_max_6'] = features['transaction_count'].rolling(window=6).max()
    features['rolling_min_6'] = features['transaction_count'].rolling(window=6).min()
    
    # Calculate transaction differences
    features['tx_diff_1'] = features['transaction_count'].diff(1)
    features['tx_diff_2'] = features['transaction_count'].diff(2)
    
    # Add transaction rate features
    if 'inn_rr_time_sec' in features.columns:
        features['tx_per_second'] = features['transaction_count'] / features['inn_rr_time_sec']
        features['tx_per_second'].fillna(features['tx_per_second'].mean(), inplace=True)
    
    # Add average transaction amount per transaction
    if 'txn_amount' in features.columns:
        features['avg_amount_per_tx'] = features['txn_amount'] / features['transaction_count']
        features['avg_amount_per_tx'].fillna(features['avg_amount_per_tx'].mean(), inplace=True)
    
    # Drop NaN values created by lag and rolling features
    features.dropna(inplace=True)
    
    # Debug: output basic stats of target variable
    logging.debug("Target variable stats after enhanced feature engineering: mean=%.2f, std=%.2f", 
                  features['transaction_count'].mean(), features['transaction_count'].std())
    
    return features

# Assume ts_15min is your 15-minute aggregated DataFrame that already includes 'transaction_count'
ts_features = prepare_features_for_prediction(ts_15min)
logging.debug("Enhanced features shape: %s", ts_features.shape)


# -------------------------------
# 2.1.2 Improved data splitting and feature selection
# -------------------------------
def select_features_and_split(features_df):

    # Select features with different types for separate preprocessing
    time_cyclical_features = ['hour_sin', 'hour_cos', 'day_sin', 'day_cos']
    categorical_features = ['is_weekend', 'quarter_of_day', 'day_part']
    numerical_features = ['hour', 'day_of_week']
    
    # Get lag and rolling features dynamically
    lag_features = [col for col in features_df.columns if col.startswith('lag_')]
    rolling_features = [col for col in features_df.columns if col.startswith('rolling_')]
    diff_features = [col for col in features_df.columns if col.startswith('tx_diff')]
    
    # Additional features if available
    additional_features = []
    if 'tx_per_second' in features_df.columns:
        additional_features.append('tx_per_second')
    if 'avg_amount_per_tx' in features_df.columns:
        additional_features.append('avg_amount_per_tx')
    
    # Combine all features
    all_features = time_cyclical_features + categorical_features + numerical_features + \
                  lag_features + rolling_features + diff_features + additional_features
    
    X = features_df[all_features]
    y = features_df['transaction_count']
    
    # Return features categorized for proper preprocessing
    feature_groups = {
        'time_cyclical': time_cyclical_features,
        'categorical': categorical_features,
        'numerical': numerical_features,
        'lag': lag_features,
        'rolling': rolling_features,
        'diff': diff_features,
        'additional': additional_features
    }
    
    return X, y, feature_groups

X, y, feature_groups = select_features_and_split(ts_features)
logging.debug("Selected features: %s", X.columns.tolist())

# -------------------------------
# 2.1.3 Improved Model Training & Evaluation
# -------------------------------
def create_preprocessor(feature_groups):
    """
    Create a column transformer to appropriately preprocess different feature types
    """
    transformers = [
        ('time_cyclical', 'passthrough', feature_groups['time_cyclical']),
        ('categorical', 'passthrough', feature_groups['categorical']),
        ('numerical', StandardScaler(), feature_groups['numerical']),
        ('lag', RobustScaler(), feature_groups['lag']),
        ('rolling', RobustScaler(), feature_groups['rolling']),
        ('diff', RobustScaler(), feature_groups['diff'])
    ]
    
    # Add additional features if they exist
    if feature_groups['additional']:
        transformers.append(('additional', RobustScaler(), feature_groups['additional']))
    
    preprocessor = ColumnTransformer(transformers, remainder='drop')
    return preprocessor

def train_traffic_prediction_model(X, y, feature_groups):

    # Create preprocessor
    preprocessor = create_preprocessor(feature_groups)
    
    # Define time series split for proper evaluation
    tscv = TimeSeriesSplit(n_splits=3, test_size=12)
    
    # Define candidate models
    models = {
        'XGBoost': XGBRegressor(
            n_estimators=200, 
            learning_rate=0.05, 
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            gamma=1,
            reg_lambda=2,
            random_state=42
        ),
        'RandomForest': RandomForestRegressor(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=4,
            random_state=42
        ),
        'LightGBM': LGBMRegressor(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=2,
            random_state=42
        )
    }
    
    best_model = None
    best_score = float('-inf')
    best_metrics = None
    
    # Train and evaluate each model
    for name, model_instance in models.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', model_instance)
        ])
        
        # Use cross-validation to evaluate model
        cv_scores = []
        mae_scores = []
        rmse_scores = []
        r2_scores = []
        
        logging.info(f"Evaluating {name} with time series cross-validation...")
        
        for train_idx, test_idx in tscv.split(X):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)
            
            mae_scores.append(mae)
            rmse_scores.append(rmse)
            r2_scores.append(r2)
        
        # Average scores across folds
        avg_mae = np.mean(mae_scores)
        avg_rmse = np.mean(rmse_scores)
        avg_r2 = np.mean(r2_scores)
        
        logging.info(f"{name} - Average MAE: {avg_mae:.2f}, RMSE: {avg_rmse:.2f}, R²: {avg_r2:.4f}")
        
        # Update best model if this one is better
        if avg_r2 > best_score:
            best_score = avg_r2
            best_model = pipeline
            best_metrics = {
                'name': name,
                'mae': avg_mae,
                'rmse': avg_rmse,
                'r2': avg_r2
            }
    
    # Train the best model on all data
    logging.info(f"Best model: {best_metrics['name']} with R²: {best_metrics['r2']:.4f}")
    best_model.fit(X, y)
    
    # Get feature importance if the model supports it
    if hasattr(best_model['regressor'], 'feature_importances_'):
        feature_names = X.columns
        importances = best_model['regressor'].feature_importances_
        sorted_idx = np.argsort(importances)[::-1]
        
        plt.figure(figsize=(12, 8))
        plt.barh(range(len(sorted_idx)), importances[sorted_idx])
        plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
        plt.title('Feature Importance')
        plt.tight_layout()
        plt.savefig('images/feature_importance.png')
        plt.close()
    
    # Plot actual vs predicted for the last fold
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    y_pred = best_model.predict(X_test)
    
    plt.figure(figsize=(14, 7))
    plt.plot(y_test.values, label='Actual', marker='o')
    plt.plot(y_pred, label='Predicted', marker='x')
    plt.title('Traffic Volume: Actual vs Predicted')
    plt.xlabel('Time Points')
    plt.ylabel('Transaction Count')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('images/traffic_prediction_results.png')
    plt.close()
    
    return best_model, best_metrics

logging.info("Training improved traffic prediction model...")
traffic_model, metrics = train_traffic_prediction_model(X, y, feature_groups)
logging.info(f"Final Traffic Prediction metrics - MAE: {metrics['mae']:.2f}, RMSE: {metrics['rmse']:.2f}, R²: {metrics['r2']:.4f}")

# -------------------------------
# 2.2 Vehicle Class Distribution Prediction
# -------------------------------
logging.info("2.2 Improved Vehicle Class Distribution Prediction")

def create_vehicle_class_timeseries(df, interval='1H'):
    """
    Create time series data of vehicle class distribution.
    Returns absolute counts (vc_ts) and percentage distribution (vc_ts_pct).
    """
    vc_ts = df.set_index('initiated_time').groupby([pd.Grouper(freq=interval), 'vehicle_class_code']).size().unstack(fill_value=0)
    vc_ts_pct = vc_ts.div(vc_ts.sum(axis=1), axis=0) * 100
    return vc_ts, vc_ts_pct

# Create vehicle class time series
vc_hourly, vc_hourly_pct = create_vehicle_class_timeseries(df, '1H')
logging.debug("Vehicle class distribution (percentage) shape: %s", vc_hourly_pct.shape)

# Plot vehicle class distribution
plt.figure(figsize=(14, 8))
vc_hourly_pct.plot(kind='area', stacked=True, colormap='viridis')
plt.title('Vehicle Class Distribution Over Time')
plt.xlabel('Time')
plt.ylabel('Percentage of Vehicles')
plt.legend(title='Vehicle Class', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig('images/vehicle_class_distribution.png')
plt.close()

# -------------------------------
# 2.2.1 Vehicle Class Prediction
# -------------------------------
def train_vehicle_class_prediction_model(vc_hourly):

    vc_features = vc_hourly.reset_index()
    
    # Enhanced time features
    vc_features['hour'] = vc_features['initiated_time'].dt.hour
    vc_features['day_of_week'] = vc_features['initiated_time'].dt.dayofweek
    vc_features['is_weekend'] = vc_features['day_of_week'].isin([5, 6]).astype(int)
    vc_features['day_part'] = pd.cut(vc_features['hour'], 
                                      bins=[0, 6, 12, 18, 24], 
                                      labels=['night', 'morning', 'afternoon', 'evening'])
    vc_features['day_part'] = vc_features['day_part'].astype('category').cat.codes
    
    # Cyclical features
    vc_features['hour_sin'] = np.sin(2 * np.pi * vc_features['hour'] / 24)
    vc_features['hour_cos'] = np.cos(2 * np.pi * vc_features['hour'] / 24)
    vc_features['day_sin'] = np.sin(2 * np.pi * vc_features['day_of_week'] / 7)
    vc_features['day_cos'] = np.cos(2 * np.pi * vc_features['day_of_week'] / 7)
    
    # Feature selection
    X_cols = ['hour', 'day_of_week', 'is_weekend', 'day_part',
              'hour_sin', 'hour_cos', 'day_sin', 'day_cos']
    X = vc_features[X_cols]
    y = vc_hourly.values  # each column corresponds to a vehicle class
    
    # Create column transformer for different feature types
    preprocessor = ColumnTransformer([
        ('cyclical', 'passthrough', ['hour_sin', 'hour_cos', 'day_sin', 'day_cos']),
        ('numerical', StandardScaler(), ['hour', 'day_of_week']),
        ('categorical', 'passthrough', ['is_weekend', 'day_part'])
    ])
    
    # Create time series split for proper validation
    tscv = TimeSeriesSplit(n_splits=3, test_size=6)
    
    # Train and evaluate the model
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=200, 
                                           max_depth=8, 
                                           min_samples_split=5, 
                                           random_state=42))
    ])
    
    # Use cross-validation to evaluate model
    mae_scores = []
    rmse_scores = []
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        mae_scores.append(mae)
        rmse_scores.append(rmse)
    
    # Average scores across folds
    avg_mae = np.mean(mae_scores)
    avg_rmse = np.mean(rmse_scores)
    
    logging.info(f"Vehicle Class Prediction - Average MAE: {avg_mae:.2f}, RMSE: {avg_rmse:.2f}")
    
    # Train final model on all data
    model.fit(X, y)
    
    # Get a test prediction to evaluate
    last_date = vc_features['initiated_time'].max()
    next_hour = last_date + pd.Timedelta(hours=1)
    
    # Create feature vector for next hour prediction
    next_x = pd.DataFrame({
        'hour': [next_hour.hour],
        'day_of_week': [next_hour.dayofweek],
        'is_weekend': [1 if next_hour.dayofweek >= 5 else 0],
        'day_part': [next_hour.hour // 6],
        'hour_sin': [np.sin(2 * np.pi * next_hour.hour / 24)],
        'hour_cos': [np.cos(2 * np.pi * next_hour.hour / 24)],
        'day_sin': [np.sin(2 * np.pi * next_hour.dayofweek / 7)],
        'day_cos': [np.cos(2 * np.pi * next_hour.dayofweek / 7)]
    })
    
    # Predict next hour's distribution
    next_distribution = model.predict(next_x)[0]  # Get first element to make it 2D
    predicted_distribution = pd.DataFrame([next_distribution], columns=vc_hourly.columns)
    
    logging.debug("Predicted vehicle class distribution for next hour:\n%s", 
                  predicted_distribution.to_string())
    
    # Visualize predictions vs actuals for last test fold
    last_train_idx, last_test_idx = list(tscv.split(X))[-1]
    X_train, X_test = X.iloc[last_train_idx], X.iloc[last_test_idx]
    y_train, y_test = y[last_train_idx], y[last_test_idx]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Convert to DataFrames for easier plotting
    y_test_df = pd.DataFrame(y_test, columns=vc_hourly.columns)
    y_pred_df = pd.DataFrame(y_pred, columns=vc_hourly.columns)
    
    # Select top 5 vehicle classes by volume for visualization
    top_classes = vc_hourly.sum().sort_values(ascending=False).head(5).index.tolist()
    top_classes2 = vc_hourly.sum().sort_values(ascending=False).index.tolist()
    print("siochsiohchsioc",top_classes2)
    # Plot actual vs predicted for top classes
    plt.figure(figsize=(16, 10))
    for i, vc in enumerate(top_classes):
        plt.subplot(len(top_classes), 1, i+1)
        plt.plot(y_test_df.index, y_test_df[vc], 'b-', label=f'Actual {vc}')
        plt.plot(y_test_df.index, y_pred_df[vc], 'r--', label=f'Predicted {vc}')
        plt.ylabel('Count')
        plt.legend()
        if i == 0:
            plt.title('Top Vehicle Classes: Actual vs. Predicted')
        if i == len(top_classes) - 1:
            plt.xlabel('Time Index')
    
    plt.tight_layout()
    plt.savefig('images/vehicle_class_prediction_results.png')
    plt.close()
    
    return model, avg_mae, avg_rmse

logging.info("Training improved vehicle class prediction model...")
if not vc_hourly.empty:
    vc_model, vc_mae, vc_rmse = train_vehicle_class_prediction_model(vc_hourly)
else:
    logging.warning("No vehicle class data available for modeling")

INFO: 2. IMPROVED PREDICTIVE MODELS DEVELOPMENT
INFO: 2.1 Traffic Volume Prediction Model
DEBUG: Target variable stats after enhanced feature engineering: mean=3910.81, std=1215.02
DEBUG: Enhanced features shape: (91, 27)
DEBUG: Selected features: ['hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'is_weekend', 'quarter_of_day', 'day_part', 'hour', 'day_of_week', 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5', 'rolling_mean_3', 'rolling_std_3', 'rolling_mean_6', 'rolling_max_6', 'rolling_min_6', 'tx_diff_1', 'tx_diff_2', 'tx_per_second', 'avg_amount_per_tx']
INFO: Training improved traffic prediction model...
INFO: Evaluating XGBoost with time series cross-validation...
INFO: XGBoost - Average MAE: 272.28, RMSE: 323.88, R²: 0.0312
INFO: Evaluating RandomForest with time series cross-validation...
INFO: RandomForest - Average MAE: 223.56, RMSE: 288.83, R²: 0.0826
INFO: Evaluating LightGBM with time series cross-validation...
INFO: LightGBM - Average MAE: 293.91, RMSE: 370.25, R²: -0.5658
INFO:

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 300
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 18
[LightGBM] [Info] Start training from score 3476.400000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362
[LightGBM] [Info] Number of data points in the train set: 67, number of used features: 19
[LightGBM] [Info] Start training from score 3751.567164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 423
[LightGBM] [Info] Number of data points in the train set: 7

INFO: Final Traffic Prediction metrics - MAE: 223.56, RMSE: 288.83, R²: 0.0826
INFO: 2.2 Improved Vehicle Class Distribution Prediction
DEBUG: Vehicle class distribution (percentage) shape: (24, 14)
INFO: Training improved vehicle class prediction model...
INFO: Vehicle Class Prediction - Average MAE: 345.82, RMSE: 1089.15
DEBUG: Predicted vehicle class distribution for next hour:
vehicle_class_code        VC10        VC11        VC12        VC13       VC14      VC15  VC16        VC20         VC4         VC5  VC6         VC7        VC8        VC9
0                   643.255756  345.888879  617.227506  383.952339  94.918687  2.343327   0.0  593.235591  4354.61447  491.061954  0.0  463.054702  24.959446  27.226877


<Figure size 1400x800 with 0 Axes>

In [13]:
# 3. ANOMALY DETECTION SYSTEM
print("\n3. ANOMALY DETECTION SYSTEM")
print("-" * 50)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# 3.1 Traffic Pattern Anomaly Detection
print("\n3.1 Traffic Pattern Anomaly Detection")

def build_traffic_anomaly_detector(ts_data):
    """Build an anomaly detection model for traffic patterns using multiple methods."""
    # Prepare features
    features = ts_data.copy()
    features['hour'] = features['initiated_time'].dt.hour
    features['day_of_week'] = features['initiated_time'].dt.dayofweek
    features['is_weekend'] = features['day_of_week'].isin([5, 6]).astype(int)
    features['month'] = features['initiated_time'].dt.month
    features['day'] = features['initiated_time'].dt.day
    
    # Features for anomaly detection
    X = features[['transaction_count', 'hour', 'day_of_week', 'is_weekend', 'month', 'day']]
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize dictionary to store models and results
    models = {}
    
    # 1. Isolation Forest model
    print("\nTraining Isolation Forest model...")
    if_model = IsolationForest(
        contamination=0.05,  # Expected proportion of anomalies
        n_estimators=200,    # Increased number of trees
        max_samples='auto',
        random_state=42
    )
    
    if_model.fit(X_scaled)
    
    # Predict anomalies
    features['if_anomaly'] = if_model.predict(X_scaled)
    features['if_anomaly_score'] = if_model.score_samples(X_scaled)
    features['if_is_anomaly'] = features['if_anomaly'].apply(lambda x: 1 if x == -1 else 0)
    
    # Store model in dictionary
    models['isolation_forest'] = (if_model, features[features['if_is_anomaly'] == 1])
    
    # 2. DBSCAN clustering for anomaly detection
    print("Training DBSCAN model...")
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    features['dbscan_cluster'] = dbscan.fit_predict(X_scaled)
    
    # Mark outliers (cluster -1) as anomalies
    features['dbscan_is_anomaly'] = (features['dbscan_cluster'] == -1).astype(int)
    
    # Store model in dictionary
    models['dbscan'] = (dbscan, features[features['dbscan_is_anomaly'] == 1])
    
    # Create ensemble anomaly detection (combine results from both methods)
    features['ensemble_score'] = features['if_is_anomaly'] + features['dbscan_is_anomaly']
    features['ensemble_is_anomaly'] = (features['ensemble_score'] > 0).astype(int)
    
    # Count anomalies for each method
    if_anomaly_count = features['if_is_anomaly'].sum()
    dbscan_anomaly_count = features['dbscan_is_anomaly'].sum()
    ensemble_anomaly_count = features['ensemble_is_anomaly'].sum()
    
    print(f"Isolation Forest: Detected {if_anomaly_count} anomalies out of {len(features)} intervals")
    print(f"DBSCAN: Detected {dbscan_anomaly_count} anomalies out of {len(features)} intervals")
    print(f"Ensemble: Detected {ensemble_anomaly_count} anomalies out of {len(features)} intervals")
    
    # Visualize anomalies for each method
    plt.figure(figsize=(18, 12))
    
    # Isolation Forest
    plt.subplot(2, 2, 1)
    plt.scatter(features['initiated_time'], features['transaction_count'], 
                c=features['if_is_anomaly'], cmap='coolwarm', alpha=0.6)
    plt.title('Isolation Forest Anomaly Detection')
    plt.xlabel('Time')
    plt.ylabel('Transaction Count')
    plt.colorbar(label='Anomaly (1) / Normal (0)')
    
    # DBSCAN
    plt.subplot(2, 2, 2)
    plt.scatter(features['initiated_time'], features['transaction_count'], 
                c=features['dbscan_is_anomaly'], cmap='coolwarm', alpha=0.6)
    plt.title('DBSCAN Anomaly Detection')
    plt.xlabel('Time')
    plt.ylabel('Transaction Count')
    plt.colorbar(label='Anomaly (1) / Normal (0)')
    
    # Ensemble
    plt.subplot(2, 2, 3)
    plt.scatter(features['initiated_time'], features['transaction_count'], 
                c=features['ensemble_is_anomaly'], cmap='coolwarm', alpha=0.6)
    plt.title('Ensemble Anomaly Detection')
    plt.xlabel('Time')
    plt.ylabel('Transaction Count')
    plt.colorbar(label='Anomaly (1) / Normal (0)')
    
    plt.tight_layout()
    plt.savefig('images/traffic_anomalies_comparison.png')
    plt.close()
    
    # Return models and anomalies
    ensemble_anomalies = features[features['ensemble_is_anomaly'] == 1]
    return models, scaler, ensemble_anomalies, features

# Build traffic anomaly detector
print("\nBuilding traffic anomaly detection models...")
anomaly_models, anomaly_scaler, anomalies, features_with_anomalies = build_traffic_anomaly_detector(ts_15min)

# Show the detected anomalies
print("\nTop 10 detected traffic anomalies:")
print(anomalies[['initiated_time', 'transaction_count']].sort_values('transaction_count', ascending=False).head(10))

# 3.2 Toll Skipping Detection
print("\n3.2 Toll Skipping Detection")

def detect_potential_toll_skipping(df):
    """Detect potential toll skipping patterns."""
    # Print columns for debugging
    print("Available columns in dataframe:")
    print(df.columns.tolist())
    
    # Check if the vehicle registration column exists
    vehicle_col = 'vehicle_regn_number'
    
    if vehicle_col not in df.columns:
        # Try to find alternatives
        possible_columns = [col for col in df.columns if any(term in col.lower() for term in 
                           ['vehicle', 'regn', 'registration', 'reg', 'tag', 'transponder', 'id'])]
        
        if possible_columns:
            vehicle_col = possible_columns[0]
            print(f"Using alternative column: '{vehicle_col}'")
        else:
            print("No suitable vehicle identifier found. Creating synthetic ID.")
            df['synthetic_vehicle_id'] = range(1, len(df) + 1)
            vehicle_col = 'synthetic_vehicle_id'
    
    print(f"Using '{vehicle_col}' as vehicle identifier")
    
    # Group by vehicle identifier
    print(f"Grouping by {vehicle_col}...")
    vehicle_trips = df.groupby(vehicle_col).agg({
        'initiated_time': ['min', 'max', 'count'],
        'merchant_name': 'nunique',
        'txn_amount': 'sum'
    })
    
    vehicle_trips.columns = ['first_seen', 'last_seen', 'total_trips', 'unique_plazas', 'total_amount']
    
    # Calculate time difference between first and last trip
    vehicle_trips['time_span_hours'] = (vehicle_trips['last_seen'] - vehicle_trips['first_seen']).dt.total_seconds() / 3600
    
    # Calculate average transactions per hour active
    vehicle_trips['trips_per_hour'] = vehicle_trips['total_trips'] / vehicle_trips['time_span_hours'].clip(lower=1)
    
    # Calculate average amount per trip
    vehicle_trips['avg_amount_per_trip'] = vehicle_trips['total_amount'] / vehicle_trips['total_trips']
    
    # Calculate ratio of unique plazas to total trips
    vehicle_trips['plaza_to_trip_ratio'] = vehicle_trips['unique_plazas'] / vehicle_trips['total_trips']
    
    # Filter for vehicles with multiple trips
    multi_trip_vehicles = vehicle_trips[vehicle_trips['total_trips'] > 1].copy()
    
    models = {}
    
    if len(multi_trip_vehicles) > 0:
        print(f"Found {len(multi_trip_vehicles)} vehicles with multiple trips")
        
        # Features for clustering
        features = multi_trip_vehicles[['total_trips', 'unique_plazas', 'trips_per_hour', 
                                       'avg_amount_per_trip', 'plaza_to_trip_ratio']]
        
        # Scale features
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(features)
        
        # Try multiple clustering algorithms
        
        # 1. K-Means clustering
        print("\nPerforming K-Means clustering...")
        # Determine optimal number of clusters
        inertia = []
        k_range = range(1, min(11, len(features_scaled)))
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(features_scaled)
            inertia.append(kmeans.inertia_)
        
        # Plot elbow curve
        plt.figure(figsize=(10, 6))
        plt.plot(k_range, inertia, 'o-')
        plt.xlabel('Number of Clusters')
        plt.ylabel('Inertia')
        plt.title('Elbow Method for Optimal k')
        plt.grid(True)
        plt.savefig('images/kmeans_elbow_curve.png')
        plt.close()
        
        # Choose optimal k (simplified method)
        optimal_k = 3
        if len(inertia) > 2:
            for i in range(1, len(inertia)-1):
                if (inertia[i-1] - inertia[i]) > 3 * (inertia[i] - inertia[i+1]):
                    optimal_k = i+1
                    break
        
        print(f"Optimal number of clusters determined: {optimal_k}")
        
        # Perform KMeans clustering
        kmeans = KMeans(n_clusters=optimal_k, random_state=42)
        multi_trip_vehicles['kmeans_cluster'] = kmeans.fit_predict(features_scaled)
        
        # Store model
        models['kmeans'] = kmeans
        
        # 2. DBSCAN clustering
        print("Performing DBSCAN clustering...")
        dbscan = DBSCAN(eps=0.5, min_samples=5)
        multi_trip_vehicles['dbscan_cluster'] = dbscan.fit_predict(features_scaled)
        
        # Store model
        models['dbscan'] = dbscan
        
        # Analyze KMeans clusters - FIXED: use count() instead of vehicle_col
        kmeans_cluster_stats = multi_trip_vehicles.groupby('kmeans_cluster').agg({
            'total_trips': 'mean',
            'unique_plazas': 'mean',
            'trips_per_hour': 'mean',
            'avg_amount_per_trip': 'mean',
            'plaza_to_trip_ratio': 'mean',
            'total_amount': 'count'  # Just count the rows instead of using the vehicle column
        })
        
        # Rename the count column to 'vehicle_count'
        kmeans_cluster_stats = kmeans_cluster_stats.rename(columns={'total_amount': 'vehicle_count'})
        
        print("\nK-Means Cluster statistics:")
        print(kmeans_cluster_stats)
        
        # Identify potential toll skipping clusters
        toll_skip_score = kmeans_cluster_stats['plaza_to_trip_ratio'] 
        potential_skip_cluster = toll_skip_score.idxmin()
        
        print(f"\nPotential toll skipping K-Means cluster identified: Cluster {potential_skip_cluster}")
        print(f"This cluster has {kmeans_cluster_stats.loc[potential_skip_cluster, 'vehicle_count']} vehicles")
        
        # Get vehicles in the potential toll skipping cluster from KMeans
        kmeans_skip_vehicles = multi_trip_vehicles[multi_trip_vehicles['kmeans_cluster'] == potential_skip_cluster]
        
        # Analyze DBSCAN outliers (cluster -1)
        if -1 in multi_trip_vehicles['dbscan_cluster'].unique():
            dbscan_outliers = multi_trip_vehicles[multi_trip_vehicles['dbscan_cluster'] == -1]
            print(f"\nDBSCAN identified {len(dbscan_outliers)} potential outlier vehicles")
            
            # Combine results from both methods
            combined_suspects = pd.Index(list(set(kmeans_skip_vehicles.index) | set(dbscan_outliers.index)))
            print(f"\nCombined unique suspect vehicles from both methods: {len(combined_suspects)}")
            
            # Create a combined anomaly score
            multi_trip_vehicles['combined_score'] = 0
            multi_trip_vehicles.loc[multi_trip_vehicles['kmeans_cluster'] == potential_skip_cluster, 'combined_score'] += 1
            multi_trip_vehicles.loc[multi_trip_vehicles['dbscan_cluster'] == -1, 'combined_score'] += 1
            
            # Get the most suspicious vehicles (detected by both methods)
            high_confidence_suspects = multi_trip_vehicles[multi_trip_vehicles['combined_score'] > 1]
            print(f"High confidence suspects (detected by both methods): {len(high_confidence_suspects)}")
            
            return high_confidence_suspects, models, scaler
        else:
            print("DBSCAN did not identify any outliers")
            return kmeans_skip_vehicles, models, scaler
    else:
        print("Not enough multi-trip vehicles to perform clustering")
        return None, None, None

# Detect potential toll skipping
print("\nDetecting potential toll skipping patterns...")
potential_skip_vehicles, skip_models, skip_scaler = detect_potential_toll_skipping(df)

if potential_skip_vehicles is not None and len(potential_skip_vehicles) > 0:
    print("\nSample of potential toll skipping vehicles:")
    print(potential_skip_vehicles.head())
    
    # Create a vehicle behavior classifier
    print("\n3.3 Building supervised vehicle behavior classifier")
    
    # Use our detected vehicles to create a synthetic labeled dataset
    print("Creating synthetic labeled dataset for demonstration...")
    
    # Extract features from all multi-trip vehicles - use the index directly
    all_vehicles = potential_skip_vehicles.copy()
    
    # Create more "normal" vehicles by slightly modifying the suspicious ones
    normal_vehicles = potential_skip_vehicles.copy()
    # Make them less suspicious by increasing plaza_to_trip_ratio
    normal_vehicles['plaza_to_trip_ratio'] = normal_vehicles['plaza_to_trip_ratio'] * 1.5
    normal_vehicles.index = [f"normal_{idx}" for idx in normal_vehicles.index]
    
    all_vehicles = pd.concat([all_vehicles, normal_vehicles])
    
    # Create labels (1 for suspicious, 0 for normal)
    all_vehicles['label'] = 0
    all_vehicles.loc[potential_skip_vehicles.index, 'label'] = 1
    
    # Extract features
    X = all_vehicles[['total_trips', 'unique_plazas', 'trips_per_hour', 
                     'avg_amount_per_trip', 'plaza_to_trip_ratio']]
    y = all_vehicles['label']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Train a RandomForest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)
    
    # Evaluate the classifier
    y_pred = rf_classifier.predict(X_test)
    
    print("\nVehicle behavior classifier metrics:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_classifier.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nFeature importance for detecting suspicious vehicles:")
    print(feature_importance)
    
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['Feature'], feature_importance['Importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importance for Suspicious Vehicle Detection')
    plt.tight_layout()
    plt.savefig('images/feature_importance.png')
    plt.close()
    
    # Save the classifier
    vehicle_classifier = rf_classifier
else:
    print("No potential toll skipping vehicles detected. Skipping classifier creation.")

# 3.4 Enhanced Traffic Pattern Analysis
print("\n3.4 Enhanced Traffic Pattern Analysis")

def analyze_traffic_patterns_with_anomalies(features):
    """Analyze traffic patterns incorporating anomaly information."""
    # Group by hour of day
    hourly_patterns = features.groupby('hour').agg({
        'transaction_count': 'mean',
        'ensemble_is_anomaly': 'mean'  # Gives the proportion of anomalies by hour
    }).reset_index()
    
    # Group by day of week
    daily_patterns = features.groupby('day_of_week').agg({
        'transaction_count': 'mean',
        'ensemble_is_anomaly': 'mean'  # Gives the proportion of anomalies by day
    }).reset_index()
    
    # Visualize hourly patterns
    plt.figure(figsize=(18, 10))
    
    plt.subplot(2, 2, 1)
    plt.bar(hourly_patterns['hour'], hourly_patterns['transaction_count'])
    plt.title('Average Transactions by Hour of Day')
    plt.xlabel('Hour of Day')
    plt.ylabel('Average Transaction Count')
    plt.xticks(range(0, 24, 2))
    plt.grid(True, alpha=0.3)
    
    plt.subplot(2, 2, 2)
    plt.bar(hourly_patterns['hour'], hourly_patterns['ensemble_is_anomaly'] * 100)
    plt.title('Percentage of Anomalies by Hour of Day')
    plt.xlabel('Hour of Day')
    plt.ylabel('Anomaly Percentage')
    plt.xticks(range(0, 24, 2))
    plt.grid(True, alpha=0.3)
    
    plt.subplot(2, 2, 3)
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    plt.bar(days, daily_patterns['transaction_count'])
    plt.title('Average Transactions by Day of Week')
    plt.xlabel('Day of Week')
    plt.ylabel('Average Transaction Count')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    
    plt.subplot(2, 2, 4)
    plt.bar(days, daily_patterns['ensemble_is_anomaly'] * 100)
    plt.title('Percentage of Anomalies by Day of Week')
    plt.xlabel('Day of Week')
    plt.ylabel('Anomaly Percentage')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('images/traffic_pattern_analysis.png')
    plt.close()
    
    # Find the most anomalous hours and days
    most_anomalous_hours = hourly_patterns.sort_values('ensemble_is_anomaly', ascending=False)[['hour', 'ensemble_is_anomaly']].head(3)
    most_anomalous_days = daily_patterns.sort_values('ensemble_is_anomaly', ascending=False)[['day_of_week', 'ensemble_is_anomaly']].head(3)
    
    print("\nMost anomalous hours of the day:")
    for _, row in most_anomalous_hours.iterrows():
        print(f"Hour {int(row['hour'])}: {row['ensemble_is_anomaly']*100:.2f}% anomalies")
    
    print("\nMost anomalous days of the week:")
    day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
    for _, row in most_anomalous_days.iterrows():
        day = day_names[row['day_of_week']]
        print(f"{day}: {row['ensemble_is_anomaly']*100:.2f}% anomalies")
    
    return hourly_patterns, daily_patterns

# Analyze traffic patterns including anomaly information
if 'features_with_anomalies' in locals():
    print("\nAnalyzing traffic patterns with anomaly information...")
    hourly_patterns, daily_patterns = analyze_traffic_patterns_with_anomalies(features_with_anomalies)

# 4. SAVE MODELS AND RESULTS
print("\n4. SAVING MODELS AND RESULTS")
print("-" * 50)

# Save traffic prediction model
if 'traffic_model' in locals():
    joblib.dump(traffic_model, 'models/traffic_prediction_model.pkl')
    print("Traffic prediction model saved as 'traffic_prediction_model.pkl'")

# Save anomaly detection models
if 'anomaly_models' in locals():
    joblib.dump((anomaly_models, anomaly_scaler), 'models/traffic_anomaly_models.pkl')
    print("Traffic anomaly detection models saved as 'traffic_anomaly_models.pkl'")

# Save vehicle class prediction model if available
if 'vc_model' in locals():
    joblib.dump(vc_model, 'models/vehicle_class_prediction_model.pkl')
    print("Vehicle class prediction model saved as 'vehicle_class_prediction_model.pkl'")

# Save toll skipping detection models if available
if 'skip_models' in locals() and skip_models is not None:
    joblib.dump((skip_models, skip_scaler), 'models/toll_skipping_models.pkl')
    print("Toll skipping detection models saved as 'toll_skipping_models.pkl'")

# Save vehicle behavior classifier if available
if 'vehicle_classifier' in locals():
    joblib.dump(vehicle_classifier, 'models/vehicle_behavior_classifier.pkl')
    print("Vehicle behavior classifier saved as 'vehicle_behavior_classifier.pkl'")

# Save the anomalies dataframe
if 'anomalies' in locals() and len(anomalies) > 0:
    anomalies.to_csv('data/detected_anomalies.csv', index=False)
    print("Detected anomalies saved as 'detected_anomalies.csv'")

# Save potential toll skipping vehicles
if 'potential_skip_vehicles' in locals() and potential_skip_vehicles is not None:
    potential_skip_vehicles.to_csv('data/potential_toll_skipping_vehicles.csv')
    print("Potential toll skipping vehicles saved as 'potential_toll_skipping_vehicles.csv'")

# Save traffic pattern analysis results
if 'hourly_patterns' in locals() and 'daily_patterns' in locals():
    hourly_patterns.to_csv('data/hourly_traffic_patterns.csv', index=False)
    daily_patterns.to_csv('data/daily_traffic_patterns.csv', index=False)
    print("Traffic pattern analysis results saved as CSV files")

print("\nPhase 2: Enhanced Analytics Models complete!")


3. ANOMALY DETECTION SYSTEM
--------------------------------------------------

3.1 Traffic Pattern Anomaly Detection

Building traffic anomaly detection models...

Training Isolation Forest model...


DEBUG: locator: <matplotlib.ticker.AutoLocator object at 0x0000019F850A1190>
DEBUG: colorbar update normal <matplotlib.colors.Normalize object at 0x0000019F843844A0> <matplotlib.colors.Normalize object at 0x0000019F843844A0>
DEBUG: locator: <matplotlib.ticker.AutoLocator object at 0x0000019F923C3110>
DEBUG: colorbar update normal <matplotlib.colors.Normalize object at 0x0000019F843844A0> <matplotlib.colors.Normalize object at 0x0000019F843844A0>
DEBUG: locator: <matplotlib.ticker.AutoLocator object at 0x0000019F923C3110>
DEBUG: locator: <matplotlib.ticker.AutoLocator object at 0x0000019F856E0290>
DEBUG: locator: <matplotlib.ticker.AutoLocator object at 0x0000019F8DDD3110>


Training DBSCAN model...
Isolation Forest: Detected 5 anomalies out of 96 intervals
DBSCAN: Detected 0 anomalies out of 96 intervals
Ensemble: Detected 5 anomalies out of 96 intervals

Top 10 detected traffic anomalies:
        initiated_time  transaction_count
73 2024-03-19 18:15:00               5598
61 2024-03-19 15:15:00               5582
87 2024-03-19 21:45:00               3514
1  2024-03-19 00:15:00               2518
0  2024-03-19 00:00:00               2469

3.2 Toll Skipping Detection

Detecting potential toll skipping patterns...
Available columns in dataframe:
['SlNo.', 'merchant_name', 'direction', 'lane', 'tag_id', 'vehicle_regn_number', 'txn_amount', 'initiated_time', 'inn_rr_time_sec', 'vehicle_class_code', 'vehicle_comvehicle', 'geocode', 'merchant_sub_type', 'city', 'state', 'hour', 'day_of_week', 'txn_amount_scaled', 'vehicle_class_code_enc', 'merchant_name_enc', 'minute', 'time_of_day', 'latitude', 'longitude', 'merchant_name_encoded', 'direction_encoded', 'lane_en

INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.



Most anomalous hours of the day:
Hour 0: 50.00% anomalies
Hour 21: 25.00% anomalies
Hour 18: 25.00% anomalies

Most anomalous days of the week:
Tuesday: 5.21% anomalies

4. SAVING MODELS AND RESULTS
--------------------------------------------------
Traffic prediction model saved as 'traffic_prediction_model.pkl'
Traffic anomaly detection models saved as 'traffic_anomaly_models.pkl'
Vehicle class prediction model saved as 'vehicle_class_prediction_model.pkl'
Toll skipping detection models saved as 'toll_skipping_models.pkl'
Vehicle behavior classifier saved as 'vehicle_behavior_classifier.pkl'
Detected anomalies saved as 'detected_anomalies.csv'
Potential toll skipping vehicles saved as 'potential_toll_skipping_vehicles.csv'
Traffic pattern analysis results saved as CSV files

Phase 2: Enhanced Analytics Models complete!
