In [1]:
import pandas as pd
import numpy as np
from typing import List, Optional, Dict, Tuple
from collections import deque, defaultdict
from joblib import Parallel, delayed
import logging
import time
from tqdm import tqdm
import math

# Default feature groups used by AnomalyScorer when the caller supplies none.
# Names are matched against DataFrame columns verbatim, and list order is
# preserved in the per-feature score frames built from them.

# Scored with directional=True by AnomalyScorer.compute_scores.
DEFAULT_VOLUME_FEATURES = [
    'Total Fwd Packets',
    'Total Backward Packets',
    'Total Length of Fwd Packets',
    'Total Length of Bwd Packets',
    'Fwd Packet Length Max',
    'Fwd Packet Length Min',
    'Fwd Packet Length Mean',
    'Fwd Packet Length Std',
    'Bwd Packet Length Max',
    'Bwd Packet Length Min',
    'Bwd Packet Length Mean',
    'Bwd Packet Length Std',
    'Fwd PSH Flags',
    'Bwd PSH Flags',
    'Fwd URG Flags',
    'Bwd URG Flags',
    'Fwd Header Length',
    'Bwd Header Length',
    'Min Packet Length',
    'Max Packet Length',
    'Packet Length Mean',
    'Packet Length Std',
    'Packet Length Variance',
    'Down/Up Ratio',
    'Average Packet Size',
    'Avg Fwd Segment Size',
    'Avg Bwd Segment Size',
    'Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk',
    'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk',
    'Subflow Fwd Packets',
    'Subflow Fwd Bytes',
    'Subflow Bwd Packets',
    'Subflow Bwd Bytes',
    'Init_Win_bytes_forward',
    'Init_Win_bytes_backward',
    'act_data_pkt_fwd',
    'min_seg_size_forward',
    'Flow Bytes/s',
    'Flow Packets/s',
    'Fwd Packets/s',
    'Bwd Packets/s',
    'Fwd Avg Bulk Rate',
    'Bwd Avg Bulk Rate',
]

# Scored with directional=False by AnomalyScorer.compute_scores.
DEFAULT_TEMPORAL_FEATURES = [
    'Flow Duration',
    'Flow IAT Mean',
    'Flow IAT Std',
    'Flow IAT Max',
    'Flow IAT Min',
    'Fwd IAT Total',
    'Fwd IAT Mean',
    'Fwd IAT Std',
    'Fwd IAT Max',
    'Fwd IAT Min',
    'Bwd IAT Total',
    'Bwd IAT Mean',
    'Bwd IAT Std',
    'Bwd IAT Max',
    'Bwd IAT Min',
    'Active Mean',
    'Active Std',
    'Active Max',
    'Active Min',
    'Idle Mean',
    'Idle Std',
    'Idle Max',
    'Idle Min',
    'FIN Flag Count',
    'SYN Flag Count',
    'RST Flag Count',
    'PSH Flag Count',
    'ACK Flag Count',
    'URG Flag Count',
    'CWE Flag Count',
    'ECE Flag Count',
]

# Values of these columns are stringified and fed to EntropyTracker.
DEFAULT_ENTROPY_FEATURES = [
    'Source IP',
    'Source Port',
    'Destination IP',
    'Destination Port',
    'Protocol',
]

class RollingStats:
    """Sliding window of recent observations with summary statistics.

    At most ``window`` values are retained; older values are discarded from
    the left as new ones arrive.
    """

    def __init__(self, window: int):
        self.window = window
        self.values = deque()

    def update(self, value: float):
        """Append ``value`` and drop the oldest entry if the window overflows."""
        buffer = self.values
        buffer.append(value)
        if len(buffer) > self.window:
            buffer.popleft()

    def compute(self) -> Tuple[float, float, float, float, float, float]:
        """Return ``(mean, std, median, mad, q1, q3)`` over the current window.

        ``std`` and ``mad`` are replaced by ``1e-9`` when they evaluate falsy
        (i.e. zero) so callers can divide by them. An empty window yields
        ``(0, 1e-9, 0, 1e-9, 0, 0)``.
        """
        samples = np.array(self.values)
        if len(samples) == 0:
            return 0, 1e-9, 0, 1e-9, 0, 0

        centre = np.median(samples)

        spread = np.std(samples)
        if not spread:
            spread = 1e-9

        # Median absolute deviation around the window median.
        deviation = np.median(np.abs(samples - centre))
        if not deviation:
            deviation = 1e-9

        lower_quartile = np.percentile(samples, 25)
        upper_quartile = np.percentile(samples, 75)
        return np.mean(samples), spread, centre, deviation, lower_quartile, upper_quartile

class EntropyTracker:
    """Sliding-window tracker of the normalized Shannon entropy of a value stream.

    The last ``window`` values are kept in ``history``; ``value_counts`` and
    ``total`` are maintained incrementally so the entropy can be computed
    without rescanning the window.
    """

    def __init__(self, window: int):
        self.window = window
        self.history = deque(maxlen=window)
        self.value_counts = defaultdict(int)
        self.total = 0

    def update(self, value: str):
        """Record ``value``, evicting the oldest entry once the window is full.

        A zero-length window retains nothing, so the call is a no-op in that
        case (previously it raised ``IndexError`` on the empty history).
        """
        if self.history.maxlen == 0:
            return

        if len(self.history) == self.window:
            # The deque drops history[0] on the append below; mirror that in
            # the running counts first.
            evicted = self.history[0]
            self.value_counts[evicted] -= 1
            if self.value_counts[evicted] <= 0:
                del self.value_counts[evicted]
            self.total -= 1

        self.history.append(value)
        self.value_counts[value] += 1
        self.total += 1

    def compute_entropy(self) -> float:
        """Return the window's Shannon entropy divided by ``log2(#distinct values)``.

        Returns ``0.0`` when the window is empty or holds a single distinct
        value (the normalizer would be zero).
        """
        distinct = len(self.value_counts)
        if self.total == 0 or distinct <= 1:
            return 0.0

        entropy = 0.0
        for count in self.value_counts.values():
            probability = count / self.total
            # math.log2 is more accurate than math.log(x, 2).
            entropy -= probability * math.log2(probability)

        return entropy / math.log2(distinct)

class AnomalyScorer:
    """Row-by-row anomaly scorer for flow-feature DataFrames.

    Rows are processed in order. Each numeric feature owns a ``RollingStats``
    window and each entropy feature an ``EntropyTracker`` window. Both are
    stored on the instance, so successive ``compute_scores`` calls continue
    from the state left by earlier calls.

    Args:
        window: Number of most recent rows each tracker retains.
        volume_features: Columns scored directionally. ``None`` selects
            ``DEFAULT_VOLUME_FEATURES``; an empty list disables the category.
        temporal_features: Columns scored by absolute deviation. ``None``
            selects ``DEFAULT_TEMPORAL_FEATURES``.
        entropy_features: Columns scored by windowed value entropy. ``None``
            selects ``DEFAULT_ENTROPY_FEATURES``.
        keep_details: When True, ``compute_scores`` also adds one
            ``AS_<feature>`` column per scored feature.
        log_level: Level applied to the shared ``'AnomalyScorer'`` logger.
        n_jobs: Stored on the instance only; nothing in this class reads it,
            so scoring always runs sequentially.
        normalize: When False (default) a category score is
            ``0.7 * sum + 0.3 * max`` of its per-feature scores, which grows
            with the number of features. When True the sum is replaced by the
            mean, keeping category scores within [0, 1].
    """

    def __init__(self,
                 window: int = 100,
                 volume_features: Optional[List[str]] = None,
                 temporal_features: Optional[List[str]] = None,
                 entropy_features: Optional[List[str]] = None,
                 keep_details: bool = False,
                 log_level: str = 'INFO',
                 n_jobs: int = -1,
                 normalize: bool = False):
        self.window = window
        # Test against None rather than truthiness so an explicit empty list
        # disables a category instead of silently selecting the defaults.
        # Copying keeps later edits to these attributes from leaking into the
        # caller's list or the module-level default lists.
        self.volume_features = list(
            DEFAULT_VOLUME_FEATURES if volume_features is None else volume_features)
        self.temporal_features = list(
            DEFAULT_TEMPORAL_FEATURES if temporal_features is None else temporal_features)
        self.entropy_features = list(
            DEFAULT_ENTROPY_FEATURES if entropy_features is None else entropy_features)
        self.keep_details = keep_details
        self.n_jobs = n_jobs
        self.normalize = normalize
        self._setup_logging(log_level)
        self.stats = {}
        self.entropy_trackers = {}
        self.top_contributors = {}

    def _setup_logging(self, log_level: str):
        """Configure the shared 'AnomalyScorer' logger, adding one stream handler at most."""
        self.logger = logging.getLogger('AnomalyScorer')
        self.logger.setLevel(log_level)
        # The logger is process-wide; only attach a handler the first time so
        # repeated instantiation does not duplicate every message.
        if not self.logger.handlers:
            ch = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)

    def _init_stats(self, features: List[str]):
        """Create a RollingStats window for every feature that lacks one."""
        for f in features:
            if f not in self.stats:
                self.stats[f] = RollingStats(self.window)

    def _init_entropy_trackers(self, features: List[str]):
        """Create an EntropyTracker window for every feature that lacks one."""
        for f in features:
            if f not in self.entropy_trackers:
                self.entropy_trackers[f] = EntropyTracker(self.window)

    def _present_features(self, wanted: List[str], df: pd.DataFrame, label: str) -> List[str]:
        """Return the entries of ``wanted`` that are columns of ``df``, in order.

        Logs a warning when features were requested but none matched, since
        the category then has no per-feature scores at all.
        """
        present = [f for f in wanted if f in df.columns]
        if wanted and not present:
            self.logger.warning(
                f"None of the {len(wanted)} {label} features are present in the input "
                f"columns; no per-feature scores will be computed for this category")
        return present

    @staticmethod
    def _score_value(val, mean, std, med, mad, q1, q3, directional: bool) -> float:
        """Blend z-score, MAD-score and an IQR outlier flag into a score in [0, 1].

        With ``directional=True`` the z/MAD terms map deviations to [0, 1]
        with 0.5 at the centre, so only values above the centre score high.
        Otherwise the absolute deviation is used, capped at 3 sigma / 3 MAD.
        """
        if directional:
            z = max(0, min(1, (val - mean) / (3 * std) + 0.5))
            mad_score = max(0, min(1, (val - med) / (3 * mad) + 0.5))
        else:
            z = min(1, abs((val - mean) / (3 * std)))
            mad_score = min(1, abs((val - med) / (3 * mad)))

        iqr = q3 - q1
        # Tukey fence: 1.0 when the value lies more than 1.5 IQR outside the quartiles.
        iqr_outlier = float(val < q1 - 1.5 * iqr or val > q3 + 1.5 * iqr)
        score = 0.3 * z + 0.3 * mad_score + 0.4 * iqr_outlier
        return min(1, max(0, score))

    @staticmethod
    def _combine_scores(scores: Dict[str, pd.Series], normalize: bool = False) -> pd.Series:
        """Collapse per-feature scores into one category score per row.

        Returns ``0.7 * aggregate + 0.3 * max`` across features, where the
        aggregate is the row sum, or the row mean when ``normalize`` is True.
        """
        frame = pd.DataFrame(scores)
        aggregate = frame.mean(axis=1) if normalize else frame.sum(axis=1)
        return 0.7 * aggregate + 0.3 * frame.max(axis=1)

    def _compute_feature_scores(self, df: pd.DataFrame, features: List[str], directional: bool = False) -> Dict[str, pd.Series]:
        """Score each numeric feature row by row against its rolling window.

        Each value is pushed into the feature's window before the statistics
        are taken, so the window always includes the value being scored.

        Returns:
            Mapping of feature name to a Series of scores indexed like ``df``.
        """
        self._init_stats(features)
        scores = {}

        # Features keep independent state, so walking one column at a time
        # gives the same result as walking row by row. Extracting each column
        # once as an array avoids building a row Series per cell via df.iloc.
        for f in tqdm(features, desc="Scoring features"):
            tracker = self.stats[f]
            column_scores = []
            for val in df[f].to_numpy():
                tracker.update(val)
                mean, std, med, mad, q1, q3 = tracker.compute()
                column_scores.append(
                    self._score_value(val, mean, std, med, mad, q1, q3, directional))
            scores[f] = pd.Series(column_scores, index=df.index)

        return scores

    def _compute_entropy_scores(self, df: pd.DataFrame, features: List[str]) -> Dict[str, pd.Series]:
        """Score each entropy feature as ``1 - normalized window entropy`` per row.

        Returns:
            Mapping of feature name to a Series of scores indexed like ``df``.
        """
        self._init_entropy_trackers(features)
        scores = {}

        # Column-wise traversal for the same reason as in _compute_feature_scores.
        for f in tqdm(features, desc="Computing entropy scores"):
            tracker = self.entropy_trackers[f]
            column_scores = []
            for raw in df[f].to_numpy():
                tracker.update(str(raw))
                entropy = tracker.compute_entropy()

                # Low entropy is more anomalous (common values dominate)
                entropy_score = 1.0 - entropy
                column_scores.append(min(1, max(0, entropy_score)))
            scores[f] = pd.Series(column_scores, index=df.index)

        return scores

    def compute_scores(self, df: pd.DataFrame) -> pd.DataFrame:
        """Score every row of ``df`` and return a copy with the score columns added.

        Adds ``VOLUME_ANOMALY_SCORE``, ``TEMPORAL_ANOMALY_SCORE`` and
        ``ENTROPY_ANOMALY_SCORE`` (plus ``AS_<feature>`` columns when
        ``keep_details`` is set) and refreshes ``self.top_contributors``.
        ``df`` itself is not modified. Configured features that are missing
        from ``df`` are skipped.
        """
        start_time = time.time()
        self.logger.info("Starting anomaly score computation...")

        df_proc = df.copy()
        vol_feats = self._present_features(self.volume_features, df_proc, 'volume')
        temp_feats = self._present_features(self.temporal_features, df_proc, 'temporal')
        entropy_feats = self._present_features(self.entropy_features, df_proc, 'entropy')

        # Compute all scores
        vol_scores = self._compute_feature_scores(df_proc, vol_feats, directional=True)
        temp_scores = self._compute_feature_scores(df_proc, temp_feats, directional=False)
        entropy_scores = self._compute_entropy_scores(df_proc, entropy_feats)

        if self.keep_details:
            # Store individual feature scores if requested
            for f, s in {**vol_scores, **temp_scores, **entropy_scores}.items():
                df_proc[f'AS_{f}'] = s

        # Combine per-feature scores: 0.7 * sum (or mean when self.normalize)
        # plus 0.3 * max. Without normalize the result is not bounded by 1.
        df_proc['VOLUME_ANOMALY_SCORE'] = self._combine_scores(vol_scores, self.normalize)
        df_proc['TEMPORAL_ANOMALY_SCORE'] = self._combine_scores(temp_scores, self.normalize)
        df_proc['ENTROPY_ANOMALY_SCORE'] = self._combine_scores(entropy_scores, self.normalize)

        # Store top contributors for each category
        self._store_top_contributors(vol_scores, 'VOLUME')
        self._store_top_contributors(temp_scores, 'TEMPORAL')
        self._store_top_contributors(entropy_scores, 'ENTROPY')

        elapsed = time.time() - start_time
        self.logger.info(f"Anomaly scoring completed in {elapsed:.2f} seconds")

        return df_proc

    def _store_top_contributors(self, scores: Dict[str, pd.Series], label: str):
        """Record, per row, the names of the three highest-scoring features.

        The result is stored under ``self.top_contributors['<label>_TOP3']``.
        """
        score_df = pd.DataFrame(scores)
        self.top_contributors[f'{label}_TOP3'] = score_df.apply(
            lambda row: row.nlargest(3).index.tolist(), axis=1)

In [None]:
# Input flow CSV; edit this path to point at your data.
csv_path = "input_data11.csv"
# Read the file with low_memory=False, then keep only the first 10,000 rows.
df = pd.read_csv(csv_path, low_memory=False).head(10000)

In [None]:
# df1
# import ipaddress

# # Convert IPv4 address to int
# df1[' Source IP'] = df[' Source IP'].apply(lambda x: int(ipaddress.IPv4Address(x)))
# df1[' Destination IP'] = df[' Destination IP'].apply(lambda x: int(ipaddress.IPv4Address(x)))


In [None]:
# --- load your CIC Flow-Meter CSV --------------------------------
# csv_path = "datasets/UDPLag.csv/UDPLag.csv"       # edit path
# df   = pd.read_csv(csv_path)
# df=df[:4000]
# Strip surrounding whitespace from the headers so they match the feature
# names AnomalyScorer looks up.
df.columns = df.columns.str.strip()
# Treat infinities as missing so they are imputed together with real NaNs.
df = df.replace([np.inf, -np.inf], np.nan)

# Replace NaNs in numeric columns with that column's mean. numeric_only=True is
# required: on pandas >= 2.0 DataFrame.mean() raises TypeError when the frame
# contains string columns. Non-numeric columns are left untouched, and a numeric
# column that is entirely NaN stays NaN (its mean is NaN).
df = df.fillna(df.mean(numeric_only=True))
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# # flow_features= df.drop(columns=["VOLUME_ANOMALY_SCORE","TEMPORAL_ANOMALY_SCORE","ENTROPY_ANOMALY_SCORE"])
# flow_features.replace([np.inf, -np.inf], np.nan, inplace=True)

# # Replace NaNs with column mean or 0
# flow_features.fillna(flow_features.mean(), inplace=True)

# # Fit the scaler on training data and transform both train and test
# data = scaler.fit_transform(flow_features)
# df2 = pd.DataFrame(data,columns=flow_features.columns)


In [None]:

# Build the scorer with a 1000-row rolling window. n_jobs is only stored by
# AnomalyScorer and never read, so scoring runs sequentially whatever its value.
scorer = AnomalyScorer(window=1000, keep_details=False, log_level='INFO', n_jobs=1)

# Load your data
# df_raw = pd.read_csv('your_data.csv')

# Score the cleaned frame. compute_scores works on a copy and returns it with
# the VOLUME/TEMPORAL/ENTROPY *_ANOMALY_SCORE columns added.
result_df = scorer.compute_scores(df)
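# Access results (compute_scores adds the three columns below; it does not
# produce NORM_* or OVERALL_SCORE columns)
# volume_score = result_df['VOLUME_ANOMALY_SCORE']
# temporal_score = result_df['TEMPORAL_ANOMALY_SCORE']
# entropy_score = result_df['ENTROPY_ANOMALY_SCORE']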

# # Get top contributing features
# print(scorer.top_contributors['VOLUME_TOP3'].tail())  # Recent volume contributors
# print(scorer.top_contributors['TEMPORAL_TOP3'].tail()) # Recent temporal contributors

# # Access top contributors
# print("Top volume contributors:", scorer.top_contributors['VOLUME_TOP3'].head())
# print("Top temporal contributors:", scorer.top_contributors['TEMPORAL_TOP3'].head())

In [None]:
# Write the scored rows to CSV, leaving out the DataFrame index.
result_df.to_csv(path_or_buf="test_df11.csv", index=False)