In [1]:
from glob import glob
from tqdm import tqdm
import networkx as nx
from natsort import natsorted
import json
import uuid
import os
import pickle
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import re
from collections import defaultdict, namedtuple

from graphutil import *

# Root output directory: each processed CSV gets its own sub-folder here holding the
# pickled graph (<eventname>.pkl) and its edge metadata (<eventname>_edge_metadata.json).
GRAPH_PATH = "Graphs"

In [None]:
# Catalogue of predefined attack sequence patterns, keyed on exact operation names.
# Each entry carries the operation names to look for, a display colour, a human-readable
# description, a minimum length, an ordering flag and the expected per-operation results.
SequencePattern = namedtuple(
    'SequencePattern',
    'name operations color description min_length strict_order results',
)


ATTACK_SEQUENCE_PATTERNS = [
    SequencePattern(**spec)
    for spec in (
        # --- Process creation ---
        dict(
            name="Process_Creation",
            operations=["Process Create"],
            color="#FF3333",  # Red
            description="Process creation operations",
            min_length=1,
            strict_order=False,
            results=["SUCCESS"],  # Expected results for this pattern
        ),
        # --- File manipulation ---
        dict(
            name="File_Creation_Write",
            operations=["CreateFile", "WriteFile"],
            color="#FF6600",  # Orange
            description="File creation followed by write operations",
            min_length=2,
            strict_order=True,
            results=["SUCCESS"] * 2,
        ),
        dict(
            name="File_Creation_Metadata_Write",
            operations=["CreateFile", "QueryBasicInformationFile", "WriteFile", "CloseFile"],
            color="#FAD000",  # Yellow
            description="File creation with metadata query and write",
            min_length=1,
            strict_order=False,
            results=["SUCCESS"] * 4,
        ),
        # --- Registry manipulation ---
        dict(
            name="Registry_Creation_Modification",
            operations=["RegCreateKey", "RegSetValue", "RegQueryKey", "RegCloseKey"],
            color="#34CCFF",  # Cyan
            description="Registry key Create and modification",
            min_length=2,
            strict_order=False,
            results=["SUCCESS"] * 4,
        ),
        dict(
            name="Registry_Modification",
            operations=["RegOpenKey", "RegSetValue", "RegQueryKey", "RegCloseKey"],
            color="#33CC33",  # Green
            description="Registry key open and modification",
            min_length=2,
            strict_order=False,
            results=["SUCCESS"] * 4,
        ),
        # --- Network communication ---
        dict(
            name="TCP_Communication",
            operations=["TCP Connect", "TCP Send", "TCP Receive", "TCP Disconnect"],
            color="#0066FF",  # Blue
            description="TCP network communication sequence",
            min_length=2,
            strict_order=True,
            results=["SUCCESS"] * 4,
        ),
    )
]

In [None]:
# Build one provenance graph per raw-event CSV and store it under GRAPH_PATH/<eventname>/.
import csv  # only this cell needs csv; everything else used here comes from the import cell

# Timestamp layouts tried in order; rows matching neither fall back to their row index.
_TIMESTAMP_FORMATS = ("%m/%d/%Y %I:%M:%S %p", "%Y-%m-%d %H:%M:%S")
# Compiled once here instead of once per CSV row.
_IPV4_RE = re.compile(r'(?:\b)(?:\d{1,3}\.){3}\d{1,3}(?:\b)')
_CHILD_PID_RE = re.compile(r'PID\s*:\s*(\d+)')
# Fields copied from each edge's metadata into the JSON side-car file.
_EDGE_METADATA_FIELDS = (
    'line_id', 'operation', 'timestamp', 'technique',
    'src_process', 'src_pid', 'dst_resource', 'dst_type',
)


def _to_int(value, default=0):
    """Return int(value), or `default` when value is missing or not an integer literal."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _uuid_for(text):
    """Return the deterministic (uuid5, DNS namespace) identifier string for `text`."""
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, text))


def _parse_timestamp(text, fallback):
    """Parse `text` with the known layouts into epoch seconds; return `fallback` if none fits."""
    for fmt in _TIMESTAMP_FORMATS:
        try:
            # Naive datetimes are interpreted in the local timezone by .timestamp().
            return int(datetime.strptime(text, fmt).timestamp())
        except (TypeError, ValueError, OverflowError, OSError):
            continue
    return fallback


def _build_dst_node(row, event_class, operation, src_uuid, src_pid):
    """Build the destination node dict for one CSV row.

    Returns None when the event class is not file/registry/network/process, or when a
    network row contains no IPv4 address in any of the inspected fields.
    """
    if 'file' in event_class:  # also covers "file system"
        path = row.get('Path') or row.get('Content') or ''
        # NOTE(review): os.path.basename only splits on backslashes when running on Windows;
        # the process branch below normalises separators first - confirm whether file nodes should too.
        return {
            "UUID": _uuid_for(path) if path else src_uuid,
            "Name": os.path.basename(path),
            "Type": "File",
        }

    if 'registry' in event_class:
        key = row.get('Path') or row.get('Detail') or ''
        return {"UUID": _uuid_for(key) if key else src_uuid, "Key": key, "Type": "Registry"}

    if 'network' in event_class:
        # The first IPv4 address found in any of these fields becomes the destination.
        for field in ('Detail', 'Path', 'Content', 'srcEvent'):
            text = row.get(field, '')
            if not text:
                continue
            found = _IPV4_RE.search(text)
            if found:
                addr = found.group(0)
                return {"UUID": _uuid_for(addr), "Dstaddress": addr, "Type": "Network"}
        return None

    if 'process' in event_class:
        path = row.get('Path') or row.get('Image Path') or ''
        dst_name = os.path.basename(path.replace('\\', '/').strip()).lstrip('/\\').strip()

        # "Process Create" rows carry the child's PID in the detail text; every other
        # process operation (and a create row without that text) uses the row's own PID.
        dst_pid = src_pid
        if 'create' in operation.lower():
            found = _CHILD_PID_RE.search(row.get('Detail') or row.get('Details') or '')
            if found:
                dst_pid = int(found.group(1))
        return {
            "UUID": _uuid_for(f"{dst_name}_{dst_pid}"),
            "Name": dst_name,
            "Pid": dst_pid,
            "Type": "Process",
        }

    return None


def _row_to_event(row, row_index):
    """Convert one CSV row into the campaign-event dict passed to generate_query_graph."""
    src_name = row.get('Process Name') or row.get('Image Path') or row.get('User') or 'unknown'
    src_pid = _to_int(row.get('PID') or row.get('Parent PID'))
    src_uuid = _uuid_for(f"{src_name}_{src_pid}")

    event_class = (row.get('Event Class') or '').lower()
    operation = row.get('Operation') or row.get('srcEvent') or 'unknown'
    # Include the line id when the CSV provides one (under any of its known spellings).
    line_id = row.get('lineid') or row.get('LineID') or row.get('line_id') or ''

    return {
        "srcNode": {"UUID": src_uuid, "Name": src_name, "Pid": src_pid, "Type": "Process"},
        "dstNode": _build_dst_node(row, event_class, operation, src_uuid, src_pid),
        "relation": operation,
        "timestamp": _parse_timestamp(row.get('Date & Time') or row.get('Date') or '', row_index),
        "line_id": str(line_id) if line_id != '' else None,
        "label": f"CSV_{operation}",
    }


# Look for CSV files that match the attached dataset pattern.
csv_files = natsorted(glob(os.path.join(os.getcwd(), 'Caldera_Ability_Statistics/*_raw_events_with_lineid.csv')))
# Bind `files` unconditionally: it used to be assigned only when the glob matched, so a fresh
# kernel with no CSVs hit a NameError (or silently reused a stale list from an earlier run).
files = csv_files

print(f"Found {len(files)} file(s) to process")
print("Building provenance graphs...")

for f in tqdm(files, desc="Processing files"):
    with open(f, newline='') as csvfile:
        rows = list(csv.DictReader(csvfile))

    campaign_events = [
        _row_to_event(row, i)
        for i, row in enumerate(tqdm(rows, desc="Converting CSV", leave=False))
    ]

    eventname = os.path.basename(f).replace('_raw_events_with_lineid.csv', '')
    save_path = os.path.join(GRAPH_PATH, eventname)
    os.makedirs(save_path, exist_ok=True)

    G, edge_metadata = generate_query_graph(campaign_events)

    # Save graph and metadata
    with open(f"{save_path}/{eventname}.pkl", "wb") as fp:
        pickle.dump(G, fp)

    # Tuple keys are not JSON-serializable, so each edge is keyed as "src→dst#key".
    # Built before the file is opened so a missing field cannot leave a truncated JSON file behind.
    json_metadata = {
        f"{src}→{dst}#{key}": {field: data[field] for field in _EDGE_METADATA_FIELDS}
        for (src, dst, key), data in edge_metadata.items()
    }
    with open(f"{save_path}/{eventname}_edge_metadata.json", "w") as fp:
        json.dump(json_metadata, fp, indent=2)

In [3]:
# Collect every Graphs/<eventname>/<eventname>.pkl written by the graph-building cell.
graph_base_dir = GRAPH_PATH
available_graphs = []

if os.path.isdir(graph_base_dir):
    # Sorted so the processing order is reproducible; os.listdir order is arbitrary.
    for subdir in sorted(os.listdir(graph_base_dir)):
        subdir_path = os.path.join(graph_base_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue
        # Look for {eventname}.pkl in each subdirectory
        pkl_file = os.path.join(subdir_path, f"{subdir}.pkl")
        if os.path.isfile(pkl_file):
            available_graphs.append(pkl_file)
            print(f"   Found graph: {pkl_file}")
else:
    # Previously os.listdir raised FileNotFoundError when no graph had been built yet.
    print(f"   Graph directory not found: {graph_base_dir}")

   Found graph: Graphs/a1ee301b0508747b468d578a14e5c1a5/a1ee301b0508747b468d578a14e5c1a5.pkl
   Found graph: Graphs/55678719-e76e-4df9-92aa-10655bbd1cf4/55678719-e76e-4df9-92aa-10655bbd1cf4.pkl
   Found graph: Graphs/b8453a5fe06b24aea12b27592d5c3d3a/b8453a5fe06b24aea12b27592d5c3d3a.pkl
   Found graph: Graphs/dedfa0a54c9c13ce5714a0dc2e1f5d1a/dedfa0a54c9c13ce5714a0dc2e1f5d1a.pkl
   Found graph: Graphs/d0ca00832890baa1d42322cf70fcab1a/d0ca00832890baa1d42322cf70fcab1a.pkl
   Found graph: Graphs/e2af3c3ab1b0f659c874b8af58c49759/e2af3c3ab1b0f659c874b8af58c49759.pkl
   Found graph: Graphs/8fe59e288f10a486dc8b44bc872019ff/8fe59e288f10a486dc8b44bc872019ff.pkl
   Found graph: Graphs/76b6066fe170d38215251102e42be973/76b6066fe170d38215251102e42be973.pkl
   Found graph: Graphs/8467c994685ccf178db166964bd80fab/8467c994685ccf178db166964bd80fab.pkl
   Found graph: Graphs/dce51e632abdfe5392c7c1f942ac9273/dce51e632abdfe5392c7c1f942ac9273.pkl
   Found graph: Graphs/71b3d2945679566b9d94d8cb11df4b70/71b3d2

In [None]:

# Create interactive entry visualizations for ALL available graphs.
# For every graph found by the discovery cell: load the pickle and its edge metadata, load
# the optional prediction CSV, render the HTML view next to the graph, and collect a summary.


def _parse_edge_metadata(edge_metadata_json):
    """Rebuild the {(src, dst, key): data} mapping from the "src→dst#key" ids stored in JSON.

    The id is split at the first "→" and the last "#", so a destination that itself contains
    "#" is kept instead of being dropped. Ids lacking either separator are skipped.
    """
    parsed = {}
    for edge_id, data in edge_metadata_json.items():
        src, arrow, rest = edge_id.partition('→')
        dst, hash_mark, key_text = rest.rpartition('#')
        if not (arrow and hash_mark):
            continue
        key = int(key_text) if key_text.isdigit() else key_text
        parsed[(src, dst, key)] = data
    return parsed


def _load_malicious_specs(predictions_file):
    """Read a prediction CSV and return a list of (line_id, side) tuples; side is 'src', 'dst' or 'both'.

    Two layouts are understood:
      * LineID / Type / Score - rows with Score >= 1.0 are malicious; Type selects the side.
      * line_id plus any of prediction / is_malicious / label - flagged rows map to 'both'.
    """
    predictions_df = pd.read_csv(predictions_file)
    columns = set(predictions_df.columns)
    specs = []

    if {'LineID', 'Type', 'Score'} <= columns:
        malicious_rows = predictions_df[predictions_df['Score'] >= 1.0]
        for line_id, prediction_type in zip(malicious_rows['LineID'], malicious_rows['Type']):
            # str() so a missing (NaN) Type falls through to "both" instead of raising.
            side = str(prediction_type).lower()
            specs.append((str(line_id), side if side in ('src', 'dst') else "both"))
    elif 'line_id' in columns:
        # Build the mask only from columns that exist; DataFrame.get(col, scalar) returns the
        # scalar for an absent column, which cannot be combined into a row mask.
        mask = pd.Series(False, index=predictions_df.index)
        if 'prediction' in columns:
            mask |= predictions_df['prediction'].eq(1)
        if 'is_malicious' in columns:
            mask |= predictions_df['is_malicious'].eq(True)
        if 'label' in columns:
            mask |= predictions_df['label'].astype(str).str.lower().eq('malicious')
        # Iterating the column (not iterrows) keeps integer ids from being upcast to float.
        specs.extend((str(line_id), "both") for line_id in predictions_df.loc[mask, 'line_id'])

    return specs


print(f"🔍 Creating interactive time visualizations for {len(available_graphs)} graphs")
print("=" * 80)

successful_visualizations = []
failed_visualizations = []

for graph_file in available_graphs:
    graph_dir = os.path.dirname(graph_file)
    graph_basename = os.path.basename(graph_dir)
    edge_metadata_file = os.path.join(graph_dir, f"{graph_basename}_edge_metadata.json")
    predictions_file = f"Caldera_Ability_Predictions/{graph_basename}.csv"

    print(f"\n📊 Processing: {graph_basename}")

    # Check if required files exist
    if not os.path.exists(edge_metadata_file):
        print(f"❌ Missing edge metadata file - skipping")
        failed_visualizations.append((graph_basename, "Missing metadata"))
        continue

    # Best-effort batch: any failure on one graph is recorded and the loop moves on.
    try:
        # NOTE(security): pickle.load can execute arbitrary code - only load graphs this
        # notebook produced itself.
        with open(graph_file, "rb") as fp:
            G = pickle.load(fp)
        with open(edge_metadata_file, "r") as fp:
            edge_metadata = _parse_edge_metadata(json.load(fp))

        # Check if graph has timestamps
        min_ts, max_ts = get_graph_time_range(G, edge_metadata)
        if min_ts is None:
            print(f"   ⚠️  No timestamps found - skipping time visualization")
            failed_visualizations.append((graph_basename, "No timestamps"))
            continue

        total_edges = G.number_of_edges()
        if total_edges == 0:
            print(f"   ⚠️  No edges found - skipping entry visualization")
            # Was appended to an undefined `failed_entry_visualizations`; the resulting
            # NameError was swallowed below and reported as the failure reason.
            failed_visualizations.append((graph_basename, "No edges"))
            continue

        # Load predictions if available
        has_predictions = os.path.exists(predictions_file)
        MALICIOUS_SPECS = _load_malicious_specs(predictions_file) if has_predictions else []

        print(f"   Entry Count: {G.number_of_nodes()} nodes, {total_edges} edges")
        print(f"   Time range: {datetime.fromtimestamp(min_ts).strftime('%H:%M:%S')} to {datetime.fromtimestamp(max_ts).strftime('%H:%M:%S')}")
        print(f"   Provided Malicious specs: {len(MALICIOUS_SPECS)}")

        # Create interactive visualization (entry view). A time-based variant,
        # create_interactive_time_visualization, was previously disabled in this cell.
        output_entry_path = os.path.join(graph_dir, f"interactive_entry_{graph_basename}.html")
        interactive_path = create_interactive_entry_visualization(
            G, edge_metadata, MALICIOUS_SPECS or None,
            output_path=output_entry_path
        )

        # Time span covered by the graph's edges, in seconds.
        seconds = max_ts - min_ts

        successful_visualizations.append({
            'graph': graph_basename,
            'path': interactive_path,
            'nodes': G.number_of_nodes(),
            'edges': total_edges,
            'duration_seconds': seconds,
            'malicious_specs': len(MALICIOUS_SPECS),
            'has_predictions': has_predictions
        })

        print(f"   ✅ Interactive visualization created: {os.path.basename(interactive_path)}")
        print(f"   📊 Duration: {seconds:.1f} seconds, REAPr: {'Yes' if MALICIOUS_SPECS else 'No'}")

    except Exception as e:
        print(f"   ❌ Error: {e}")
        failed_visualizations.append((graph_basename, str(e)))
        continue

print(f"\nSuccessful: {len(successful_visualizations)} | Failed: {len(failed_visualizations)}")

if successful_visualizations:
    print(f"\nSUCCESSFUL VISUALIZATIONS:")
    print(f"{'Graph':<35} {'Nodes':<8} {'Edges':<8} {'Duration':<12} {'REAPr':<8} {'File'}")
    print("-" * 90)

    for viz in successful_visualizations:
        duration_str = f"{viz['duration_seconds']:.1f}s"
        reapr_str = "Yes" if viz['malicious_specs'] > 0 else "No"
        filename = os.path.basename(viz['path'])
        print(f"{viz['graph']:<35} {viz['nodes']:<8} {viz['edges']:<8} {duration_str:<12} {reapr_str:<8} {filename}")


if failed_visualizations:
    print(f"\n❌ FAILED VISUALIZATIONS:")
    for graph, reason in failed_visualizations:
        print(f"   {graph}: {reason}")

In [5]:

# Sequence Grouping
# Load one graph, detect the predefined attack sequence patterns in it, print the first few
# detected groups, render the grouped visualization and report how many edges were grouped.
import traceback  # used only by this cell's error report

target_file = "cfb61005899996469ae3023796792ca5"  # Change this to your target file
graph_file = os.path.join(GRAPH_PATH, target_file, f"{target_file}.pkl")

graph_dir = os.path.dirname(graph_file)
graph_basename = os.path.basename(graph_dir)
edge_metadata_file = os.path.join(graph_dir, f"{graph_basename}_edge_metadata.json")
predictions_file = f"Caldera_Ability_Predictions/{graph_basename}.csv"

if os.path.exists(graph_file) and os.path.exists(edge_metadata_file):

    # Load the graph and metadata.
    # NOTE(security): pickle.load can execute arbitrary code - only load graphs this
    # notebook produced itself.
    with open(graph_file, "rb") as fp:
        G = pickle.load(fp)
    with open(edge_metadata_file, "r") as fp:
        edge_metadata_json = json.load(fp)

    # Convert the JSON "src→dst#key" ids back to (src, dst, key) tuples. Splitting at the
    # first "→" and the last "#" keeps edges whose destination itself contains "#".
    edge_metadata = {}
    for edge_id, data in edge_metadata_json.items():
        src, arrow, rest = edge_id.partition('→')
        dst, hash_mark, key_text = rest.rpartition('#')
        if not (arrow and hash_mark):
            continue  # id is not in "src→dst#key" form
        key = int(key_text) if key_text.isdigit() else key_text
        edge_metadata[(src, dst, key)] = data

    # Load predictions if available: rows with Score >= 1.0 are treated as malicious and
    # Type selects which side of the edge ('src' / 'dst', anything else -> 'both').
    MALICIOUS_SPECS = []
    if os.path.exists(predictions_file):
        predictions_df = pd.read_csv(predictions_file)

        if {'LineID', 'Type', 'Score'} <= set(predictions_df.columns):
            malicious_rows = predictions_df[predictions_df['Score'] >= 1.0]
            for line_id, prediction_type in zip(malicious_rows['LineID'], malicious_rows['Type']):
                # str() so a missing (NaN) Type falls through to "both" instead of raising.
                side = str(prediction_type).lower()
                MALICIOUS_SPECS.append((str(line_id), side if side in ('src', 'dst') else "both"))

    sequence_groups = find_sequence_groups(
        G, edge_metadata, ATTACK_SEQUENCE_PATTERNS,
        max_gap=1, target_file=target_file
    )

    # Show details of detected groups
    if sequence_groups:
        print(f"\n📋 DETECTED SEQUENCE GROUPS:")
        for group_id, group_info in list(sequence_groups.items())[:5]:  # Show first 5 groups
            pattern = group_info['pattern']
            matched_ops = group_info['matched_operations']
            print(f"\n   Group {group_id}: {pattern.name}")
            print(f"      Description: {pattern.description}")
            print(f"      Confidence: {group_info['confidence']:.2f}")
            print(f"      Edges: {len(group_info['edges'])}")
            print(f"      Operations: {', '.join(matched_ops[:3])}{'...' if len(matched_ops) > 3 else ''}")
            print(f"      Expected pattern: {', '.join(pattern.operations)}")
            print(f"      Strict order: {pattern.strict_order}")
            print(f"      Completeness: {group_info.get('completeness', 0):.2f}")
            print(f"      Color: {pattern.color}")

    try:
        output_path = os.path.join(graph_dir, f"sequence_grouped_{graph_basename}.html")
        viz_path, groups = create_sequence_grouped_visualization(
            G, edge_metadata, MALICIOUS_SPECS, ATTACK_SEQUENCE_PATTERNS, output_path, sequence_groups
        )

        # Summary of what was found
        if groups:
            total_grouped_edges = sum(len(g['edges']) for g in groups.values())
            total_edges = G.number_of_edges()
            # Guard the ratios so an edgeless graph reports 0% instead of raising ZeroDivisionError.
            coverage = (total_grouped_edges / total_edges) * 100 if total_edges else 0.0

            print(f"\n📊 Sequence Coverage:")
            print(f"   Grouped edges: {total_grouped_edges}/{total_edges} ({coverage:.1f}%)")
            print(f"   Ungrouped edges: {total_edges - total_grouped_edges} (shown in gray)")

            # Show pattern distribution (edges per pattern, largest first)
            pattern_counts = defaultdict(int)
            for group_info in groups.values():
                pattern_counts[group_info['pattern'].name] += len(group_info['edges'])

            print(f"\n🎯 Pattern Distribution:")
            for pattern_name, edge_count in sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True):
                percentage = (edge_count / total_edges) * 100 if total_edges else 0.0
                print(f"   {pattern_name}: {edge_count} edges ({percentage:.1f}%)")

    except Exception as e:
        print(f"❌ Error creating sequence visualization: {e}")
        traceback.print_exc()

else:
    print(f"❌ Required files not found for {target_file}")
    print(f"   Graph file: {graph_file}")
    print(f"   Metadata file: {edge_metadata_file}")


🔍 Result matching: DISABLED
📋 Found 196 unique target pairs (src, dst)
📋 Sample operations from different targets:
   Target ('cmd.exe_7380', 'cmd.exe_7380'): 1 operations
     1. Process Start
   Target ('cmd.exe_7380', 'HKLM\\System\\CurrentControlSet\\Control\\Session Manager_Registry'): 9 operations
     2. RegOpenKey
     3. RegOpenKey
     4. RegCloseKey
     ... and 6 more
   Target ('cmd.exe_7380', 'HKLM\\System\\CurrentControlSet\\Control\\Session Manager\\RaiseExceptionOnPossibleDeadlock_Registry'): 1 operations
     5. RegQueryValue

📍 Analyzing target pair ('cmd.exe_7380', 'cmd.exe_7380') with 1 edges
   🔍 Testing pattern: Process_Creation (min_length: 1)
   🔍 Testing pattern: File_Creation_Write (min_length: 2)
   🔍 Testing pattern: File_Creation_Metadata_Write (min_length: 1)
   🔍 Testing pattern: Registry_Creation_Modification (min_length: 2)
   🔍 Testing pattern: Registry_Modification (min_length: 2)
   🔍 Testing pattern: TCP_Communication (min_length: 2)

📍 Analyzing t