# In [None]:
import os
import pandas as pd
import warnings
from sklearn.exceptions import InconsistentVersionWarning

# Your imports
from llm2 import LLMAnalyzer
from process4 import GraphProcessor
from graph3 import GraphAnalyzer
from dscreate import DatasetCreation
from deviation import DeviationAnalyzerFrequency
from openai import AzureOpenAI

# Configuration
# Setting pandas' chained-assignment mode to None turns off its
# chained-assignment (SettingWithCopy) warning for the whole process.
pd.options.mode.chained_assignment = None
# Suppress scikit-learn's InconsistentVersionWarning globally
# (presumably emitted when loading models pickled by another version — TODO confirm).
warnings.filterwarnings("ignore", category=InconsistentVersionWarning)

# Dataset configuration
# These values are handed to DatasetCreation in main(); `dataset` is also the
# directory that traverse_directory() scans for CSV files.
dataset = 'theia'
dataset_file = f'{dataset}_test_logs.pkl'
# Time range bounds (format "YYYY-MM-DD HH:MM").
lower_bound = "2018-04-10 12:00" 
upper_bound = "2018-04-12 23:55"
# NOTE(review): units are not visible here — presumably minutes; confirm
# against DatasetCreation.
interval = 30
sliding_window = 15

# Azure OpenAI credentials.
# Fill these in to override the environment, or leave them empty to use
# whatever is already exported in the process environment.
api_key = ""
endpoint = ""
deployment_name = ""

# Export only non-empty values. Assigning the empty placeholders
# unconditionally would overwrite credentials that are already present in the
# environment with "", which makes run_llm_analysis() skip the LLM step.
for _env_name, _env_value in (
    ('AZURE_OPENAI_API_KEY', api_key),
    ('AZURE_OPENAI_ENDPOINT', endpoint),
    ('AZURE_OPENAI_DEPLOYMENT_NAME', deployment_name),
):
    if _env_value:
        os.environ[_env_name] = _env_value

# Directory setup
# Output locations, resolved relative to the current working directory.
ANOMALY_DIR = 'anomaly'
PREPROCESSED_DIR = 'preprocessed'

def create_directories():
    """Create the output directories if they don't exist.

    Prints a confirmation line for each directory that was missing.

    Raises:
        FileExistsError: If one of the paths exists but is not a directory
            (for example a regular file named 'anomaly'), since results
            could not be written beneath it later.
    """
    for directory in (ANOMALY_DIR, PREPROCESSED_DIR):
        if os.path.isdir(directory):
            continue
        # exist_ok=True tolerates another process creating the directory
        # between the check above and this call, while still raising when the
        # path is occupied by something that is not a directory.
        os.makedirs(directory, exist_ok=True)
        print(f"✓ Created directory: {directory}")

def perform_preprocessing(dataset_name, csv_file, deviation_file, output_file, graph_analyzer):
    """
    Run anomaly detection, graph analysis and result saving for one CSV file

    Args:
        dataset_name: Name of the dataset (forwarded to DeviationAnalyzerFrequency)
        csv_file: Path to the input CSV file
        deviation_file: Path where the anomalous rows are written
        output_file: Path where the graph-filtered rows are written
        graph_analyzer: GraphAnalyzer instance whose analyze() filters the anomalies

    Returns:
        dict: Keys 'file', 'status', 'anomalies_found', 'processed_rows' and
        'error'. 'status' ends up as one of 'success', 'no_anomalies',
        'filtered_out', 'empty_file' or 'error'.
    """
    import traceback

    report = dict(
        file=os.path.basename(csv_file),
        status='failed',
        anomalies_found=0,
        processed_rows=0,
        error=None,
    )

    def conclude(status):
        # Record the final status and hand the report back in one step.
        report['status'] = status
        return report

    try:
        # Guard: the input has to exist.
        if not os.path.exists(csv_file):
            report['error'] = f"File not found: {csv_file}"
            return conclude('error')

        # Guard: a zero-byte file has nothing to analyze.
        if os.path.getsize(csv_file) == 0:
            print("  ⚠️  File is empty, skipping")
            return conclude('empty_file')

        # Stage 1: anomaly detection
        print("  [1/3] Running anomaly detection...")
        anomalies = DeviationAnalyzerFrequency(dataset_name, csv_file).analyze()
        if anomalies is None or anomalies.empty:
            print("  ⚠️  No anomalies detected")
            return conclude('no_anomalies')

        anomaly_count = len(anomalies)
        report['anomalies_found'] = anomaly_count
        print(f"  ✓ Found {anomaly_count} anomalous entries")

        # Persist the raw anomalies before any further filtering.
        anomalies.to_csv(deviation_file, index=False)
        print(f"  ✓ Saved anomalies to: {os.path.basename(deviation_file)}")

        # Stage 2: graph analysis
        print("  [2/3] Running graph analysis...")
        kept_rows = graph_analyzer.analyze(anomalies)
        if kept_rows is None or kept_rows.empty:
            print("  ⚠️  No data after graph analysis")
            return conclude('filtered_out')

        kept_count = len(kept_rows)
        report['processed_rows'] = kept_count
        print(f"  ✓ Processed {kept_count} rows")

        # Stage 3: save the filtered result
        print("  [3/3] Saving processed results...")
        kept_rows.to_csv(output_file, index=False)
        print(f"  ✓ Saved to: {os.path.basename(output_file)}")

        conclude('success')

    except Exception as exc:
        # Per-file boundary: report the failure and let the caller continue
        # with the remaining files.
        message = str(exc)
        print(f"  ❌ Error: {message}")
        report['error'] = message
        report['status'] = 'error'
        traceback.print_exc()

    return report

def traverse_directory(dataset_name):
    """
    Process all CSV files in the dataset directory

    Every file is run through perform_preprocessing() using one shared
    GraphAnalyzer instance, after which the analyzer is finalized and a
    summary is printed.

    Args:
        dataset_name: Name of the dataset directory

    Returns:
        tuple: (processing_stats, graph_analysis_results) where
        processing_stats is the list of per-file stats dicts and
        graph_analysis_results is the pair returned by
        GraphAnalyzer.finalize_analysis(), or (None, None) when the directory
        is missing, holds no CSV files, or finalization fails.
    """
    print("\n" + "="*60)
    print("STARTING ANOMALY DETECTION & PREPROCESSING")
    print("="*60)
    
    # Initialize graph analyzer once
    graph_analyzer = GraphAnalyzer()
    
    # Get all CSV files
    if not os.path.exists(dataset_name):
        print(f"⚠️  Dataset directory '{dataset_name}' not found")
        return [], (None, None)
        
    # os.listdir() returns entries in arbitrary order. The files are fed one
    # after another into the same graph_analyzer, so sort them to make every
    # run process them in the same, reproducible order.
    # NOTE(review): lexicographic order is assumed to be acceptable for the
    # file names DatasetCreation produces — confirm.
    csv_files = sorted(f for f in os.listdir(dataset_name) if f.endswith('.csv'))
    total_files = len(csv_files)
    
    if total_files == 0:
        print(f"⚠️  No CSV files found in '{dataset_name}' directory")
        return [], (None, None)
    
    print(f"\nFound {total_files} CSV files to process\n")
    
    # Track statistics
    all_stats = []
    
    # Process each file
    for idx, file in enumerate(csv_files, 1):
        print(f"\n{'='*60}")
        print(f"Processing [{idx}/{total_files}]: {file}")
        print(f"{'='*60}")
        
        file_path = os.path.join(dataset_name, file)
        deviation_file = os.path.join(ANOMALY_DIR, f'anomalous_{file}')
        result_file = os.path.join(PREPROCESSED_DIR, f'processed_{file}')
        
        stats = perform_preprocessing(
            dataset_name, 
            file_path, 
            deviation_file, 
            result_file, 
            graph_analyzer
        )
        all_stats.append(stats)
    
    # Finalize graph analysis
    print(f"\n{'='*60}")
    print("FINALIZING GRAPH ANALYSIS")
    print(f"{'='*60}")
    
    try:
        results, results2 = graph_analyzer.finalize_analysis()
        print("✓ Graph analysis finalized successfully")
    except Exception as e:
        # A failed finalization must not discard the per-file statistics,
        # so report it and fall back to empty results.
        print(f"❌ Error finalizing graph analysis: {str(e)}")
        results, results2 = None, None
        import traceback
        traceback.print_exc()
    
    # Print summary
    print_summary(all_stats)
    
    return all_stats, (results, results2)

def print_summary(stats_list):
    """Print a summary of the per-file processing results.

    Args:
        stats_list: List of stats dicts as returned by perform_preprocessing()
            (keys 'file', 'status', 'anomalies_found', 'processed_rows', 'error').
    """
    rule = '=' * 60
    statuses = [entry['status'] for entry in stats_list]

    # Number of files per outcome.
    n_success = statuses.count('success')
    n_no_anomalies = statuses.count('no_anomalies')
    n_filtered = statuses.count('filtered_out')
    n_errors = statuses.count('error')
    n_empty = statuses.count('empty_file')

    # Totals across all files.
    anomaly_total = sum(entry['anomalies_found'] for entry in stats_list)
    row_total = sum(entry['processed_rows'] for entry in stats_list)

    print(f"\n{rule}")
    print("PROCESSING SUMMARY")
    print(rule)

    print(f"\nTotal files: {len(stats_list)}")
    print(f"  ✓ Successfully processed: {n_success}")
    print(f"  ⚠️  No anomalies found: {n_no_anomalies}")
    print(f"  ⚠️  Filtered out: {n_filtered}")
    print(f"  ⚠️  Empty files: {n_empty}")
    print(f"  ❌ Errors: {n_errors}")

    print("\nAnomaly Detection:")
    print(f"  Total anomalies found: {anomaly_total:,}")
    print(f"  Total rows processed: {row_total:,}")

    # List every failed file together with its error message.
    failed_entries = [entry for entry in stats_list if entry['status'] == 'error']
    if failed_entries:
        print("\nFiles with errors:")
        for entry in failed_entries:
            print(f"  - {entry['file']}: {entry['error']}")

    print(f"{rule}\n")

def validate_preprocessed_files():
    """Validate preprocessed files before LLM analysis

    A CSV file in PREPROCESSED_DIR counts as valid when it is non-empty on
    disk, parses with pandas, contains at least one row and has all required
    columns. Files without a '.csv' suffix are ignored.

    Returns:
        list: Paths of the valid files, sorted by file name. Empty when
        PREPROCESSED_DIR does not exist.
    """
    print("\nValidating preprocessed files...")
    
    if not os.path.exists(PREPROCESSED_DIR):
        return []
    
    # Columns every preprocessed file must contain (adjust based on your
    # needs). Built once here rather than on every loop iteration.
    required_cols = ['event', 'processName', 'objectData', 'processUUID']
    
    valid_files = []
    invalid_files = []
    
    # os.listdir() returns entries in arbitrary order; sorting keeps both the
    # printed report and the returned list reproducible between runs.
    for file in sorted(os.listdir(PREPROCESSED_DIR)):
        if not file.endswith('.csv'):
            continue
            
        file_path = os.path.join(PREPROCESSED_DIR, file)
        
        try:
            # Check file size
            if os.path.getsize(file_path) == 0:
                invalid_files.append((file, "Empty file"))
                continue
            
            # Try to read the file
            df = pd.read_csv(file_path)
            
            if df.empty:
                invalid_files.append((file, "No data"))
                continue
            
            missing_cols = [col for col in required_cols if col not in df.columns]
            
            if missing_cols:
                invalid_files.append((file, f"Missing columns: {missing_cols}"))
                continue
            
            valid_files.append(file_path)
            print(f"  ✓ {file}: {len(df)} rows")
            
        except Exception as e:
            # An unreadable or unparsable file is recorded as invalid instead
            # of aborting validation of the remaining files.
            invalid_files.append((file, str(e)))
    
    if invalid_files:
        print(f"\n⚠️  Invalid files ({len(invalid_files)}):")
        for file, reason in invalid_files:
            print(f"  - {file}: {reason}")
    
    print(f"\n✓ Found {len(valid_files)} valid preprocessed files")
    return valid_files

def run_llm_analysis():
    """Run the LLM analysis stage over the preprocessed CSV files.

    The stage is skipped (with an explanatory message) when no valid
    preprocessed file exists or when any of the Azure OpenAI environment
    variables is unset or empty. Errors are reported on stdout together with
    a traceback instead of being propagated to the caller.
    """
    import traceback

    print(f"\n{'=' * 60}")
    print("STARTING LLM ANALYSIS")
    print('=' * 60)

    try:
        # Nothing to analyze without at least one valid preprocessed file.
        usable_files = validate_preprocessed_files()
        if not usable_files:
            print("\n⚠️  No valid preprocessed files found. Skipping LLM analysis.")
            return

        # Collect the Azure OpenAI settings from the environment.
        env_names = (
            "AZURE_OPENAI_API_KEY",
            "AZURE_OPENAI_ENDPOINT",
            "AZURE_OPENAI_DEPLOYMENT_NAME",
        )
        settings = {name: os.getenv(name) for name in env_names}

        if not all(settings.values()):
            print("\n⚠️  Azure OpenAI credentials not set. Skipping LLM analysis.")
            print("Required environment variables:")
            for name, value in settings.items():
                print(f"  - {name}: {'✓' if value else '✗'}")
            return

        print("\nInitializing Azure OpenAI client...")
        client = AzureOpenAI(
            api_key=settings["AZURE_OPENAI_API_KEY"],
            api_version="2024-02-01",
            azure_endpoint=settings["AZURE_OPENAI_ENDPOINT"],
        )

        print("Initializing LLM analyzer...")
        llm_analyzer = LLMAnalyzer(client, settings["AZURE_OPENAI_DEPLOYMENT_NAME"])
        processor = GraphProcessor(llm_analyzer, graph_file='provenance_graph.gpickle')

        print(f"\nAnalyzing {len(usable_files)} files in '{PREPROCESSED_DIR}'...")
        processor.analyze_directory(PREPROCESSED_DIR)

        print("Finalizing analysis...")
        processor.finalize()

        print("\n✓ LLM analysis completed successfully")

    except Exception as exc:
        if isinstance(exc, AttributeError):
            # Attribute errors get extra troubleshooting hints.
            print(f"\n❌ Attribute Error: {str(exc)}")
            print("\nThis error typically means:")
            print("  1. A graph object is None when it should have data")
            print("  2. The preprocessed files may not have the expected structure")
            print("  3. The graph analysis returned None")
            print("\nDebugging info:")
        else:
            print(f"\n❌ Error during LLM analysis: {str(exc)}")
            print("\nFull traceback:")
        traceback.print_exc()

def main():
    """Run the whole pipeline: setup, dataset creation, preprocessing, LLM analysis.

    A user interrupt and any other exception are reported on stdout (the
    latter with a traceback) rather than propagated.
    """
    rule = "=" * 60

    def banner(title, lead=""):
        # Three-line section header; `lead` is prepended to the first rule.
        print(lead + rule)
        print(title)
        print(rule)

    try:
        # Step 1: output directories
        banner("INITIALIZING")
        create_directories()

        # Step 2: build the dataset from the module-level configuration
        banner("CREATING DATASET", lead="\n")
        creator = DatasetCreation(
            dataset,
            dataset_file,
            lower_bound,
            upper_bound,
            interval,
            sliding_window,
        )
        creator.create_dataset()
        print("✓ Dataset created successfully")

        # Step 3: anomaly detection + graph analysis for every CSV file
        _stats, _graph_results = traverse_directory(dataset)

        # Step 4: LLM analysis (it checks on its own whether there is input)
        run_llm_analysis()

        # Closing report
        banner("✅ ALL PROCESSING COMPLETED", lead="\n")
        print("Results saved in:")
        print(f"  - Anomalies: {ANOMALY_DIR}/")
        print(f"  - Preprocessed: {PREPROCESSED_DIR}/")
        print(rule + "\n")

    except Exception as exc:
        import traceback
        print(f"\n❌ Critical error: {str(exc)}")
        traceback.print_exc()
    except KeyboardInterrupt:
        print("\n\n⚠️  Process interrupted by user")

# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()