# Insider Threat Detection - Data Exploration

This notebook provides an interactive exploration of the insider threat detection dataset and demonstrates the key features of the system.

## Setup and Imports

In [None]:
import sys
import os
from pathlib import Path

# Add project root to path so the `src.*` imports further down can be resolved.
# Path() is the kernel's current working directory, so project_root is the
# parent of wherever the notebook was launched from.
# NOTE(review): presumably the notebook sits one directory below the project
# root (e.g. notebooks/) — confirm, otherwise the `src` imports will fail.
project_root = Path().absolute().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import project modules
from src.data.loader import DataLoader
from src.data.preprocessor import DataPreprocessor
from src.data.feature_engineer import FeatureEngineer
from src.evaluation.visualizer import ModelVisualizer

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship 'seaborn'. Pick whichever this installation has so
# the setup cell does not raise OSError on an older Matplotlib.
SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(SEABORN_STYLE)
sns.set_palette("husl")

print("✅ Setup complete!")

## Data Loading and Basic Exploration

In [None]:
# The loader is created once here and reused by the summary and validation
# cells further down.
data_loader = DataLoader()

# Load and merge data. Any failure is reported rather than raised, and
# merged_df is left as None so the cells that follow can skip themselves.
try:
    merged_df = data_loader.load_and_merge_data()
    print(f"✅ Data loaded successfully: {len(merged_df)} records")
except Exception as load_error:
    for message in (
        f"❌ Error loading data: {load_error}",
        "Please ensure your data files are in the correct location.",
    ):
        print(message)
    merged_df = None

In [None]:
if merged_df is not None:
    # Headline facts as plain text.
    print("Dataset Overview:")
    print(f"Shape: {merged_df.shape}")
    print(f"Columns: {merged_df.columns.tolist()}")

    # Tabular views, each announced by a heading and rendered with the
    # notebook's rich display.
    overview_sections = (
        ("\nFirst few rows:", merged_df.head()),
        ("\nData types:", merged_df.dtypes),
        ("\nMissing values:", merged_df.isnull().sum()),
    )
    for heading, table in overview_sections:
        print(heading)
        display(table)

## Data Quality Analysis

In [None]:
if merged_df is not None:
    # Summary dictionary produced by the loader for the data it holds.
    summary = data_loader.get_data_summary()

    # Scalar entries first; the per-activity breakdown gets its own section.
    print("Data Summary:")
    for key, value in summary.items():
        if key == 'activity_types':
            continue
        print(f"  {key}: {value}")

    # Per-activity counts with their share of all records.
    print("\nActivity Types Distribution:")
    for activity, count in summary['activity_types'].items():
        share_pct = count / summary['total_records'] * 100
        print(f"  {activity}: {count} ({share_pct:.1f}%)")

In [None]:
if merged_df is not None:
    # Ask the loader to validate the data it loaded.
    validation_results = data_loader.validate_data_quality()

    if validation_results['is_valid']:
        validation_status = '✅ PASSED'
    else:
        validation_status = '❌ ISSUES FOUND'
    print(f"Data Quality Validation: {validation_status}")

    # List every reported issue, or state explicitly that there were none.
    found_issues = validation_results['issues']
    if not found_issues:
        print("No data quality issues detected.")
    else:
        print("\nIssues found:")
        for issue in found_issues:
            print(f"  - {issue}")

## Exploratory Data Analysis

In [None]:
if merged_df is not None:
    # Records per activity type, most frequent first.
    activity_counts = merged_df['activity_type'].value_counts()

    # Same counts shown twice: as shares (pie) and as absolute numbers (bars).
    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(10, 6))

    pie_ax.pie(activity_counts.values, labels=activity_counts.index, autopct='%1.1f%%')
    pie_ax.set_title('Activity Type Distribution')

    activity_counts.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title('Activity Type Counts')
    bar_ax.set_xlabel('Activity Type')
    bar_ax.set_ylabel('Count')
    plt.setp(bar_ax.get_xticklabels(), rotation=45)

    fig.tight_layout()
    plt.show()

In [None]:
if merged_df is not None:
    # Four-panel overview of how activity is spread over users and machines.
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    users_ax, pcs_ax, stats_ax, hist_ax = axes.ravel()

    # Panel 1: the ten users with the most records.
    top_users = merged_df['user'].value_counts().head(10)
    top_users.plot(kind='bar', ax=users_ax)
    users_ax.set_title('Top 10 Most Active Users')
    users_ax.set_xlabel('User')
    users_ax.set_ylabel('Activity Count')
    plt.setp(users_ax.get_xticklabels(), rotation=45)

    # Panel 2: the ten PCs with the most records.
    top_pcs = merged_df['pc'].value_counts().head(10)
    top_pcs.plot(kind='bar', ax=pcs_ax)
    pcs_ax.set_title('Top 10 Most Used PCs')
    pcs_ax.set_xlabel('PC')
    pcs_ax.set_ylabel('Usage Count')
    plt.setp(pcs_ax.get_xticklabels(), rotation=45)

    # Panel 3: headline sizes of the dataset.
    unique_counts = {
        'Users': merged_df['user'].nunique(),
        'PCs': merged_df['pc'].nunique(),
        'Activities': len(merged_df)
    }
    stats_ax.bar(unique_counts.keys(), unique_counts.values())
    stats_ax.set_title('Dataset Statistics')
    stats_ax.set_ylabel('Count')

    # Panel 4: how many records each user contributes.
    user_activity_counts = merged_df.groupby('user').size()
    hist_ax.hist(user_activity_counts, bins=20, alpha=0.7)
    hist_ax.set_title('Distribution of Activities per User')
    hist_ax.set_xlabel('Number of Activities')
    hist_ax.set_ylabel('Number of Users')

    fig.tight_layout()
    plt.show()

## Feature Engineering Preview

In [None]:
# Bind df_processed up front so the `df_processed is not None` guards in the
# following cells evaluate cleanly (instead of raising NameError) when
# merged_df is None and the branch below never runs.
df_processed = None

if merged_df is not None:
    # Initialize preprocessor and feature engineer; both are reused by the
    # advanced feature engineering cell.
    preprocessor = DataPreprocessor()
    feature_engineer = FeatureEngineer()

    # Create a copy for processing so merged_df stays available for the
    # before/after column comparison below.
    df_processed = merged_df.copy()

    try:
        # Basic preprocessing
        df_processed = preprocessor.clean_dates(df_processed)
        df_processed = preprocessor.create_time_features(df_processed)
        df_processed = preprocessor.handle_missing_values(df_processed)

        print("✅ Basic preprocessing completed")
        # Sorted (by string form) so the listing is stable from run to run.
        new_features = sorted(set(df_processed.columns) - set(merged_df.columns), key=str)
        print(f"New features added: {new_features}")

    except Exception as e:
        # Best-effort: report the failure and switch off the downstream cells.
        print(f"❌ Error in preprocessing: {e}")
        df_processed = None

In [None]:
if df_processed is not None:
    # Time-based analysis: six panels on a 2x3 grid.
    plt.figure(figsize=(15, 10))

    # Hourly activity distribution
    plt.subplot(2, 3, 1)
    hourly_activity = df_processed['hour'].value_counts().sort_index()
    hourly_activity.plot(kind='bar')
    plt.title('Activity Distribution by Hour')
    plt.xlabel('Hour of Day')
    plt.ylabel('Activity Count')

    # Day of week distribution
    plt.subplot(2, 3, 2)
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    # Reindex onto all seven days so a day without any activity appears as an
    # empty bar instead of shifting the later days under the wrong label.
    # NOTE(review): assumes day_of_week is encoded 0=Mon .. 6=Sun — confirm.
    daily_activity = (
        df_processed['day_of_week']
        .value_counts()
        .reindex(range(len(day_names)), fill_value=0)
    )
    plt.bar(range(len(day_names)), daily_activity.values)
    plt.xticks(range(len(day_names)), day_names)
    plt.title('Activity Distribution by Day of Week')
    plt.xlabel('Day of Week')
    plt.ylabel('Activity Count')

    # Weekend vs weekday
    plt.subplot(2, 3, 3)
    # Count each class explicitly, in the same order as the labels.
    # value_counts() orders by frequency, which swaps the labels whenever the
    # flagged class is the larger one and yields a single wedge (label-length
    # mismatch) when only one class is present.
    weekend_flag = df_processed['is_weekend']
    weekend_counts = [int((weekend_flag == 0).sum()), int((weekend_flag == 1).sum())]
    plt.pie(weekend_counts, labels=['Weekday', 'Weekend'], autopct='%1.1f%%')
    plt.title('Weekend vs Weekday Activity')

    # Off-hours activity (same explicit ordering as the weekend panel)
    plt.subplot(2, 3, 4)
    off_hours_flag = df_processed['is_off_hours']
    offhours_counts = [int((off_hours_flag == 0).sum()), int((off_hours_flag == 1).sum())]
    plt.pie(offhours_counts, labels=['Business Hours', 'Off Hours'], autopct='%1.1f%%')
    plt.title('Business Hours vs Off Hours')

    # Cyclical time features
    plt.subplot(2, 3, 5)
    plt.scatter(df_processed['hour_cos'], df_processed['hour_sin'], alpha=0.5)
    plt.title('Cyclical Hour Representation')
    plt.xlabel('Hour Cosine')
    plt.ylabel('Hour Sine')

    # Activity timeline (sample)
    plt.subplot(2, 3, 6)
    if len(df_processed) > 1000:
        # Fixed seed so the sampled timeline is identical on every run.
        sample_df = df_processed.sample(1000, random_state=42).sort_values('date')
    else:
        sample_df = df_processed.sort_values('date')

    plt.plot(sample_df['date'], range(len(sample_df)), alpha=0.7)
    plt.title('Activity Timeline (Sample)')
    plt.xlabel('Date')
    plt.ylabel('Activity Index')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

## Advanced Feature Engineering

In [None]:
if df_processed is not None:
    try:
        # Advanced feature engineering.
        # The pipeline runs on a separate name and is published to
        # df_processed only after every step has succeeded, so a failure part
        # way through does not leave df_processed rebound to a half-engineered
        # frame.
        # NOTE(review): this only protects the name binding; if any step
        # mutates its input in place the original frame is still affected —
        # confirm against the preprocessor / feature engineer implementations.
        df_engineered = preprocessor.sort_and_prepare(df_processed)
        df_engineered = feature_engineer.create_user_behavior_features(df_engineered)
        df_engineered = feature_engineer.encode_categorical_features(df_engineered)
        df_engineered = feature_engineer.detect_anomalies(df_engineered)
        df_engineered = feature_engineer.create_threat_labels(df_engineered)
        df_processed = df_engineered

        print("✅ Advanced feature engineering completed")
        print(f"Total features: {len(df_processed.columns)}")
        print(f"Threat ratio: {df_processed['is_threat'].mean():.3f}")

    except Exception as e:
        # Best-effort: report and carry on; the threat-analysis cell checks
        # for the 'is_threat' column before plotting.
        print(f"❌ Error in advanced feature engineering: {e}")

In [None]:
if df_processed is not None and 'is_threat' in df_processed.columns:
    # Threat analysis: six panels on a 2x3 grid.
    plt.figure(figsize=(12, 8))

    # Explicit masks for the two classes. Comparing against 0 / 1 works for
    # integer and boolean label columns alike and keeps every panel's data in
    # the same order as its labels.
    normal_mask = df_processed['is_threat'] == 0
    threat_mask = df_processed['is_threat'] == 1

    # Threat distribution
    plt.subplot(2, 3, 1)
    # Counted per class rather than via value_counts(), whose frequency order
    # can swap the labels and whose length drops to one when a class is absent.
    threat_counts = [int(normal_mask.sum()), int(threat_mask.sum())]
    plt.pie(threat_counts, labels=['Normal', 'Threat'], autopct='%1.1f%%')
    plt.title('Threat vs Normal Distribution')

    # Anomaly distribution
    plt.subplot(2, 3, 2)
    # NOTE(review): treats anomaly_score as a 0/1 flag with 1 = anomaly, which
    # is how the summary cell uses it when reporting its mean as a ratio —
    # confirm the encoding produced by detect_anomalies.
    anomaly_flag = df_processed['anomaly_score']
    anomaly_counts = [int((anomaly_flag == 0).sum()), int((anomaly_flag == 1).sum())]
    plt.pie(anomaly_counts, labels=['Normal', 'Anomaly'], autopct='%1.1f%%')
    plt.title('Anomaly Distribution')

    # User activity entropy distribution
    plt.subplot(2, 3, 3)
    plt.hist(df_processed['activity_entropy'], bins=20, alpha=0.7)
    plt.title('Activity Entropy Distribution')
    plt.xlabel('Entropy')
    plt.ylabel('Frequency')

    # Off-hours ratio by threat
    plt.subplot(2, 3, 4)
    # Built with named entries so each bar carries its own label; a class with
    # no rows yields NaN (no bar) instead of shifting the other class's bar
    # under the wrong tick.
    threat_offhours = pd.Series({
        'Normal': df_processed.loc[normal_mask, 'off_hours_ratio'].mean(),
        'Threat': df_processed.loc[threat_mask, 'off_hours_ratio'].mean(),
    })
    threat_offhours.plot(kind='bar')
    plt.title('Average Off-Hours Ratio by Threat Status')
    plt.xlabel('Threat Status')
    plt.ylabel('Off-Hours Ratio')
    plt.xticks(rotation=0)

    # Unique PCs by threat
    plt.subplot(2, 3, 5)
    threat_pcs = pd.Series({
        'Normal': df_processed.loc[normal_mask, 'unique_pcs'].mean(),
        'Threat': df_processed.loc[threat_mask, 'unique_pcs'].mean(),
    })
    threat_pcs.plot(kind='bar')
    plt.title('Average Unique PCs by Threat Status')
    plt.xlabel('Threat Status')
    plt.ylabel('Unique PCs')
    plt.xticks(rotation=0)

    # Feature correlation with threat
    plt.subplot(2, 3, 6)
    numeric_features = ['off_hours_ratio', 'weekend_ratio', 'activity_entropy',
                        'unique_pcs', 'anomaly_score']
    # Only correlate the features that the pipeline actually produced.
    available_features = [f for f in numeric_features if f in df_processed.columns]

    if available_features:
        # Drop the label's self-correlation by name rather than by position.
        correlations = (
            df_processed[available_features + ['is_threat']]
            .corr()['is_threat']
            .drop('is_threat')
        )
        correlations.plot(kind='bar')
        plt.title('Feature Correlation with Threat')
        plt.xlabel('Features')
        plt.ylabel('Correlation')
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

## Summary and Next Steps

In [None]:
if df_processed is None:
    # Nothing to summarise: an earlier stage left no processed frame behind.
    print("❌ Data exploration could not be completed due to data loading issues.")
    print("Please check your data files and try again.")
else:
    # Headline figures for the processed dataset.
    print("📊 Data Exploration Summary")
    print("=" * 50)
    print(f"Total Records: {len(df_processed):,}")
    print(f"Total Features: {len(df_processed.columns)}")
    print(f"Unique Users: {df_processed['user'].nunique():,}")
    print(f"Unique PCs: {df_processed['pc'].nunique():,}")
    print(f"Date Range: {df_processed['date'].min()} to {df_processed['date'].max()}")

    # Label-based ratios are only reported once the threat labels exist.
    has_threat_labels = 'is_threat' in df_processed.columns
    if has_threat_labels:
        print(f"Threat Ratio: {df_processed['is_threat'].mean():.1%}")
        print(f"Anomaly Ratio: {df_processed['anomaly_score'].mean():.1%}")

    # Pointers to the command-line entry points for the rest of the workflow.
    print("\n🚀 Next Steps:")
    next_steps = (
        "1. Run model training: python main.py train",
        "2. Evaluate model performance: python main.py evaluate --model-path path/to/model.h5",
        "3. Make predictions: python main.py predict --model-path path/to/model.h5 --input-file data.csv",
        "4. Try the complete demo: python main.py demo",
    )
    for step in next_steps:
        print(step)