# Exploratory Data Analysis (EDA)
## ServiceNow Incident Auto-Assignment

This notebook performs comprehensive exploratory data analysis on the incident data.

In [None]:
# Import libraries
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.preprocessing import DataLoader
from src.utils import load_config

# Configure plotting
# Global look-and-feel for every chart in this notebook: the matplotlib style
# sheet is applied first, then the seaborn colour palette is set on top of it.
# NOTE(review): the order looks deliberate — applying the style sheet after
# set_palette would presumably reset the colour cycle; confirm before reordering.
# NOTE(review): the 'seaborn-v0_8-*' style name presumably needs a recent
# matplotlib release — verify against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Suppress warnings
# Installs a blanket 'ignore' filter so library warnings do not clutter the
# cell output below.
# NOTE(review): no category is given, so this presumably also hides
# deprecation/future warnings from pandas and seaborn — narrow the filter by
# category if those should stay visible.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the project settings from the YAML file one directory above the notebook.
# The resulting `config` object is used by later cells for data-file and
# target-column lookups.
config_file = '../config.yaml'
config = load_config(config_file)

print("Configuration loaded successfully!")

In [None]:
# Load data
# Builds the project DataLoader and reads the incident CSV named in the config.
# Falls back to the bundled sample data when no usable file is available.
data_loader = DataLoader('../config.yaml')

# Defensive lookup (same style as the target_column lookup further down): a
# missing 'data' section or 'incident_csv' key should trigger the sample-data
# fallback rather than a KeyError.
csv_name = config.get('data', {}).get('incident_csv')
csv_path = Path('../data/raw/') / csv_name if csv_name else None

# is_file() rather than exists(): an empty or wrong name that resolves to a
# directory must not be handed to load_csv.
if csv_path is not None and csv_path.is_file():
    df = data_loader.load_csv(str(csv_path))
else:
    print("Data file not found. Using sample data...")
    # Imported lazily: only needed on the fallback path.
    from src.preprocessing import load_sample_data
    df = load_sample_data()

print(f"Dataset shape: {df.shape}")

In [None]:
# Display first few rows
# Quick visual check of column names and sample values right after loading.
df.head()

In [None]:
# Dataset information
# Structural overview of the loaded DataFrame via info(), as a schema check
# before any analysis.
df.info()

In [None]:
# Statistical summary
# NOTE(review): describe() is called with default arguments, which presumably
# limits the summary to numeric columns — pass include='all' if the text and
# categorical incident fields should appear too.
df.describe()

In [None]:
# Missing-value audit: per-column null count and its share of all rows.
missing = df.isna().sum()
missing_percent = missing.div(len(df)).mul(100)

# One row per column, worst offenders first.
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_percent})
missing_df = missing_df.sort_values(by='Missing Count', ascending=False)

# Report only the columns that actually contain nulls.
print("Missing Values:")
has_missing = missing_df['Missing Count'].gt(0)
print(missing_df.loc[has_missing])

In [None]:
# Target variable distribution
# Name of the label column; falls back to 'assignment_group' when the config
# has no data.target_column entry. `target_col` is reused by the summary cell.
target_col = config.get('data', {}).get('target_column', 'assignment_group')

if target_col in df.columns:
    # Count once and reuse for both the chart and the printed table.
    target_counts = df[target_col].value_counts()

    if target_counts.empty:
        # The column exists but holds no countable values: nothing to chart.
        print(f"Column '{target_col}' has no non-null values to plot.")
    else:
        plt.figure(figsize=(12, 6))
        target_counts.plot(kind='bar')
        plt.title(f'Distribution of {target_col}', fontsize=16, fontweight='bold')
        plt.xlabel(target_col, fontsize=12)
        plt.ylabel('Count', fontsize=12)
        # Class names can be long; slant and right-align them so they stay legible.
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

        print(f"\n{target_col} value counts:")
        print(target_counts)
else:
    # Make the skip visible instead of silently producing no output.
    print(f"Target column '{target_col}' not found in dataset; skipping distribution plot.")

In [None]:
# Text length analysis
# Adds a '<column>_length' column to df for each free-text field that is
# present, then plots one histogram of character lengths per field.
text_cols = ['short_description', 'description']
existing_text_cols = [col for col in text_cols if col in df.columns]

if existing_text_cols:
    for col in existing_text_cols:
        lengths = df[col].astype(str).str.len()
        # astype(str) turns a missing value into the literal 'nan' (length 3);
        # record missing text as length 0 instead so it does not skew the histogram.
        df[f'{col}_length'] = lengths.where(df[col].notna(), 0)

    # squeeze=False always returns a 2-D array of axes, so the single-column
    # case needs no special handling; flatten it for simple indexing.
    fig, axes = plt.subplots(
        1, len(existing_text_cols), figsize=(14, 5), squeeze=False
    )
    axes = axes.ravel()

    for ax, col in zip(axes, existing_text_cols):
        ax.hist(df[f'{col}_length'], bins=50, edgecolor='black')
        ax.set_title(f'{col} Length Distribution')
        ax.set_xlabel('Length')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlation heatmap across the numeric columns of df.
# Skipped unless at least two numeric columns exist.
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)

if len(numerical_cols) >= 2:
    correlation_matrix = df[numerical_cols].corr()

    plt.figure(figsize=(10, 8))
    # center=0 pins the midpoint of the diverging colour map to zero correlation.
    heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', center=0)
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Correlation Matrix', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# Print a headline EDA summary (nothing is written to disk).
# `target_col` comes from the target-distribution cell above. The feature count
# includes any '<column>_length' helper columns added by the text-length cell.
rule = "=" * 60

summary_lines = [
    f"Total Records: {len(df)}",
    f"Total Features: {df.shape[1]}",
    f"Missing Values: {df.isnull().sum().sum()}",
    f"Duplicate Rows: {df.duplicated().sum()}",
]
if target_col in df.columns:
    summary_lines.append(f"Target Classes: {df[target_col].nunique()}")

print("\n" + rule)
print("EDA Summary")
print(rule)
print("\n".join(summary_lines))
print(rule)