# CyberIntent-AI: Data Exploration Notebook

This notebook demonstrates data exploration and feature engineering for the CyberIntent-AI system.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE: this silences every warning raised for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Plot defaults shared by all figures in this notebook
DEFAULT_FIGSIZE = (12, 6)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = DEFAULT_FIGSIZE

print("Libraries imported successfully!")

## 1. Load and Explore Sample Data

In [None]:
# Read the sample log events into the working DataFrame used by every later cell
SAMPLE_LOGS_CSV = 'data/sample_logs.csv'
df = pd.read_csv(SAMPLE_LOGS_CSV)

# Structural overview: dimensions first, then the dtype of each column
print("Data shape: {}".format(df.shape))
print("\nColumn names and types:", df.dtypes, sep="\n")
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a rich table
df.head()

In [None]:
# Count null entries per column, then show summary statistics
null_counts = df.isnull().sum()
print("Missing values:", null_counts, sep="\n")
print("\nBasic statistics:")
# Bare last expression so the notebook renders the statistics as a rich table
df.describe()

In [None]:
# Visualize anomaly score distribution next to the intent label counts
score_threshold = 0.7  # position of the dashed red marker line
dist_fig, (score_ax, label_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: histogram of anomaly scores with the threshold marked
score_ax.hist(df['anomaly_score'], bins=30, edgecolor='black', alpha=0.7)
score_ax.axvline(
    x=score_threshold, color='red', linestyle='--', linewidth=2, label='Threshold'
)
score_ax.set_xlabel('Anomaly Score')
score_ax.set_ylabel('Frequency')
score_ax.set_title('Anomaly Score Distribution')
score_ax.legend()

# Right panel: number of events per intent label
label_counts = df['intent_label'].value_counts()
label_counts.plot(kind='bar', ax=label_ax)
label_ax.set_xlabel('Intent Label')
label_ax.set_ylabel('Count')
label_ax.set_title('Intent Label Distribution')
# Tilt the category labels so longer names do not overlap
plt.setp(label_ax.get_xticklabels(), rotation=45)

dist_fig.tight_layout()
plt.show()

## 2. Feature Analysis

In [None]:
# Analyze key features
# Raw numeric columns to inspect; the correlation cell below reuses this list.
numeric_cols = ['bytes_sent', 'bytes_received', 'duration', 'failed_logins', 'successful_logins']

# Derive the grid shape from the number of columns so the layout keeps working
# if the list above grows or shrinks (3 panels per row, ~4 inches of height per row).
grid_cols = 3
grid_rows = -(-len(numeric_cols) // grid_cols)  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(15, 4 * grid_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    # Drop NaNs first: histogram binning cannot derive a range from non-finite values.
    ax.hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide grid slots that have no feature assigned (e.g. the 6th panel of a 2x3 grid
# holding 5 features); otherwise they render as empty, unlabeled frames.
for unused_ax in axes[len(numeric_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Pairwise correlation between the raw numeric features (DataFrame.corr defaults)
numeric_df = df[numeric_cols]
correlation_matrix = numeric_df.corr()

# Annotated heatmap with a diverging palette centered on zero correlation
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Engineer new features
# Each ratio adds a small constant to its denominator so it stays non-zero
# when the raw value is 0.
sent = df['bytes_sent']
received = df['bytes_received']

df['total_bytes'] = sent + received
df['bytes_ratio'] = sent / (received + 1)
df['login_ratio'] = df['failed_logins'] / (df['successful_logins'] + 1)
# NOTE(review): despite the name, this is bytes sent per unit of duration,
# not a packet count rate.
df['packet_rate'] = sent / (df['duration'] + 0.01)

derived_cols = ['total_bytes', 'bytes_ratio', 'login_ratio', 'packet_rate']
print("Engineered features created:")
print(df[derived_cols].head())
print("\nNew data shape: {}".format(df.shape))

In [None]:
# Summary statistics for the derived columns
engineered_cols = ['total_bytes', 'bytes_ratio', 'login_ratio', 'packet_rate']

print("\nEngineered Features Statistics:")
print(df.loc[:, engineered_cols].describe())

## 4. Threat Analysis

In [None]:
# Compare anomalies by intent label
# Aggregations applied to each intent_label group
per_label_stats = {
    'anomaly_score': ['mean', 'max', 'std'],
    'bytes_sent': 'mean',
    'failed_logins': 'mean',
}
grouped_by_label = df.groupby('intent_label')
threat_comparison = grouped_by_label.agg(per_label_stats).round(3)

print("Threat Type Comparison:")
print(threat_comparison)

In [None]:
# Visualize threat patterns
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per side-by-side panel: (column to plot, panel title, y-axis label)
panels = [
    ('anomaly_score', 'Anomaly Scores by Threat Type', 'Anomaly Score'),
    ('total_bytes', 'Total Bytes by Threat Type', 'Total Bytes'),
]

for ax, (column, title, ylabel) in zip(axes, panels):
    # Boxplot of the column, grouped by intent label, drawn on this panel
    df.boxplot(column=column, by='intent_label', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Intent Label')
    ax.set_ylabel(ylabel)
    # Make this panel current so the pyplot tick rotation targets it
    plt.sca(ax)
    plt.xticks(rotation=45)

plt.suptitle('')  # Remove the automatic title
plt.tight_layout()
plt.show()

## 5. Summary and Insights

In [None]:
# Headline counts
print("Data Exploration Summary:")
print("=" * 50)
print(f"Total events: {len(df)}")

# "Normal" is counted by label, "anomalous" by score, so the two need not sum to the total.
benign_count = (df['intent_label'] == 'benign').sum()
print(f"Normal events: {benign_count}")
flagged_count = (df['anomaly_score'] > 0.7).sum()
print(f"Anomalous events: {flagged_count}")

print("\nThreat breakdown:")
print(df['intent_label'].value_counts())

# Min/max of selected columns: (display name, column, number format)
print("\nFeature range analysis:")
range_rows = [
    ("Anomaly scores", 'anomaly_score', '.3f'),
    ("Bytes sent", 'bytes_sent', '.0f'),
    ("Failed logins", 'failed_logins', '.0f'),
]
for display_name, column, number_format in range_rows:
    lowest = df[column].min()
    highest = df[column].max()
    print(f"- {display_name}: {lowest:{number_format}} to {highest:{number_format}}")