# CyberIntent-AI: Model Training Experiments

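This notebook demonstrates model training, evaluation, and experimentation: it fits an Isolation Forest anomaly detector and a Gradient Boosting intent classifier on sample log data, then reports evaluation metrics, feature importances, and score distributions.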

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot appearance: seaborn grid theme plus a wide default figure size.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 1. Load and Prepare Data

In [None]:
# Read the sample log events from disk.
df = pd.read_csv('data/sample_logs.csv')

# Columns used as model inputs, listed one per line.
feature_cols = [
    'src_port',
    'dst_port',
    'bytes_sent',
    'bytes_received',
    'duration',
    'failed_logins',
    'successful_logins',
]

# Feature matrix and target column are both taken from the same frame,
# so they share its row index.
X = df.loc[:, feature_cols]
y = df.loc[:, 'intent_label']

print(f"Feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")
print("\nIntent distribution:")
print(y.value_counts())

## 2. Split Data

In [None]:
# Hold out 20% of the rows for testing. `stratify=y` keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the size and label mix of each part.
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print("\nTraining set intent distribution:")
print(y_train.value_counts())
print("\nTest set intent distribution:")
print(y_test.value_counts())

## 3. Train Anomaly Detector

In [None]:
from sklearn.ensemble import IsolationForest

# Fit the Isolation Forest on the training features only (no labels are passed).
anomaly_detector = IsolationForest(contamination=0.1, random_state=42).fit(X_train)

# Hard predictions (-1 / 1) for the train and test splits, in that order.
y_pred_train, y_pred_test = (
    anomaly_detector.predict(part) for part in (X_train, X_test)
)

# Continuous decision-function scores for the same two splits.
y_scores_train, y_scores_test = (
    anomaly_detector.decision_function(part) for part in (X_train, X_test)
)

# Fraction of rows flagged as anomalous (-1) in each split.
train_anomaly_rate = np.mean(y_pred_train == -1)
test_anomaly_rate = np.mean(y_pred_test == -1)

print("Anomaly Detector Training Complete")
print(f"Training predictions (-1=anomaly, 1=normal): {np.unique(y_pred_train)}")
print(f"Anomaly rate in training: {train_anomaly_rate:.2%}")
print(f"Anomaly rate in test: {test_anomaly_rate:.2%}")

## 4. Train Intent Predictor

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels, then apply that
# same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# Gradient boosting hyperparameters, gathered in one place.
gb_params = dict(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
intent_predictor = GradientBoostingClassifier(**gb_params)
intent_predictor.fit(X_train, y_train_encoded)

# Predicted (encoded) classes for each split.
y_pred_train_intent = intent_predictor.predict(X_train)
y_pred_test_intent = intent_predictor.predict(X_test)

# Per-class probability estimates for each split.
y_proba_train = intent_predictor.predict_proba(X_train)
y_proba_test = intent_predictor.predict_proba(X_test)

# Accuracy = share of rows whose predicted class equals the encoded label.
train_accuracy = np.mean(y_pred_train_intent == y_train_encoded)
test_accuracy = np.mean(y_pred_test_intent == y_test_encoded)

print("Intent Predictor Training Complete")
print(f"\nTraining accuracy: {train_accuracy:.3f}")
print(f"Test accuracy: {test_accuracy:.3f}")

## 5. Model Evaluation

In [None]:
# Classification report for the intent predictor.
#
# The report is pinned to every class the encoder knows (0 .. n_classes-1).
# Without `labels`, scikit-learn only uses the classes that occur in
# y_true/y_pred and raises when that count differs from len(target_names),
# which happens whenever a class is missing from both the test labels and
# the predictions.
class_ids = list(range(len(le.classes_)))

# target_names are display strings; stringify in case the labels are not str.
class_names = [str(name) for name in le.classes_]

print("Classification Report - Intent Predictor:")
print("="*50)
print(classification_report(
    y_test_encoded,
    y_pred_test_intent,
    labels=class_ids,
    target_names=class_names,
))

In [None]:
# Confusion matrix for the intent predictor.
#
# Pass the full encoded label set so the matrix always has exactly one
# row/column per encoder class, in le.classes_ order. By default
# confusion_matrix only includes labels that appear in y_true or y_pred, so a
# class absent from both would shrink the matrix and shift the tick labels
# onto the wrong rows/columns.
class_ids = list(range(len(le.classes_)))
cm = confusion_matrix(y_test_encoded, y_pred_test_intent, labels=class_ids)

plt.figure(figsize=(8, 6))
# Rows are true classes, columns are predicted classes; cells are raw counts.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix - Intent Predictor')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 6. Feature Importance

In [None]:
# Pair each feature name with the classifier's importance for it and rank the
# rows from most to least important.
feature_importance = pd.DataFrame(
    dict(feature=feature_cols, importance=intent_predictor.feature_importances_)
).sort_values(by='importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

# Horizontal bar chart of the ranked importances.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(feature_importance['feature'], feature_importance['importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Feature Importance - Intent Predictor')
importance_fig.tight_layout()
plt.show()

## 7. Model Score Distributions

In [None]:
# Confidence of the intent predictor = highest class probability per row.
max_proba_train = y_proba_train.max(axis=1)
max_proba_test = y_proba_test.max(axis=1)

# Two side-by-side panels: anomaly scores (left), intent confidence (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, train values, test values, x-axis label, panel title)
panels = [
    (axes[0], y_scores_train, y_scores_test,
     'Anomaly Score', 'Anomaly Score Distribution'),
    (axes[1], max_proba_train, max_proba_test,
     'Max Probability', 'Intent Prediction Confidence Distribution'),
]

for ax, train_values, test_values, x_label, panel_title in panels:
    # Overlay the train and test histograms on the same axis.
    ax.hist(train_values, bins=30, alpha=0.7, label='Train', edgecolor='black')
    ax.hist(test_values, bins=30, alpha=0.7, label='Test', edgecolor='black')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    ax.legend()

fig.tight_layout()
plt.show()

## 8. Summary

In [None]:
# Headline numbers for the held-out test split.
n_test_events = len(y_pred_test)
n_test_anomalies = (y_pred_test == -1).sum()
test_intent_accuracy = (y_pred_test_intent == y_test_encoded).mean()

print("Model Training Summary")
print("="*60)
print(f"\nAnomalies Detected: {n_test_anomalies} out of {n_test_events} events")
print(f"Intent Prediction Accuracy: {test_intent_accuracy:.1%}")
print(f"\nAverage Anomaly Score: {y_scores_test.mean():.3f}")
print(f"Average Intent Confidence: {max_proba_test.mean():.3f}")
print("\nTop 3 Important Features:")
# feature_importance is already sorted descending, so head(3) is the top three.
for row in feature_importance.head(3).to_dict('records'):
    print(f"  {row['feature']}: {row['importance']:.4f}")