# In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Read the traffic records from CSV and replace every missing cell with 0.
# NOTE(review): the zero-fill covers the whole frame, not only the model
# features -- confirm that 0 is a sensible default for every column.
df = pd.read_csv('network_traffic.csv').fillna(0)

# Columns used as model inputs. They are standardised with StandardScaler
# so that every feature is on a comparable scale before training.
features = [
    'packet_size',
    'duration',
    'num_packets',
    'bytes_sent',
    'bytes_received',
]
X = df.loc[:, features]

# Learn the scaling statistics on X, then apply them to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Build and fit the Isolation Forest on the scaled features in one step
# (fit() returns the estimator itself). `contamination` is the assumed
# share of anomalies in the data; `random_state` pins the tree
# construction so repeated runs give the same model.
isolation_forest = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
).fit(X_scaled)

# Prediction: IsolationForest.predict() yields -1 for anomalies and 1 for
# normal points. Store it re-encoded as 1 = anomaly, 0 = normal.
# The comparison is vectorised over the whole array instead of calling a
# Python lambda once per row, and the column never holds the raw -1/1
# encoding in between.
# NOTE(review): presumably df['actual'] uses the same 1 = anomaly / 0 = normal
# encoding -- confirm against the dataset.
df['prediction'] = (isolation_forest.predict(X_scaled) == -1).astype('int64')

# Evaluation metrics: report how many rows were flagged, then compare the
# 0/1 predictions against the labels stored in df['actual'].
y_pred = df['prediction']
print("Anomalies detected:", y_pred.sum())

y_true = df['actual']
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred))

# Plot anomaly scores: store decision_function() output per row and draw
# its histogram with a KDE overlay. Per the scikit-learn documentation,
# lower scores are more abnormal and negative scores are outliers.
df['anomaly_score'] = isolation_forest.decision_function(X_scaled)

_, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['anomaly_score'], kde=True, ax=ax)
ax.set_title("Anomaly Score Distribution")
ax.set_xlabel("Anomaly Score")
plt.show()

# Save the fitted artefacts for future use.
# The forest was trained on StandardScaler output, so the fitted scaler is
# persisted next to it: without it, new traffic cannot be transformed the
# same way before being scored by the reloaded model.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
import joblib
joblib.dump(isolation_forest, 'data_exfiltration_detector.pkl')
print("Model saved as data_exfiltration_detector.pkl")
joblib.dump(scaler, 'data_exfiltration_scaler.pkl')
print("Scaler saved as data_exfiltration_scaler.pkl")
