# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Step 1: Load the data
# Dataset source: https://www.kaggle.com/datasets/ravikumargattu/network-traffic-dataset
# The CSV path is relative, so the file must sit in the current working directory.
df = pd.read_csv('network_traffic.csv')
print("Dataset shape:", df.shape)

# Step 2: EDA - Basic overview
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
print(df['protocol'].value_counts(normalize=True))  # Relative frequency of each protocol

# Visualize numerical distributions: one histogram (with KDE overlay) per
# column, laid out side by side in a single row.
histogram_panels = [
    ('packet_size', 'Packet Size Distribution'),
    ('source_port', 'Source Port Distribution'),
    ('destination_port', 'Destination Port Distribution'),
]
plt.figure(figsize=(12, 4))
for position, (column, title) in enumerate(histogram_panels, start=1):
    plt.subplot(1, len(histogram_panels), position)
    sns.histplot(df[column], kde=True)
    plt.title(title)
plt.tight_layout()
plt.show()

# Categorical distribution: number of rows per protocol value
plt.figure(figsize=(6, 4))
sns.countplot(x='protocol', data=df)
plt.title('Protocol Distribution')
plt.show()

# Step 3: Preprocessing
# Select numerical features for clustering
numerical_features = ['packet_size', 'source_port', 'destination_port']
X = df[numerical_features]

# Derive a categorical column for the later groupby: 'internal' when source_ip
# starts with '192.168', otherwise 'external'.
# na=False makes rows with a missing source_ip evaluate to False. Without it
# the mask holds NaN for those rows, and NaN is truthy, so np.where would
# silently label them 'internal'.
is_internal = df['source_ip'].str.startswith('192.168', na=False)
df['traffic_type'] = np.where(is_internal, 'internal', 'external')

# Scale numerical features (StandardScaler standardizes each column) so that
# columns with large raw ranges do not dominate the clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: K-Means Clustering (k=3)
# n_init is passed explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so relying on the default makes the resulting clusters
# depend on the installed version. The integer form works on every version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Visualize clusters with PCA (reduce to 2D)
# The projection is used for plotting only; clustering ran on all scaled features.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=df['cluster'], palette='viridis')
plt.title('K-Means Clusters (PCA Projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

# Step 5: Evaluation - Silhouette Score
# Scores the k=3 labels stored in df['cluster'] against the scaled features.
sil_score = silhouette_score(X_scaled, df['cluster'])
print(f"Silhouette Score for k=3: {sil_score:.3f}")

# Elbow Method for k selection
# For every candidate k, record the inertia and the silhouette score so both
# criteria can be compared side by side.
inertia = []
sil_scores = []
k_range = range(2, 8)
for k in k_range:
    # n_init is passed explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so relying on the default makes the curves depend
    # on the installed version. The integer form works on every version.
    kmeans_temp = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans_temp.fit(X_scaled)
    inertia.append(kmeans_temp.inertia_)
    sil_scores.append(silhouette_score(X_scaled, kmeans_temp.labels_))

# Plot Elbow
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(k_range, inertia, 'bo-')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')

# Plot Silhouette Scores
plt.subplot(1, 2, 2)
plt.plot(k_range, sil_scores, 'ro-')
plt.title('Silhouette Scores')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.tight_layout()
plt.show()

# Step 6: Practical Use - Groupby Categorical and Aggregate Numerical
# Mean and row count of packet_size for every (cluster, protocol) pair,
# rounded to two decimals.
groupby_protocol = (
    df.groupby(['cluster', 'protocol'])['packet_size']
    .agg(['mean', 'count'])
    .round(2)
)
print("Average Packet Size by Cluster and Protocol:")
print(groupby_protocol)

# Same aggregation, grouped by the derived 'traffic_type' column instead.
groupby_type = (
    df.groupby(['cluster', 'traffic_type'])['packet_size']
    .agg(['mean', 'count'])
    .round(2)
)
print("\nAverage Packet Size by Cluster and Traffic Type:")
print(groupby_type)

# Visualize aggregate: bar height is seaborn's default estimate of packet_size
# per cluster, split by protocol.
_, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='cluster', y='packet_size', hue='protocol', ax=ax)
ax.set_title('Average Packet Size by Cluster and Protocol')
ax.set_xlabel('Cluster')
ax.set_ylabel('Average Packet Size')
plt.show()

# Step 7: Advanced - Compare with Different k
# Fit K-means with k=4 (from elbow/silhouette)
# n_init is passed explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so relying on the default makes the resulting clusters
# depend on the installed version. The integer form works on every version.
kmeans_k4 = KMeans(n_clusters=4, n_init=10, random_state=42)
df['cluster_k4'] = kmeans_k4.fit_predict(X_scaled)

# Silhouette for k=4
sil_k4 = silhouette_score(X_scaled, df['cluster_k4'])
print(f"Silhouette Score for k=4: {sil_k4:.3f}")

# Groupby for k=4: mean and row count of packet_size per (cluster_k4, protocol)
groupby_protocol_k4 = df.groupby(['cluster_k4', 'protocol'])['packet_size'].agg(['mean', 'count']).round(2)
print("Average Packet Size by Cluster (k=4) and Protocol:")
print(groupby_protocol_k4)