# K-Means Clustering and PCA Analysis of Intrusion Detection Network Data

In [None]:
#-----This code performs clustering and classification of attack types using K-Means and PCA on KDD Cup 1999 dataset----------------

# Install the feature_engine package before running the imports below
!pip install feature_engine

# Importing Essential Libraries for Advanced Data Analysis

In [None]:
import os
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # Data manipulation and analysis
import plotly.express as px  # Interactive 3D scatter plots (used by the PCA cells)
import seaborn as sns  # Statistical plots (used by the class-distribution chart)
from feature_engine.selection import DropDuplicateFeatures
from IPython.display import display  # Used for displaying dataframes and other outputs
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import zscore
from sklearn import metrics  # Machine learning evaluation metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import get_file  # Utilities for Keras, such as downloading files

In [None]:
# Absolute location of the KDD Cup 1999 CSV on the local machine
file_path = r"C:\home\data\kddcup99_csv.csv"

# Load the whole CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Exploring Target Labels and Dataset Overview

In [None]:

# ---------------------check target----------------
# Summarise the already-loaded DataFrame: its size and the set of target labels.
# `os`, `pd` and `get_file` come from the import cell at the top of the notebook.

# List the files under the Kaggle input directory (prints nothing when the
# directory does not exist, e.g. on a local machine).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Fetch the CSV through the Keras file cache; `path` is the cached location.
try:
    path = get_file('kddcup99_csv.csv', origin='file:///C:/home/data/kddcup99_csv.csv')
except Exception:
    # Report the failure, then let the original error propagate.
    # (A bare `except:` would also intercept KeyboardInterrupt/SystemExit.)
    print('Error downloading')
    raise

print("Read {} rows.".format(len(df)))
print('='*40)
print('The number of data points are:', df.shape[0])
print('='*40)
print('The number of features are:', df.shape[1])
print('='*40)

# Distinct values of the target column
output = df['label'].values
labels = set(output)
print('The different type of output labels are:', labels)
print('='*125)
print('The number of different output labels are:', len(labels))


# Identifying Duplicate Rows

In [None]:
# Boolean mask marking every row that repeats an earlier row
is_repeat = df.duplicated()
duplicate_rows = df.loc[is_repeat]
print("Number of duplicate rows:", duplicate_rows.shape[0])
print("Duplicate rows:", duplicate_rows, sep="\n")

# Clean Data and Remove nulls

In [None]:
#--------------Data Cleaning----------------------------------

# Checking for NULL values: count the rows holding at least one null.
# `axis` must be passed by keyword; pandas 2.x rejects the positional form any(1).
null_row_count = int(df.isnull().any(axis=1).sum())
print('Null values in dataset are', null_row_count)
print('='*40)

# Checking for DUPLICATE values: keep the first occurrence of every row
df.drop_duplicates(keep='first', inplace=True)

# For now, just drop NA's (rows with missing values).
# axis=0 removes rows; the previous axis=1 removed whole columns instead.
df.dropna(axis=0, inplace=True)

# Store the cleaned data in a pickle file so it can be reloaded later
df.to_pickle('df.pkl')

print("Read {} rows.".format(len(df)))

# Preview the first rows of the cleaned dataset
print(df.head())

# Print the cleaned dataset (pandas truncates the output for large frames)
print(df)

# Remove Duplicates

In [None]:

# Drop repeated rows in place, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

# Render the de-duplicated DataFrame
display(df)


# Quantifying Data Points

In [None]:
# Count the rows (data points) currently held in the DataFrame
num_data_points = len(df)
print("Number of data points:", num_data_points)


# Attack Type distribution

In [None]:

# Frequency of every attack label
class_distribution = df['label'].value_counts()

unique_classes = df['label'].unique()
print("Unique classes:", unique_classes)

# Positions of the classes ordered from most to least frequent
sorted_yi = np.argsort(-class_distribution.values)
total_rows = df.shape[0]
ordered_names = class_distribution.index[sorted_yi]
ordered_counts = class_distribution.values[sorted_yi]
for class_name, class_count in zip(ordered_names, ordered_counts):
    share = np.round((class_count / total_rows * 100), 3)
    print('Number of data points in class', class_name, ':', class_count, '(', share, '%)')

# Identify Unique class labels

In [None]:
# Distinct attack labels, in order of first appearance
unique_classes = pd.unique(df['label'])
print("Unique classes:", unique_classes)

# Distribution Bar Chart Plot

In [None]:

# Label frequencies; their index (most frequent first) fixes the bar order
class_distribution = df['label'].value_counts()

# Bar chart with one bar per attack label
plt.figure(figsize=(12, 6))
sns.countplot(x='label', data=df, order=class_distribution.index)
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()


# Distribution of Percentage Values

In [None]:
# Print every class with its absolute count and its share of all rows,
# most frequent class first
row_total = df.shape[0]
sorted_yi = np.argsort(-class_distribution.values)
for position in sorted_yi:
    class_count = class_distribution.values[position]
    print('Number of data points in class', class_distribution.index[position], ':', class_count,
          '(', np.round((class_count / row_total * 100), 3), '%)')


# Attack Distribution Count

In [None]:
#----------------attack distribution-------------------------
class_distribution = df['label'].value_counts()
sorted_yi = class_distribution.index

# Share of each class in percent, rounded to three decimals
percentages = class_distribution.div(df.shape[0]).mul(100).round(3)

# Assemble the summary table and order it by descending count
table_data = (
    pd.DataFrame({'Class': sorted_yi, 'Count': class_distribution, 'Percentage': percentages})
    .sort_values(by='Count', ascending=False)
    .reset_index(drop=True)
)

# Last expression of the cell: the notebook renders the table
table_data

# Descriptive Statistics: Z-Score Analysis

In [None]:

def analyze(df):
    """Display the mean and standard deviation of the z-scores of each numeric column.

    Non-numeric columns are ignored. The summary table is rendered with
    ``display``; nothing is returned.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    standardized = numeric_part.apply(zscore)

    summary = pd.DataFrame({
        'Column': numeric_part.columns,
        'Z-Score Mean': standardized.mean(),
        'Z-Score Std': standardized.std(),
    })
    display(summary)


# Run the z-score summary on the working DataFrame
analyze(df)

# Categorical Data Analysis

In [None]:

def expand_categories(values):
    """Return a string listing every distinct value with its share in percent.

    The result looks like ``[a:66.67%,b:33.33%]``, ordered as returned by
    ``value_counts`` (most frequent first) and rounded to two decimals.
    """
    counts = values.value_counts()
    total = float(len(values))
    parts = [
        f"{value}:{round(100 * (count / total), 2)}%"
        for value, count in zip(counts.index, counts.values)
    ]
    return "[" + ",".join(parts) + "]"
        
def analyze(df):
    """Summarise every column of ``df`` and return the summaries as a DataFrame.

    Columns with more than 100 distinct values are reported with their unique
    count and that count as an integer percentage of the row count. All other
    columns are reported with the per-value shares from ``expand_categories``.
    """
    row_count = float(len(df))
    summaries = []  # one dict per column

    for col in df.columns.values:
        distinct = len(df[col].unique())
        if distinct > 100:
            entry = {
                'Column': col,
                'UniqueCount': distinct,
                'Percentage': int((distinct / row_count) * 100),
            }
        else:
            entry = {'Column': col, 'Categories': expand_categories(df[col])}
        summaries.append(entry)

    return pd.DataFrame(summaries)


# Summarise the working DataFrame and render the result as a table
analysis_result = analyze(df)
display(analysis_result)


# Text Encoding and Label Distribution Analysis

In [None]:

# One-hot encode a text column in place
# (i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """Replace column ``name`` of ``df`` with one indicator column per value.

    The new columns are called ``"<name>-<value>"`` and are appended to ``df``;
    the original column is then removed. ``df`` is modified in place.
    """
    indicator_frame = pd.get_dummies(df[name])
    for level, indicator in indicator_frame.items():
        df[f"{name}-{level}"] = indicator
    df.drop(columns=[name], inplace=True)

# Feature matrix: every column except the target
x_columns = df.columns.drop('label')
x = df[x_columns].to_numpy()

# Target matrix: one indicator column per attack label (classification)
dummies = pd.get_dummies(df['label'])
outcomes = dummies.columns
y = dummies.to_numpy()
num_classes = len(outcomes)

# Number of rows per label (rendered by the notebook as the cell result)
df.groupby('label')['label'].count()

# Data Encoding and Normalization for Network Traffic Features

In [None]:
# Define the encoding functions
def encode_numeric_zscore(df, column_name):
    """Standardise ``df[column_name]`` in place: subtract its mean, divide by its std."""
    column = df[column_name]
    centered = column - column.mean()
    df[column_name] = centered / column.std()

def encode_text_dummy(df, column_name):
    """One-hot encode ``df[column_name]`` in place and return ``df``.

    One indicator column per distinct value is appended, named
    ``"<column_name>_<value>"``, and the original column is removed.

    The frame passed in is modified directly, like ``encode_numeric_zscore``,
    so a bare call ``encode_text_dummy(df, col)`` takes effect. The frame is
    also returned, so ``df = encode_text_dummy(df, col)`` keeps working.
    The previous version built a new frame with ``pd.concat`` and left the
    caller's frame untouched, which made bare calls silent no-ops.
    """
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    for dummy_name in dummies.columns:
        df[dummy_name] = dummies[dummy_name]
    df.drop(columns=[column_name], inplace=True)
    return df

# Apply the encoding functions to the DataFrame 'df'

# Continuous features to standardise in place.
# 'protocol_type', 'service' and 'logged_in' are left out on purpose (their
# calls were disabled in the original cell).
# NOTE(review): 'num_failed_logins' was listed three times in the original.
# Re-standardising an already standardised column changes nothing, so the
# repeats are dropped. They look like copy-paste placeholders for other
# counters; confirm against the CSV header whether more columns belong here.
zscore_columns = [
    'duration',
    'src_bytes',
    'dst_bytes',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'lnum_compromised',
    'lroot_shell',
    'lsu_attempted',
    'lnum_shells',
    'lnum_access_files',
    'lnum_outbound_cmds',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
]
for column_name in zscore_columns:
    encode_numeric_zscore(df, column_name)

# One-hot encode the binary login flags. encode_text_dummy returns the encoded
# frame, so the result has to be assigned back; discarding it (as the original
# calls did) leaves 'df' unchanged.
for column_name in ('is_host_login', 'is_guest_login'):
    df = encode_text_dummy(df, column_name)

In [None]:
# Remove every column that still contains a missing value
# NOTE(review): presumably aimed at columns that became NaN during z-scoring
# (zero standard deviation) - confirm.
df.dropna(axis='columns', inplace=True)

# Preview up to the first 494020 rows of the encoded feature table
df[:494020]

# Class Probability Analysis with One-Hot Encoding
This code handles categorical columns through one-hot encoding. After training the logistic regression model, it prints the average predicted probability for each class.

In [None]:
# Target column
y = df['label']

# Predictors: everything except the target and columns derived from it.
# The original one-hot encoded 'label' itself into X, which handed the answer
# to the model (target leakage). 'label_encoded' and 'cluster' are created by
# later cells and are excluded too in case this cell is re-run afterwards.
predictors = df.drop(columns=['label', 'label_encoded', 'cluster'], errors='ignore')

numeric_columns = predictors.select_dtypes(include='number')

# Perform one-hot encoding on categorical columns
categorical_columns = predictors.select_dtypes(exclude='number')
encoded_columns = pd.get_dummies(categorical_columns)

# Concatenate numeric and encoded categorical columns
X = pd.concat([numeric_columns, encoded_columns], axis=1)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X, y)

# Predict probabilities for all instances in the dataset
probabilities = model.predict_proba(X)

# Calculate the average probabilities for each class
average_probabilities = probabilities.mean(axis=0)

# Display the average probabilities (column order follows model.classes_)
class_labels = model.classes_
for label, prob in zip(class_labels, average_probabilities):
    print(f'Average probability for {label}: {prob}')

# Binary Representation of Attack Types
Printing the one-hot encoded labels is useful for inspecting how the encoding transformed the categorical 'label' column into a format suitable for machine learning models.

In [None]:

# Build one indicator column per attack label and show the resulting table
label_dummies = pd.get_dummies(data=df['label'])
print(label_dummies)


# K-Means Clustering on Numeric Features

In [None]:
# Select the numeric features for clustering. A 'cluster' column left over
# from an earlier run is excluded, otherwise previous cluster ids would be
# fed back in as a feature.
numeric_columns = df.select_dtypes(include='number').columns.drop('cluster', errors='ignore')
features = df[numeric_columns].copy()

# Standardisation: zero mean and unit variance per feature
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the result reproducible.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(scaled_features)
labels = kmeans.labels_

# Attach the cluster id of every row to the DataFrame
df['cluster'] = labels
print(df.head())
print(df)


# Visualizing K-Means Clustering Across Multiple Numeric Features

In [None]:
# Numeric features only. The 'cluster' column written by the previous
# clustering cell is excluded so cluster ids are never used as a feature.
numeric_columns = df.select_dtypes(include='number').columns.drop('cluster', errors='ignore')
features = df[numeric_columns].copy()
num_features = len(features.columns)

# Standardise, then cluster (n_init pinned for reproducibility across
# scikit-learn releases)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(scaled_features)
labels = kmeans.labels_
df['cluster'] = labels

# Choose a random subset of features to plot
num_features_to_plot = min(num_features, 10)  # Adjust the number as needed
random.seed(42)
random_features = random.sample(features.columns.tolist(), num_features_to_plot)

# Column position of every sampled feature, looked up once instead of per plot
feature_positions = [features.columns.get_loc(name) for name in random_features]

# One scatter plot per unordered pair of sampled features, coloured by cluster.
# The 2-second sleep after each plot was removed: plt.show() already renders
# the figure, so the pause only added idle time.
for i in range(num_features_to_plot - 1):
    for j in range(i + 1, num_features_to_plot):
        plt.figure(figsize=(6, 6))
        plt.scatter(scaled_features[:, feature_positions[i]],
                    scaled_features[:, feature_positions[j]],
                    c=labels, cmap='viridis')
        plt.xlabel(random_features[i])
        plt.ylabel(random_features[j])
        plt.title(f"Scatter plot of {random_features[i]} vs. {random_features[j]}")
        plt.show()


# Network Attack Classification

In [None]:
# Define the feature(s) you want to use for clustering
X = df[['duration','src_bytes', 'dst_bytes','dst_host_srv_count', 'dst_host_same_srv_rate','wrong_fragment', 'logged_in', 'srv_count',
            'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
            'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
            'dst_host_srv_rerror_rate']]

# Perform K-means clustering with k=4 (n_init pinned for reproducibility
# across scikit-learn releases)
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X)

# Add cluster labels to the DataFrame
df['cluster'] = kmeans.labels_

# Create a figure with one subplot per feature (15 features -> 5 x 3 grid)
fig, axes = plt.subplots(5, 3, figsize=(15, 20))
axes = axes.flatten()

# Define colors for each attack group
cluster_colors = {
    'smurf': 'blue',
    'neptune': 'red',
    'normal': 'green',
    'others': 'gray'
}

# K-means cluster ids are arbitrary, so a fixed id -> attack-name table cannot
# be right in general. Each cluster is instead named after the attack label
# most of its rows carry; anything other than smurf/neptune/normal is shown
# as 'others'.
majority_label = df.groupby('cluster')['label'].agg(
    lambda members: members.value_counts().idxmax()
)
cluster_mapping = {
    cluster_id: (name if name in cluster_colors else 'others')
    for cluster_id, name in majority_label.items()
}

# Per-point colours are the same for every subplot, so compute them once
point_colors = df['cluster'].map(cluster_mapping).map(cluster_colors)
cluster_ids = sorted(cluster_mapping)
cluster_names = [cluster_mapping[cluster_id] for cluster_id in cluster_ids]

# Iterate over each feature and plot it against the cluster of every row
for i, feature in enumerate(X.columns):
    axes[i].scatter(X[feature], df['cluster'], c=point_colors)
    # Label the y ticks with the attack name assigned to each cluster id
    axes[i].set_yticks(cluster_ids)
    axes[i].set_yticklabels(cluster_names)
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel('Attack Labels')
    axes[i].set_title(f' {feature} vs. attack labels')
    axes[i].grid(True)

# Create a single legend for all plots
legend_scatter = [
    plt.Line2D([0], [0], marker='o', color='w', label=cluster_label, markerfacecolor=color, markersize=10)
    for cluster_label, color in cluster_colors.items()
]

# Add legend outside the subplots on top
fig.legend(handles=legend_scatter, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=len(cluster_colors))

plt.tight_layout()
plt.show()


# 3D Visualization of Network Attack Types with PCA and K-Means Clustering

In [None]:
# Collapse the 'label' column to four groups: the three named classes keep
# their name, every other label becomes 'others'
label_mapping = {
    'smurf': 'smurf',
    'neptune': 'neptune',
    'normal': 'normal',
}
df['label_encoded'] = df['label'].map(label_mapping).fillna('others')

# Define the features you want to use for clustering
features = ['duration', 'src_bytes', 'dst_bytes']  # Add more features if needed
X = df[features]

# Perform K-means clustering with k=4 (n_init pinned for reproducibility
# across scikit-learn releases)
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X)

# Add cluster labels to the DataFrame
df['cluster'] = kmeans.labels_

# Standardize the data
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Perform PCA with three components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_std)

# Create an interactive 3D scatter plot using Plotly Express
# (requires `import plotly.express as px`).
# Cluster ids are categories, not quantities: passing them as strings gives
# each cluster its own discrete colour instead of a continuous colour scale.
# Passing a plain array also makes the 'color' entry of `labels` take effect.
cluster_categories = df['cluster'].astype(str).to_numpy()
fig = px.scatter_3d(
    df,
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    z=X_pca[:, 2],
    color=cluster_categories,
    labels={'x': 'PC1', 'y': 'PC2', 'z': 'PC3', 'color': 'Cluster'},
    title='Interactive 3D Scatter Plot of DOS Attacks with K-means Clusters',
)

# Customize the appearance of the plot
fig.update_traces(marker=dict(size=5), selector=dict(mode='markers'))

# Show the plot
fig.show()


# PCA Visualization of Network Attack Types in 3D

In [None]:
# Select the numerical columns for PCA. The 'cluster' column written by the
# K-means cells is a derived id, not a measurement, so it is left out.
numeric_columns = df.select_dtypes(include='number').columns.drop('cluster', errors='ignore')
selected_data_numeric = df[numeric_columns].copy()

# Standardization
scaler = StandardScaler()
scaled_data = scaler.fit_transform(selected_data_numeric)

# Perform PCA to reduce dimensions to 3D
pca = PCA(n_components=3)
pca_features = pca.fit_transform(scaled_data)

# Get the labels from the original DataFrame
labels = df['label'].values

# Calculate counts for each label
label_counts = df['label'].value_counts()

# Colour of every label: the three named classes get their own colour,
# every other label is gray
named_colors = {'smurf': 'blue', 'neptune': 'red', 'normal': 'green'}
label_color_map = {
    label: named_colors.get(label, 'gray')
    for label in label_counts.index.tolist()
}

# Assign colors to each data point based on its label
colors = [label_color_map[label] for label in labels]

# Plot the 3D PCA result with different colors for different labels
fig = plt.figure(figsize=(7, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca_features[:, 0], pca_features[:, 1], pca_features[:, 2], c=colors)
ax.set_xlabel('pca 1', fontsize=12)
ax.set_ylabel('pca 2', fontsize=12)
ax.set_zlabel('pca 3', fontsize=12)
ax.set_title('')

legend_elements = [
    plt.Line2D([0], [0], marker='o', color='w', label=group, markerfacecolor=color, markersize=10)
    for group, color in (('smurf', 'blue'), ('neptune', 'red'), ('normal', 'green'), ('others', 'gray'))
]
ax.legend(handles=legend_elements, loc='upper right')

# Reconstruction of Data (for Anomaly Detection): map the 3D projection back
# into the standardized feature space
reconstructed_data = pca.inverse_transform(pca_features)

# Calculation of Reconstruction Error (for Anomaly Detection).
# For a 2-D array, ord=2 is the spectral norm (largest singular value) of the
# residual matrix.
reconstruction_error = np.linalg.norm(scaled_data - reconstructed_data, ord=2)

# The closing parenthesis was missing here, which made the whole cell a SyntaxError
print("Reconstruction Error:", reconstruction_error)

# Adjust subplot layout to center
plt.tight_layout()

plt.show()

# Multi-angle 3D PCA Visualization of Network Attacks

In [None]:
# Select the numerical columns for PCA. The 'cluster' column written by the
# K-means cells is a derived id, not a measurement, so it is left out.
numeric_columns = df.select_dtypes(include='number').columns.drop('cluster', errors='ignore')
selected_data_numeric = df[numeric_columns].copy()

# Standardization
scaler = StandardScaler()
scaled_data = scaler.fit_transform(selected_data_numeric)

# Perform PCA to reduce dimensions to 3D
pca = PCA(n_components=3)
pca_features = pca.fit_transform(scaled_data)

# Colour per label group
label_color_map = {
    'smurf': 'blue',
    'neptune': 'red',
    'normal': 'green',
    'others': 'gray',
}

# Create a list of colors for each data point based on its label, with a default color of 'gray'
colors = [label_color_map.get(label, 'gray') for label in df['label'].values]

# Create the 3D PCA plot from different angles
fig = plt.figure(figsize=(15, 10))

# Camera positions as (elevation, azimuth) in degrees, one per subplot
view_points = [
    (30, 30),
    (120, 45),
    (100, 180),
    (45, 360),
]

# Draw the same point cloud four times, once per camera position
for i, (elev, azim) in enumerate(view_points):
    ax = fig.add_subplot(2, 2, i + 1, projection='3d')
    ax.scatter(pca_features[:, 0], pca_features[:, 1], pca_features[:, 2], c=colors)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    ax.set_title('')
    ax.view_init(elev=elev, azim=azim)

plt.suptitle('', fontsize=20)
plt.tight_layout()
plt.show()


# 3D Interactive Visualization

In [None]:
# Select the numerical columns for PCA. The 'cluster' column written by the
# K-means cells is a derived id, not a measurement, so it is left out.
numeric_columns = df.select_dtypes(include='number').columns.drop('cluster', errors='ignore')
selected_data_numeric = df[numeric_columns].copy()

# Standardization
scaler = StandardScaler()
scaled_data = scaler.fit_transform(selected_data_numeric)

# Perform PCA to reduce dimensions to 3D
pca = PCA(n_components=3)
pca_features = pca.fit_transform(scaled_data)

# Colour per label group
label_color_map = {
    'smurf': 'blue',
    'neptune': 'red',
    'normal': 'green',
    'others': 'gray',
}

# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(pca_features, columns=['PC1', 'PC2', 'PC3'])

# Attach labels by position. 'df' keeps its original, non-contiguous index
# after duplicate removal, while pca_df has a fresh 0..n-1 index. Assigning
# the Series directly would align on index and pair rows with the wrong label
# (or NaN).
pca_df['Label'] = df['label'].to_numpy()

# Give every label that has no colour of its own the default 'gray'
color_discrete_map = {
    label: label_color_map.get(label, 'gray')
    for label in pca_df['Label'].unique()
}

# Plot the 3D PCA result with plotly (requires `import plotly.express as px`)
fig = px.scatter_3d(
    pca_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Label',
    color_discrete_map=color_discrete_map,
    title='3D PCA'
)

fig.update_layout(scene = dict(
                    xaxis_title='Principal Component 1',
                    yaxis_title='Principal Component 2',
                    zaxis_title='Principal Component 3'))
fig.show()
