# Alert 

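## The data is loaded from my personal Google Drive. To run this notebook you need your own copy of the UNSW-NB15 parquet files and must update the file paths in the `pd.read_parquet` calls accordingly.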

In [1]:
# Mount Google Drive at /content/drive so the parquet files read further down
# (under /content/drive/MyDrive/archive/) are reachable.
# NOTE(review): `google.colab` is presumably only importable inside Google Colab;
# when running elsewhere, skip this cell and point the read_parquet paths at a local copy.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from matplotlib.colors import LinearSegmentedColormap
from sklearn.feature_selection import mutual_info_classif

# Silence every Python warning for the rest of the session. Be aware that this
# also hides deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
# Plot theme: seaborn's "whitegrid" style recoloured with a light-violet background.
# The style is passed to sns.set() directly because sns.set() re-applies a complete
# theme (its default style is "darkgrid"); a separate sns.set_style() call made
# *before* sns.set() is overwritten by it and therefore has no effect.
sns.set(
    style="whitegrid",
    rc={
        "axes.facecolor": "#f2e5ff",    # light violet background for the axes
        "figure.facecolor": "#f2e5ff",  # same light violet for the whole figure
        "grid.color": "#e0c3ff",        # soft violet grid lines
    },
)

# "poster" context with fonts scaled to 70%. Kept after sns.set() because
# sns.set() also resets the context (to "notebook" by default).
sns.set_context("poster", font_scale=0.7)

# Custom violet colour lists for seaborn plots. `palette_cmap` contains the same
# five colours as `palette`, in a different order.
palette = ["#8a2be2", "#9370db", "#ba55d3", "#dda0dd", "#ee82ee"]
palette_cmap = ["#9370db", "#ba55d3", "#8a2be2", "#dda0dd", "#ee82ee"]

# Data Exploration

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
from matplotlib.colors import LinearSegmentedColormap
# Function to create features

class TransformationModel:
    """Mutable holder for the fitted preprocessing objects shared between datasets.

    Both attributes start out as ``None``. ``create_features`` stores the scaler
    it fits in ``scaler`` and ``perform_pca_and_visualize_3d`` stores the PCA it
    fits in ``pca``; when an attribute is already set, those functions reuse the
    stored object instead of fitting a new one.
    """

    def __init__(self):
        # Nothing has been fitted yet.
        self.scaler = self.pca = None


def create_features(data, transformation_model, features_set):
    """Build the scaled feature matrix and the matching labels for one dataset.

    Steps, in order:
      1. Add the engineered ratio/sum/product columns to a copy of ``data``.
      2. Drop ``attack_cat`` and one-hot encode ``proto``, ``service`` and ``state``.
      3. Keep only the columns whose names are in ``features_set``.
      4. Drop every row whose kept features contain NaN or +/-inf (the ratios
         produce these when a denominator is zero).
      5. Min-max scale the remaining rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. Must contain the columns used below (``sbytes``, ``dbytes``,
        ``dur``, ``spkts``, ``dpkts``, ``sloss``, ``sload``, ``dload``,
        ``attack_cat``, ``proto``, ``service``, ``state``, ``label``).
        The frame is not modified.
    transformation_model : TransformationModel
        If its ``scaler`` is ``None`` a new ``MinMaxScaler`` is fitted on these
        features and stored there; otherwise the stored scaler is applied as is.
    features_set : collection of str
        Names of the columns to keep as model features.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Scaled features and their labels. Both carry a fresh 0..n-1 index so
        that they stay aligned row by row, including under index-aligned pandas
        operations such as ``frame['label'] = labels``.
    """
    train = data.copy()

    # Feature engineering. Zero denominators yield inf/NaN; affected rows are
    # removed further down if the column is part of the selected features.
    train["Speed of Operations to Speed of Data Bytes"] = train["sbytes"] / train["dbytes"]
    train["Time for a single Process"] = train["dur"] / train["spkts"]
    train["Ratio of data flow"] = train["dbytes"] / train["sbytes"]
    train["Ratio of Packet flow"] = train["dpkts"] / train["spkts"]
    train["Total Page Errors"] = train["dur"] * train["sloss"]
    train["Network Usage"] = train["sbytes"] + train["dbytes"]
    train["Network Activity Rate"] = train["spkts"] + train["dpkts"]
    train["Page Fault Rate"] = (train["sload"] + train["dload"]) / train["sloss"]
    train["Network Latency"] = train["spkts"] - train["dpkts"]
    train["Disk Latency"] = (train["sload"] + train["dload"]) / train["sbytes"]

    # Convert categorical features into dummy variables
    train = train.drop(columns='attack_cat')
    train_numeric = pd.get_dummies(train, columns=['proto', 'service', 'state'], drop_first=True)

    # Separate features and labels (column order follows the frame, not the set)
    selected_columns = [col for col in train_numeric.columns if col in features_set]
    labels = train_numeric['label']

    # Treat infinities as missing and drop incomplete rows
    features = train_numeric[selected_columns].replace([np.inf, -np.inf], np.nan).dropna()

    # Keep only the labels of surviving rows, then renumber them: the returned
    # feature frame is rebuilt with a 0..n-1 index, so labels must match it.
    labels = labels.loc[features.index].reset_index(drop=True)

    # Min-max scale; fit only when no scaler has been stored yet. Compare with
    # None explicitly rather than relying on the truthiness of an estimator.
    if transformation_model.scaler is None:
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(features)
        transformation_model.scaler = scaler
    else:
        scaled_features = transformation_model.scaler.transform(features)

    return pd.DataFrame(scaled_features, columns=features.columns), labels


def perform_pca_and_visualize_3d(features, labels, tranformation_model, n_components=3, width=1400, height=800):
    """Project ``features`` with PCA, show an interactive 3D scatter, return features + PCs.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix with named columns (the names are used in the printed loadings).
    labels : pandas.Series
        One label per row of ``features``, in the same row order; used only to
        colour the scatter plot.
    tranformation_model : TransformationModel
        If its ``pca`` is ``None`` a new ``PCA(n_components)`` is fitted on
        ``features`` and stored there; otherwise the stored PCA is applied and
        ``n_components`` is not used for the projection.
    n_components : int, default 3
        Number of components for a newly fitted PCA. The plot uses PC1-PC3, so
        at least three components are needed.
    width, height : int
        Figure size in pixels.

    Returns
    -------
    pandas.DataFrame
        ``features`` (index reset) with the ``PC1..PCk`` columns appended.

    Side effects: prints the loadings of the first three components and calls
    ``fig.show()``.
    """
    # Apply PCA; fit only when none has been stored yet. Compare with None
    # explicitly rather than relying on the truthiness of an estimator.
    if tranformation_model.pca is None:
        pca = PCA(n_components=n_components)
        pca_result = pca.fit_transform(features)
        tranformation_model.pca = pca
    else:
        pca_result = tranformation_model.pca.transform(features)

    # Print each of the first three components as a linear formula of the features
    components = tranformation_model.pca.components_[:3]
    feature_names = features.columns
    for i, component in enumerate(components, 1):
        formula = " + ".join([f"{coef:.2f}*{name}" for coef, name in zip(component, feature_names)])
        print(f"PCA{i} = {formula}")

    # Name the columns after the number of components actually produced, so a
    # reused PCA whose width differs from `n_components` still yields a valid frame.
    pc_columns = [f'PC{i+1}' for i in range(pca_result.shape[1])]
    pca_df = pd.DataFrame(data=pca_result, columns=pc_columns)
    pca_df['label'] = labels.values  # positional: labels are taken in row order

    # Interactive 3D scatter of the first three components, coloured by label
    fig = px.scatter_3d(
        pca_df, x='PC1', y='PC2', z='PC3', color=pca_df['label'].astype(str),
        title="Interactive 3D PCA of Network Data", labels={"color": "Label"}
    )
    fig.update_traces(marker=dict(size=3))  # small markers keep dense clouds readable
    fig.update_layout(scene=dict(
                        xaxis_title='Principal Component 1',
                        yaxis_title='Principal Component 2',
                        zaxis_title='Principal Component 3'),
                      margin=dict(l=0, r=0, b=0, t=40),
                      width=width,
                      height=height,
                      title_x=0.5)  # centre the title
    fig.show()

    # Append the PC columns (without the label) to the original features
    features_with_pca = pd.concat([features.reset_index(drop=True), pca_df[pc_columns]], axis=1)

    return features_with_pca



import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

def plot_correlation_heatmap(data, label):
    """Plot a lower-triangle heatmap of squared Spearman correlations.

    The label is included as an extra ``label`` row/column so that each
    feature's association with the target is visible next to the
    feature-to-feature values.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns to correlate. The frame is NOT modified; the label is
        added to an internal copy only.
    label : array-like
        One target value per row of ``data``, in row order. Values are attached
        by position, so the index of a Series passed here is ignored.

    Raises
    ------
    ValueError
        Raised by pandas if ``label`` does not have one value per row of ``data``.

    Side effects: draws the figure and calls ``plt.show()``; returns ``None``.
    """
    # Attach the label to a copy, by position. Assigning a Series directly would
    # align on the index and silently produce NaN/shifted labels when the two
    # indexes differ, and assigning into `data` would leak the target into the
    # caller's feature frame.
    frame = data.assign(label=np.asarray(label))
    corr_matrix = frame.corr(method='spearman')**2

    # Three-stop colormap: violet -> white -> pink. The plotted values are
    # squared correlations, so they are never negative.
    colors = ["#8a2be2", "#ffffff", "#ff6db0"]
    cmap = LinearSegmentedColormap.from_list("Custom", colors, N=256)

    # Hide the upper triangle (and the diagonal); the matrix is symmetric
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    # Large canvas and DPI so the per-cell annotations stay readable
    plt.figure(figsize=(30, 24), dpi=150)
    # NOTE(review): with vmin=0 and center=0 seaborn should end up using only the
    # white-to-pink half of the colormap -- confirm this is the intended look.
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap=cmap, fmt='.2f',
                vmin=0, vmax=1, center=0, square=True, cbar_kws={'label': 'Correlation'},
                linecolor='white', linewidths=0.5)

    # Title and tick labels
    plt.title('Correlation Heatmap', fontsize=20, weight='bold', color='#4b0082')
    plt.xticks(rotation=45, ha='right', fontsize=20)
    plt.yticks(rotation=0, fontsize=20)

    plt.tight_layout()
    plt.show()

# Example Usage
# Load your data


In [None]:
# Baseline view: PCA over the original columns only (no engineered features).
train = pd.read_parquet('/content/drive/MyDrive/archive/UNSW_NB15_training-set.parquet')
# Remember the raw column names so the generated one-hot columns can be filtered out below.
all_features_set = set(train.columns )
transformation_model = TransformationModel()
# NOTE: `train` is overwritten here and no longer contains 'attack_cat'.
train = train.drop(columns='attack_cat')
train_numeric = pd.get_dummies(train, columns=['proto', 'service', 'state'], drop_first=True)

# Separate features and labels: keep only columns that were present in the raw
# file (this removes the dummy columns created above) and drop the target itself.
features = train_numeric.drop(columns=[col for col in train_numeric.columns if not col in all_features_set or col =="label"])
labels = train_numeric['label']

# Replace infinities with NaN, drop incomplete rows, and keep the matching labels
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(inplace=True)
labels = labels.loc[features.index]
# Min-max scale the features. This scaler is local to the cell; it is not
# stored on `transformation_model`.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)
all_features = pd.DataFrame(scaled_features, columns=features.columns)
all_labels = labels
# Fits a 3-component PCA (stored on transformation_model.pca), prints the
# loadings, shows the 3D scatter and returns the features with PC columns appended.
all_features_with_pca = perform_pca_and_visualize_3d(all_features, all_labels, transformation_model  ,  n_components=3)
print(all_features.columns )

In [None]:
# Reload the raw training set: the previous cell overwrote `train` with a copy
# lacking 'attack_cat', a column that create_features drops itself.
train = pd.read_parquet('/content/drive/MyDrive/archive/UNSW_NB15_training-set.parquet')
# Fresh holder, so the scaler and PCA below are fitted on this feature selection.
transformation_model = TransformationModel()
# Hand-picked subset: three original columns plus seven engineered ones.
final_features_set = set(['dload', 'dmean','response_body_len', 'Time for a single Process', 'Ratio of data flow', 'Total Page Errors', 'Network Usage', 'Network Activity Rate', 'Page Fault Rate', 'Network Latency'])

# Apply feature engineering and preprocessing (fits and stores the MinMaxScaler)
features, labels = create_features(train , transformation_model , final_features_set )

# Perform PCA and visualize in 3D (fits and stores the PCA)
features_with_pca = perform_pca_and_visualize_3d(features, labels, transformation_model  ,  n_components=3)

# Display the resulting column names (selected features followed by PC1..PC3)
features_with_pca.columns

In [None]:
# Plot the squared-Spearman correlation heatmap of the selected training
# features together with the label (`features` / `labels` come from the
# create_features call on the training set above).
plot_correlation_heatmap(features, labels)

In [61]:

# Apply feature engineering and preprocessing to the held-out test set.
# `transformation_model` already holds the scaler fitted on the training set,
# so create_features only transforms here (no refit).
# NOTE(review): the saved output of this cell lists `ct_dst_sport_ltm`, which is
# not in `final_features_set` as defined above -- the output appears to come
# from an earlier run with a different feature set; re-run to refresh it.
test = pd.read_parquet('/content/drive/MyDrive/archive/UNSW_NB15_testing-set.parquet')
features_test, labels_test = create_features(test, transformation_model, final_features_set )

# Perform PCA and visualize in 3D, reusing the PCA fitted on the training set
features_with_pca_test = perform_pca_and_visualize_3d(features_test, labels_test, transformation_model )



PCA1 = 0.40*dload + 0.89*dmean + 0.01*response_body_len + -0.02*ct_dst_sport_ltm + -0.02*Time for a single Process + 0.24*Ratio of data flow + -0.00*Total Page Errors + 0.02*Network Usage + 0.03*Network Activity Rate + 0.00*Page Fault Rate + -0.01*Network Latency
PCA2 = 0.90*dload + -0.33*dmean + -0.01*response_body_len + -0.01*ct_dst_sport_ltm + -0.03*Time for a single Process + -0.29*Ratio of data flow + -0.00*Total Page Errors + -0.03*Network Usage + -0.03*Network Activity Rate + 0.01*Page Fault Rate + 0.01*Network Latency
PCA3 = 0.18*dload + -0.31*dmean + 0.04*response_body_len + 0.20*ct_dst_sport_ltm + 0.17*Time for a single Process + 0.88*Ratio of data flow + 0.03*Total Page Errors + 0.13*Network Usage + 0.12*Network Activity Rate + 0.02*Page Fault Rate + -0.04*Network Latency


In [None]:
# Combined view: stack the raw train and test frames and fit a separate scaler
# and PCA on the union (stored on its own TransformationModel, so the
# train-fitted transforms above are left untouched).
merged = pd.concat([train , test], axis=0).reset_index(drop=True)
merged_transformation_model = TransformationModel()
# NOTE: `merged` is rebound here from the stacked DataFrame to the
# (features, labels) tuple returned by create_features.
merged = create_features(merged , merged_transformation_model, final_features_set )
merged_features = merged[0]
merged_labels = merged[1]

merged_features_with_pca = perform_pca_and_visualize_3d(merged_features, merged_labels, merged_transformation_model  ,  n_components=3)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Baseline XGBoost classifier on the training-set features plus PCA components.
# `features_with_pca` and `labels` come from the training-set cells above.
# NOTE(review): the scaler and PCA were fitted on the full training set before
# this split, so the dev rows influenced those transforms.

# Split data into training and dev sets (90% / 10%, fixed seed)
X_train, X_dev, y_train, y_dev = train_test_split(features_with_pca, labels, test_size=0.1, random_state=42)

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # Use 'binary:logistic' for binary classification, or 'multi:softprob' for multiclass
    eval_metric='logloss',        # Evaluation metric; 'logloss' for binary classification
    use_label_encoder=False       # Set this to False to avoid label encoding warnings in XGBoost
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions (hard class labels)
y_pred = xgb_model.predict(X_dev)

# Evaluate the model
accuracy = accuracy_score(y_dev, y_pred)
f1 = f1_score(y_dev, y_pred, average='weighted')  # Weighted average for multiclass or binary classification
report = classification_report(y_dev, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Classification Report:")
print(report)

# Confusion Matrix (rows: true labels, columns: predicted labels)
conf_matrix = confusion_matrix(y_dev, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=True, yticklabels=True)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# Summary report (repeats the headline metrics in one place)
print("\nPerformance Summary:")
print(f"- Accuracy: {accuracy:.2f}")
print(f"- F1 Score: {f1:.2f}")
print(f"- Confusion Matrix:\n{pd.DataFrame(conf_matrix)}")

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Recall-oriented variant of the XGBoost model: the positive class is
# re-weighted by the negative/positive ratio and the decision threshold is
# lowered from the default 0.5 to 0.3.
# `features_with_pca` and `labels` come from the training-set cells above.

# Same 90% / 10% split and seed as the baseline cell
X_train, X_dev, y_train, y_dev = train_test_split(features_with_pca, labels, test_size=0.1, random_state=42)

# Class counts via the vectorised Series.sum() (the built-in sum() would walk
# the Series element by element in Python).
num_positive = (y_train == 1).sum()
num_negative = (y_train == 0).sum()
scale_pos_weight = num_negative / num_positive

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False
)

xgb_model.fit(X_train, y_train)

# Probability of class 1 for each dev row
y_pred_proba = xgb_model.predict_proba(X_dev)[:, 1]

# Predict class 1 whenever its probability exceeds the threshold
threshold = 0.3
y_pred = (y_pred_proba > threshold).astype(int)

accuracy = accuracy_score(y_dev, y_pred)
f1 = f1_score(y_dev, y_pred, average='weighted')
recall = recall_score(y_dev, y_pred, pos_label=1)
report = classification_report(y_dev, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Recall (Class 1): {recall:.2f}")
print("Classification Report:")
print(report)

# Confusion matrix (rows: true labels, columns: predicted labels)
conf_matrix = confusion_matrix(y_dev, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=True, yticklabels=True)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

print("\nPerformance Summary:")
print(f"- Accuracy: {accuracy:.2f}")
print(f"- F1 Score: {f1:.2f}")
print(f"- Recall (Class 1): {recall:.2f}")
print(f"- Confusion Matrix:\n{pd.DataFrame(conf_matrix)}")
