In [44]:
# Plot Graph
import matplotlib.pyplot as plt
import pandas as pd

def plot_accuracy(results):
    # Create a DataFrame from the results
    df = pd.DataFrame(results)

    # Set up the plot
    plt.figure(figsize=(12, 6))

    # Loop through unique models and plot their accuracy
    for model in df['model'].unique():
        subset = df[df['model'] == model]
        plt.plot(subset['dataset'], subset['accuracy'], marker='o', label=model)

        # Annotate each point with its accuracy value
        for i, row in subset.iterrows():
            plt.text(row['dataset'], row['accuracy'] + 1, f"{row['accuracy']:.2f}%", 
                     ha='center', va='bottom',fontsize=12, color='green')  # Position the text above the point

            # Show training and testing time as a separate text annotation
            time_text = f"Train: {row['train_time']:.4f}s\nTest: {row['test_time']:.4f}s"
            plt.text(row['dataset'], row['accuracy'] - 5, time_text, 
                     ha='center', va='bottom', fontsize=8, color='red')  # Position the text below the point

    # Adding titles and labels
    plt.title('Accuracy and Time across Different Datasets\n\n')
    plt.xlabel('Datasets')
    plt.ylabel('Accuracy (%)')
    plt.xticks(rotation=45)
    plt.ylim(0, 110)  # Set y-axis limits to 0 - 100
    plt.legend(title='Model')
    plt.grid()

    # Show the plot
    plt.tight_layout()
    plt.show()

In [45]:
# DATASETS
datasets = [
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-5mins.csv',
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-15mins.csv',
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-30mins.csv',
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-1hour.csv',
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-2hours.csv',
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-4hours.csv',
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-8hours.csv'
]

In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Function to load and preprocess dataset
def load_and_preprocess_data(dataset):
    # Load the dataset
    data = pd.read_csv(dataset)
    
    # Extract all columns that start with 'Fs'
    fs_columns = [col for col in data.columns if col.startswith('Fs')]
    
    # Create a set to collect unique signatures
    signature_set = set()
    
    # Process each 'Fs' column to extract signatures and their frequencies
    for col in fs_columns:
        data[col] = data[col].apply(lambda cell: extract_signature_and_frequency(cell))
        signature_set.update([sig for sig, _ in data[col] if sig])  # Collect unique signatures
    
    # Sort the unique signatures to create a consistent order for columns
    signature_list = sorted(list(signature_set))
    
    # Initialize the feature matrix with zeros
    X_matrix = np.zeros((data.shape[0], len(signature_list)), dtype=int)
    
    # Populate the feature matrix with frequencies
    for row_idx in range(data.shape[0]):
        for col in fs_columns:
            signature, frequency = data.at[row_idx, col]
            if signature in signature_list:
                sig_idx = signature_list.index(signature)
                X_matrix[row_idx, sig_idx] += frequency
    
    # Extract labels
    y = data['Label']
    
    # Encode labels into unique integers
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Debugging: Print unique values of y_encoded
    print("Encoded labels:", np.unique(y_encoded))

    # Adjust y_encoded if the minimum label is greater than 0
    if y_encoded.min() > 0:
        y_encoded -= 1  # Shift labels to start from 0

    # Verify the adjusted labels
    print("Adjusted encoded labels:", np.unique(y_encoded))
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_matrix, y_encoded, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test, signature_list, label_encoder


# Function to extract signature and frequency
def extract_signature_and_frequency(cell):
    if isinstance(cell, str):
        try:
            # Split by comma and remove brackets to extract signature and frequency
            signature, frequency = cell.replace('[', '').replace(']', '').split(',')
            return signature.strip(), int(frequency.strip())
        except ValueError:
            return None, 0  # Return None and 0 frequency on error
    return None, 0

In [None]:
# Training and Test Model
import time
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

results = []

# Iterate over the datasets
for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # Load and preprocess the dataset
    X_train, X_test, y_train, y_test, signature_list, label_encoder = load_and_preprocess_data(dataset)

    # Scale the features (optional for XGBoost but may help with performance)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # XGBoost Model
    xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

    # Train the XGBoost model
    start_train_time = time.time()
    xgb_model.fit(X_train_scaled, y_train)
    end_train_time = time.time()

    # Test the XGBoost model
    start_test_time = time.time()
    y_pred = xgb_model.predict(X_test_scaled)
    end_test_time = time.time()

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% \nTrain Time: {end_train_time - start_train_time:.4f}s \nTest Time: {end_test_time - start_test_time:.4f}s")

    # Store results for plotting
    results.append({
        'dataset': dataset.split('/')[-1],  # Get just the dataset name
        'model': 'XGBoost',
        'accuracy': accuracy * 100,
        'train_time': end_train_time - start_train_time,
        'test_time': end_test_time - start_test_time
    })

# Optionally, plot results using the previously defined plot_accuracy function
plot_accuracy(results)

In [None]:
# # Print the results for validation
# print("X_train matrix:")
# print(X_train)
# print("\nX_test matrix:")
# print(X_test)
# print("\ny_train labels (encoded):")
# print(y_train)
# print("\ny_test labels (encoded):")
# print(y_test)
# print("\nUnique signatures (features):")
# print(signature_list)
# print("\nLabel classes (mapping):")
# print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Function to extract signature and frequency
def extract_signature_and_frequency(cell):
    if isinstance(cell, str):
        try:
            signature, frequency = cell.replace('[', '').replace(']', '').split(',')
            return signature.strip(), int(frequency.strip())
        except ValueError:
            return None, 0  # Return None and 0 frequency on error
    return None, 0

# Function to load and preprocess dataset
def load_and_preprocess_data(dataset):
    # Load the dataset
    data = pd.read_csv(dataset)
    
    # Print unique labels to verify
    print("Unique labels in dataset:", data['Label'].unique())
    
    # Extract all columns that start with 'Fs'
    fs_columns = [col for col in data.columns if col.startswith('Fs')]
    
    # Create a set to collect unique signatures
    signature_set = set()
    
    # Process each 'Fs' column to extract signatures and their frequencies
    for col in fs_columns:
        data[col] = data[col].apply(lambda cell: extract_signature_and_frequency(cell))
        signature_set.update([sig for sig, _ in data[col] if sig])  # Collect unique signatures
    
    # Sort the unique signatures to create a consistent order for columns
    signature_list = sorted(list(signature_set))
    
    # Initialize the feature matrix with zeros
    X_matrix = np.zeros((data.shape[0], len(signature_list)), dtype=int)
    
    # Populate the feature matrix with frequencies
    for row_idx in range(data.shape[0]):
        for col in fs_columns:
            signature, frequency = data.at[row_idx, col]
            if signature in signature_list:
                sig_idx = signature_list.index(signature)
                X_matrix[row_idx, sig_idx] += frequency
    
    # Extract labels
    y = data['Label']
    
    # Create a consistent mapping for labels
    all_labels = ['Attack', 'Benign', 'C&C-HeartBeat', 'DDoS', 'Okiru', 'PortScan', 'PortScan-Attack']
    label_mapping = {label: idx for idx, label in enumerate(all_labels)}
    
    # Map labels to integers, using only the labels present in the dataset
    y_encoded = np.array([label_mapping[label] for label in y if label in label_mapping])

    # Print debugging information
    print("Label mapping:", label_mapping)
    print("Encoded labels:", np.unique(y_encoded))

    # Check dimensions
    print("X_matrix shape:", X_matrix.shape)
    print("y_encoded shape:", y_encoded.shape)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_matrix, y_encoded, test_size=0.2, random_state=42)

    # Ensure y_train and y_test are consistent with label mapping
    if len(set(y_encoded)) < len(all_labels):
        y_train = np.array([label_mapping[label] for label in y_train if label in label_mapping])
        y_test = np.array([label_mapping[label] for label in y_test if label in label_mapping])
    
    return X_train, X_test, y_train, y_test, signature_list

# Sample datasets list for testing
datasets = [
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-5mins.csv',
    'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-15mins.csv'
]

# Training and Test Model
results = []

# Iterate over the datasets
for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # Load and preprocess the dataset
    X_train, X_test, y_train, y_test, signature_list = load_and_preprocess_data(dataset)

    # Scale the features (optional for XGBoost but may help with performance)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # XGBoost Model
    xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)

    # Train the XGBoost model
    start_train_time = time.time()
    xgb_model.fit(X_train_scaled, y_train)
    end_train_time = time.time()

    # Test the XGBoost model
    start_test_time = time.time()
    y_pred = xgb_model.predict(X_test_scaled)
    end_test_time = time.time()

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% \nTrain Time: {end_train_time - start_train_time:.4f}s \nTest Time: {end_test_time - start_test_time:.4f}s")

    # Store results for plotting
    results.append({
        'dataset': dataset.split('/')[-1],  # Get just the dataset name
        'model': 'XGBoost',
        'accuracy': accuracy * 100,
        'train_time': end_train_time - start_train_time,
        'test_time': end_test_time - start_test_time
    })

# Optionally, plot results using the previously defined plot_accuracy function
# plot_accuracy(results)  # Uncomment to plot results



Dataset: C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-5mins.csv
Unique labels in dataset: ['Benign' 'C&C-HeartBeat' 'DDoS' 'Okiru' 'PortScan' 'Attack'
 'PortScan-Attack']
Label mapping: {'Attack': 0, 'Benign': 1, 'C&C-HeartBeat': 2, 'DDoS': 3, 'Okiru': 4, 'PortScan': 5, 'PortScan-Attack': 6}
Encoded labels: [0 1 2 3 4 5 6]
X_matrix shape: (1447, 127)
y_encoded shape: (1447,)
Accuracy: 100.00% 
Train Time: 0.1074s 
Test Time: 0.0020s

Dataset: C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-15mins.csv
Unique labels in dataset: ['Benign' 'C&C-HeartBeat' 'DDoS' 'Okiru' 'PortScan' 'Attack'
 'PortScan-Attack']
Label mapping: {'Attack': 0, 'Benign': 1, 'C&C-HeartBeat': 2, 'DDoS': 3, 'Okiru': 4, 'PortScan': 5, 'PortScan-Attack': 6}
Encoded labels: [0 1 2 3 4 5 6]
X_matrix shape: (487, 127)
y_encoded shape: (487,)


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5], got [1 2 3 4 5 6]