Loading Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Function to extract the numeric value from the string
def extract_numeric_value(cell):
    """
    Pulls the integer stored in the second comma-separated field of a string cell.

    Parameters:
    cell: Raw cell value. Strings are split on ',' and the second field is
        parsed after removing ']' characters and surrounding whitespace; any
        non-string value is passed through untouched.

    Returns:
    The parsed int for a well-formed string, np.nan when the string has no
    second field or that field is not an integer, or `cell` itself when it
    is not a string.
    """
    # Non-string cells are returned exactly as received.
    if not isinstance(cell, str):
        return cell

    fields = cell.split(',')
    try:
        second_field = fields[1]
        return int(second_field.replace(']', '').strip())
    except (IndexError, ValueError):
        # No second field, or it is not integer text: mark the cell as missing.
        return np.nan

# Function to load and preprocess dataset
def load_and_preprocess_data(dataset):
    """
    Reads one CSV file, cleans it and returns a train/test split.

    Parameters:
    dataset (str): Path of the CSV file handed to pd.read_csv. The file must
        contain a 'Label' column.

    Returns:
    list: [X_train, X_test, y_train, y_test] as produced by train_test_split
    with test_size=0.2 and random_state=42.
    """
    frame = pd.read_csv(dataset)

    # Run every column whose name begins with 'Fs' through the cell parser.
    fs_columns = list(filter(lambda name: name.startswith('Fs'), frame.columns))
    for name in fs_columns:
        frame[name] = frame[name].apply(extract_numeric_value)

    # Remaining gaps (including cells the parser could not read) become 0.
    frame = frame.fillna(0)

    # Target is the 'Label' column; every other column is a feature.
    y = frame['Label']
    X = frame.drop(columns=['Label'])

    return train_test_split(X, y, test_size=0.2, random_state=42)

# List of datasets
# Folder holding every signature CSV; the file names differ only by time window.
_SIGNATURES_DIR = 'C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures'
_TIME_WINDOWS = ('15mins', '30mins', '1hour', '2hours', '4hours', '8hours')

# One path per time window, ordered from the shortest window to the longest.
datasets = [f'{_SIGNATURES_DIR}/signatures-{window}.csv' for window in _TIME_WINDOWS]

Plot Graph Function

In [22]:
import matplotlib.pyplot as plt

def plot_accuracy(results):
    """
    Draws one accuracy line per model over the datasets it was evaluated on.

    Parameters:
    results (list of dict): Records with the keys 'dataset', 'model' and
        'accuracy'. They are loaded into a DataFrame and plotted in row order.
    """
    scores = pd.DataFrame(results)

    plt.figure(figsize=(12, 6))

    # One line per model, in order of first appearance in the records.
    model_names = scores['model'].unique()
    for model_name in model_names:
        rows = scores.loc[scores['model'] == model_name]
        plt.plot(rows['dataset'], rows['accuracy'], marker='o', label=model_name)

    # Axis cosmetics: fixed 0-100 range so plots from different cells are comparable.
    plt.ylim(0, 100)
    plt.xticks(rotation=45)
    plt.grid()

    # Titles, axis labels and legend.
    plt.title('Model Accuracy across Different Datasets')
    plt.xlabel('Datasets')
    plt.ylabel('Accuracy (%)')
    plt.legend(title='Models')

    # Render the figure.
    plt.tight_layout()
    plt.show()



Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import time

# Accuracy records collected in this cell (one dict per dataset) for plot_accuracy.
results = []

print("Decision Tree Model")

for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # Load the split BEFORE printing it. Printing first raised a NameError on a
    # fresh kernel and otherwise showed the split of the previous dataset.
    X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset)

    print("\nX_train:")
    print(X_train)
    print("\ny_train:")
    print(y_train)
    print("\nX_test:")
    print(X_test)
    print("\ny_test:")
    print(y_test)

    # Decision Tree Model
    clf = DecisionTreeClassifier()

    # Time the fit on the training split.
    start_train_time = time.time()
    clf.fit(X_train, y_train)
    end_train_time = time.time()

    # Time the prediction on the held-out split.
    start_test_time = time.time()
    y_pred = clf.predict(X_test)
    end_test_time = time.time()

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% \nTrain Time: {end_train_time - start_train_time:.4f}s \nTest Time: {end_test_time - start_test_time:.4f}s")

    # Store results for plotting
    results.append({
        'dataset': dataset.split('/')[-1],  # Get just the dataset name
        'model': 'Decision Tree',
        'accuracy': accuracy * 100
    })

plot_accuracy(results)

Decision Tree - model processing

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import accuracy_score
import time
import pandas as pd
import numpy as np

# Start this cell with an empty accuracy log; the loop below appends one record
# per dataset and plot_accuracy(results) at the end of the cell plots them.
results=[]

# Function to visualize the decision tree rules
def visualize_tree(clf, feature_names):
    """
    Prints the rules of a fitted decision tree as text.

    Parameters:
    clf: Fitted tree estimator handed to export_text.
    feature_names (list of str): Feature names to use in the printed rules.
    """
    # Render the rules first so nothing is printed if rendering fails.
    rules_text = export_text(clf, feature_names=feature_names)
    print("Decision Tree Rules:\n", rules_text, sep="\n")

# Function to display a few predictions
def show_predictions(clf, X_test, y_test):
    """
    Prints up to the first five test rows with the predicted and the actual label.

    Parameters:
    clf: Fitted estimator exposing predict().
    X_test (pd.DataFrame): Test features.
    y_test (pd.Series): True labels, aligned positionally with X_test.
    """
    print("Sample Predictions (First 5):")
    # Cap at the number of available rows so a test set with fewer than five
    # rows does not raise IndexError.
    sample_count = min(5, len(X_test))
    for i in range(sample_count):
        # Keep the row as a one-row DataFrame: column names and dtypes are
        # preserved, matching the DataFrame the classifier was fitted on.
        X_sample = X_test.iloc[[i]]
        y_true = y_test.iloc[i]  # Actual label
        y_pred = clf.predict(X_sample)[0]  # Predicted label
        print(f"Input: {X_test.iloc[i].tolist()} | Predicted: {y_pred} | Actual: {y_true}")

print("Decision Tree Model")

for dataset in datasets:
    print(f"\n\nDataset: {dataset}")

    # Train/test split for the current signature file.
    X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset)

    clf = DecisionTreeClassifier()

    # Wall-clock duration of fitting.
    train_started = time.time()
    clf.fit(X_train, y_train)
    train_seconds = time.time() - train_started

    # Wall-clock duration of predicting the held-out rows.
    test_started = time.time()
    y_pred = clf.predict(X_test)
    test_seconds = time.time() - test_started

    # Share of correctly predicted test rows, kept both as fraction and percent.
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_pct = accuracy * 100
    print(f"Accuracy: {accuracy_pct:.2f}% \nTrain Time: {train_seconds:.4f}s \nTest Time: {test_seconds:.4f}s")

    # Inspect the fitted model: printed rules, then a few example predictions.
    visualize_tree(clf, feature_names=list(X_train.columns))
    show_predictions(clf, X_test, y_test)

    # Record for the plot; only the file name (last path component) is kept.
    results.append(dict(
        dataset=dataset.rsplit('/', 1)[-1],
        model='Decision Tree',
        accuracy=accuracy_pct,
    ))

plot_accuracy(results)


Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fresh accuracy log for this cell; plot_accuracy reads it after the loop.
results = []

print("Naive Bayes Model")

for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # Train/test split for the current signature file.
    X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset)

    nb_model = GaussianNB()

    # Wall-clock duration of fitting.
    train_started = time.time()
    nb_model.fit(X_train, y_train)
    train_seconds = time.time() - train_started

    # Wall-clock duration of predicting the held-out rows.
    test_started = time.time()
    y_pred = nb_model.predict(X_test)
    test_seconds = time.time() - test_started

    accuracy = accuracy_score(y_test, y_pred)
    accuracy_pct = accuracy * 100
    print(f"Accuracy: {accuracy_pct:.2f}% \nTrain Time: {train_seconds:.4f}s \nTest Time: {test_seconds:.4f}s")

    # Record for the plot; only the file name (last path component) is kept.
    results.append(dict(
        dataset=dataset.rsplit('/', 1)[-1],
        model='Naive Bayes',
        accuracy=accuracy_pct,
    ))

plot_accuracy(results)


Naive Bayes - model processing

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import time
import pandas as pd
import numpy as np

# Start this cell with an empty accuracy log; the loop below appends one record
# per dataset and plot_accuracy(results) at the end of the cell plots them.
results = []

# Function to display sample predictions
def show_predictions(nb_model, X_test, y_test):
    """
    Prints up to the first five test rows with the predicted and the actual label.

    Parameters:
    nb_model: Fitted estimator exposing predict().
    X_test (pd.DataFrame): Test features.
    y_test (pd.Series): True labels, aligned positionally with X_test.
    """
    print("\nSample Predictions (First 5):")
    # Cap at the number of available rows so a test set with fewer than five
    # rows does not raise IndexError.
    sample_count = min(5, len(X_test))
    for i in range(sample_count):
        # Keep the row as a one-row DataFrame: column names and dtypes are
        # preserved, matching the DataFrame the model was fitted on.
        X_sample = X_test.iloc[[i]]
        y_true = y_test.iloc[i]  # Actual label
        y_pred = nb_model.predict(X_sample)[0]  # Predicted label
        print(f"Input: {X_test.iloc[i].tolist()} | Predicted: {y_pred} | Actual: {y_true}")

# Function to display feature probabilities
def show_feature_probabilities(nb_model, feature_names):
    """
    Prints, for every class, the fitted mean and variance of each named feature.

    Parameters:
    nb_model: Fitted model exposing classes_, theta_ and var_; the latter two
        are indexed as [class position][feature position].
    feature_names (list of str): Feature names in the column order used for fitting.

    Raises:
    ValueError: If the number of names differs from the number of fitted features.
    """
    print("\nNaive Bayes Feature Probabilities (Mean and Variance for Each Feature Per Class):")
    for idx, class_label in enumerate(nb_model.classes_):
        class_means = nb_model.theta_[idx]
        class_variances = nb_model.var_[idx]

        # zip() would silently truncate on a mismatch and mislabel the values.
        if len(feature_names) != len(class_means):
            raise ValueError(
                f"Got {len(feature_names)} feature names for {len(class_means)} fitted features"
            )

        print(f"\nClass: {class_label}")
        # One line per feature: the name was previously accepted but never shown,
        # leaving long unlabeled arrays that numpy abbreviates when printing.
        for name, mean, variance in zip(feature_names, class_means, class_variances):
            print(f"  {name}: mean={mean:.6g}, variance={variance:.6g}")

print("Naive Bayes Model")

for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # Train/test split for the current signature file.
    X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset)

    nb_model = GaussianNB()

    # Wall-clock duration of fitting.
    train_started = time.time()
    nb_model.fit(X_train, y_train)
    train_seconds = time.time() - train_started

    # Wall-clock duration of predicting the held-out rows.
    test_started = time.time()
    y_pred = nb_model.predict(X_test)
    test_seconds = time.time() - test_started

    # Share of correctly predicted test rows, kept both as fraction and percent.
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_pct = accuracy * 100
    print(f"Accuracy: {accuracy_pct:.2f}% \nTrain Time: {train_seconds:.4f}s \nTest Time: {test_seconds:.4f}s")

    # Inspect the fitted model: per-class statistics, then a few example predictions.
    show_feature_probabilities(nb_model, feature_names=list(X_train.columns))
    show_predictions(nb_model, X_test, y_test)

    # Record for the plot; only the file name (last path component) is kept.
    results.append(dict(
        dataset=dataset.rsplit('/', 1)[-1],
        model='Naive Bayes',
        accuracy=accuracy_pct,
    ))

plot_accuracy(results)


SVM 

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Fresh accuracy log for this cell; plot_accuracy reads it after the loop.
results = []

print("SVM Model")

for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # Train/test split for the current signature file.
    X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset)

    # Standardize using statistics learned from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    svm_model = SVC()

    # Wall-clock duration of fitting.
    train_started = time.time()
    svm_model.fit(X_train_scaled, y_train)
    train_seconds = time.time() - train_started

    # Wall-clock duration of predicting the held-out rows.
    test_started = time.time()
    y_pred = svm_model.predict(X_test_scaled)
    test_seconds = time.time() - test_started

    accuracy = accuracy_score(y_test, y_pred)
    accuracy_pct = accuracy * 100
    print(f"Accuracy: {accuracy_pct:.2f}% \nTrain Time: {train_seconds:.4f}s \nTest Time: {test_seconds:.4f}s")

    # Record for the plot; only the file name (last path component) is kept.
    results.append(dict(
        dataset=dataset.rsplit('/', 1)[-1],
        model='SVM',
        accuracy=accuracy_pct,
    ))

plot_accuracy(results)


SVM - model processing

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import time

# Start this cell with an empty accuracy log; the loop below appends one record
# per dataset and plot_accuracy(results) at the end of the cell plots them.
results = []

# Function to display sample predictions
def show_predictions(svm_model, X_test, y_test):
    """
    Prints up to the first five test rows with the predicted and the actual label.

    Each row is passed to predict() unchanged as a (1, n_features) array, so
    X_test must already be preprocessed the same way as the data the model
    was fitted on.

    Parameters:
    svm_model: Fitted estimator exposing predict().
    X_test (pd.DataFrame): Test features.
    y_test (pd.Series): True labels, aligned positionally with X_test.
    """
    print("\nSample Predictions (First 5):")
    # Cap at the number of available rows so a test set with fewer than five
    # rows does not raise IndexError.
    sample_count = min(5, len(X_test))
    for i in range(sample_count):
        X_sample = X_test.iloc[i, :].values.reshape(1, -1)  # i-th row as a 2-D array
        y_true = y_test.iloc[i]  # Actual label
        y_pred = svm_model.predict(X_sample)[0]  # Predicted label
        print(f"Input: {X_test.iloc[i].tolist()} | Predicted: {y_pred} | Actual: {y_true}")

# Function to display support vectors
def show_support_vectors(svm_model, X_train):
    """
    Prints how many support vectors the model has and the first five of them.

    Parameters:
    svm_model: Fitted model exposing support_ (indices into the training rows).
    X_train: Training data, indexed with support_ to pick the support vectors.
    """
    support_indices = svm_model.support_
    print(f"\nNumber of Support Vectors: {len(support_indices)}")
    print("Support Vectors (First 5):")
    # Select all support rows, then keep the leading five.
    leading_vectors = X_train[support_indices][:5]
    print(leading_vectors)

print("SVM Model")

for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # Load dataset
    X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset)

    # Scale features using statistics learned from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # SVM Model
    svm_model = SVC()

    # Train the model
    start_train_time = time.time()
    svm_model.fit(X_train_scaled, y_train)
    end_train_time = time.time()

    # Test the model
    start_test_time = time.time()
    y_pred = svm_model.predict(X_test_scaled)
    end_test_time = time.time()

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% \nTrain Time: {end_train_time - start_train_time:.4f}s \nTest Time: {end_test_time - start_test_time:.4f}s")

    # Show support vectors
    show_support_vectors(svm_model, X_train_scaled)

    # Show sample predictions. The model was fitted on scaled features, so the
    # sample rows must be scaled too; passing the raw X_test made the displayed
    # predictions disagree with the accuracy computed above. The scaled array is
    # wrapped in a DataFrame because show_predictions indexes rows with .iloc.
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
    show_predictions(svm_model, X_test_scaled_df, y_test)

    # Store results for plotting
    results.append({
        'dataset': dataset.split('/')[-1],  # Get just the dataset name
        'model': 'SVM',
        'accuracy': accuracy * 100
    })

# Plot accuracy results
plot_accuracy(results)


CNN

In [31]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Conv1D, MaxPooling1D
from keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import time

# Accuracy records collected in this cell (one dict per dataset) for plot_accuracy.
results = []

print("CNN Model")

for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # Load dataset
    X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset)

    # Normalize the data using statistics learned from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Reshape data for CNN: (samples, features, 1 channel)
    X_train_reshaped = X_train_scaled.reshape(-1, X_train_scaled.shape[1], 1)
    X_test_reshaped = X_test_scaled.reshape(-1, X_test_scaled.shape[1], 1)

    # One-hot encode the training labels. Column k of the dummy frame is the
    # class behind softmax output k, so the column labels are kept to translate
    # predictions back into real labels.
    y_train_dummies = pd.get_dummies(y_train)
    class_labels = y_train_dummies.columns.to_numpy()
    # Explicit float targets: recent pandas versions return boolean dummies.
    y_train_categorical = y_train_dummies.to_numpy(dtype='float32')

    # CNN Model
    cnn_model = Sequential()
    cnn_model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_reshaped.shape[1], 1)))
    cnn_model.add(MaxPooling1D(pool_size=2))
    cnn_model.add(Flatten())
    cnn_model.add(Dense(256, activation='relu'))
    cnn_model.add(Dropout(0.5))  # Add dropout layer for regularization
    cnn_model.add(Dense(len(class_labels), activation='softmax'))

    cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    start_train_time = time.time()
    cnn_model.fit(X_train_reshaped, y_train_categorical, epochs=50, batch_size=64, verbose=0)
    end_train_time = time.time()

    start_test_time = time.time()
    y_pred_cnn = cnn_model.predict(X_test_reshaped)
    y_pred_cnn_classes = np.argmax(y_pred_cnn, axis=1)
    end_test_time = time.time()

    # argmax yields column positions, not labels. Map them back before scoring;
    # comparing positions with raw labels is only right when labels are 0..K-1.
    y_pred_cnn_labels = class_labels[y_pred_cnn_classes]

    accuracy = accuracy_score(y_test, y_pred_cnn_labels)
    print(f"Accuracy: {accuracy * 100:.2f}% \nTrain Time: {end_train_time - start_train_time:.4f}s \nTest Time: {end_test_time - start_test_time:.4f}s")

    # Store results for plotting
    results.append({
        'dataset': dataset.split('/')[-1],  # Get just the dataset name
        'model': 'CNN',
        'accuracy': accuracy * 100
    })

# Call the plotting function
plot_accuracy(results)


OPTIMIZED CNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils import class_weight

# Load dataset: only the 1-hour signature file is used in this cell. The rest of
# the cell reads a 'Label' column from it and parses the other columns.
df = pd.read_csv('C:/Users/Natty PC/Documents/Party/Project II/PreData/Signatures/signatures-1hour.csv')

# Feature extraction function
def parse_feature(feature):
    """
    Splits one raw cell into three numeric features.

    Parameters:
    feature: Raw cell, expected to be a string of two fields separated by
        ", " and optionally wrapped in square brackets (e.g. "[tcp80, 5]").
        Missing values are accepted.

    Returns:
    list: [port, protocol_len, value] where port is the digits of the first
    field read as an int (0 if it has none), protocol_len is the number of
    alphabetic characters in the first field, and value is the second field
    as a float. [0, 0, 0] for missing or malformed cells.
    """
    if pd.isna(feature):
        return [0, 0, 0]  # Handle missing values
    try:
        protocol_port, value = feature.strip("[]").split(", ")
        port = ''.join(c for c in protocol_port if c.isdigit())
        protocol = ''.join(c for c in protocol_port if c.isalpha())
        return [int(port) if port.isdigit() else 0, len(protocol), float(value)]
    except (AttributeError, TypeError, ValueError):
        # Only parsing problems fall back to the default: non-string cells
        # (AttributeError/TypeError) and wrong field counts or non-numeric text
        # (ValueError). A bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit, making a long parse impossible to interrupt.
        return [0, 0, 0]

# Apply feature parsing to every column except the label. Selecting by name
# (instead of df.columns[1:]) keeps this correct when 'Label' is not the first
# column: the positional slice would then parse the label itself and drop a
# real feature.
feature_columns = [col for col in df.columns if col != 'Label']
parsed_cols = []
for col in feature_columns:
    # Build the three-column frame in one step from the list of triples;
    # `.apply(pd.Series)` creates one Series per row and is far slower.
    parsed_df = pd.DataFrame(
        df[col].apply(parse_feature).tolist(),
        index=df.index,
        columns=[f'{col}_port', f'{col}_protocol_len', f'{col}_value'],
    )
    parsed_cols.append(parsed_df)

# Concatenate parsed columns with the label
df_parsed = pd.concat([df['Label']] + parsed_cols, axis=1)

# Label encoding: map the raw labels to integers 0..num_classes-1
label_encoder = LabelEncoder()
df_parsed['Label'] = label_encoder.fit_transform(df_parsed['Label'])

# One-hot encode labels
num_classes = len(np.unique(df_parsed['Label']))
print(f'Number of unique classes: {num_classes}')
y = to_categorical(df_parsed['Label'], num_classes=num_classes)

# Split dataset into features (X) and labels (y)
X = df_parsed.drop(columns='Label').values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features to [0, 1]; the scaler is fitted on the training split only.
# X is already 2-D (samples, features), so no reshaping is needed around it.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape input data for CNN (samples, timesteps, features): every original
# column contributed three consecutive values, so each timestep is one column's
# (port, protocol_len, value) triple.
if X_train.shape[1] % 3 != 0:
    raise ValueError("Number of features in the dataset is not divisible by 3 for CNN input")
timesteps = X_train.shape[1] // 3

X_train = X_train.reshape(X_train.shape[0], timesteps, 3)
X_test = X_test.reshape(X_test.shape[0], timesteps, 3)

# Two Conv1D -> BatchNorm -> MaxPool stages followed by a regularized dense head.
model = Sequential([
    # Stage 1: 64 filters over windows of 3 timesteps.
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=X_train.shape[1:]),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    # Stage 2: 128 filters over windows of 3 timesteps.
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    # Classification head: L2-regularized dense layer with dropout.
    Flatten(),
    Dense(128, activation='relu', kernel_regularizer='l2'),
    Dropout(0.5),

    # One softmax output per class.
    Dense(num_classes, activation='softmax'),
])

# Compile with a small, fixed initial learning rate.
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Plotting libraries, imported before their first use in this cell.
import seaborn as sns
import matplotlib.pyplot as plt

# Balanced class weights computed from the training labels. The one-hot rows
# are turned back into integer class ids first.
train_labels = np.argmax(y_train, axis=1)
present_classes = np.unique(train_labels)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=present_classes, y=train_labels
)
# Start every class at weight 1.0 so the dict has one key per output column
# even when a class is missing from the training split.
class_weights = {class_id: 1.0 for class_id in range(y_train.shape[1])}
class_weights.update(
    {int(class_id): float(weight) for class_id, weight in zip(present_classes, balanced_weights)}
)

# Callbacks that were referenced below but never defined. The patience/factor
# values are starting points to tune, not measured optima.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Check validation performance during training.
# NOTE(review): the test split doubles as validation data here, so early
# stopping and LR scheduling see the test set; consider a separate validation split.
history = model.fit(X_train, y_train, epochs=50, batch_size=128,
                    validation_data=(X_test, y_test), class_weight=class_weights,
                    callbacks=[early_stopping, lr_scheduler])

# Plot training and validation accuracy
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training vs. Validation Accuracy')
plt.legend()
plt.show()

# Evaluate
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy}')

# Class distribution over the whole (encoded) dataset
label_counts = df_parsed['Label'].value_counts()
sns.barplot(x=label_counts.index, y=label_counts.values)
plt.title('Class Distribution')
plt.show()

CNN - Plot Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Turn softmax outputs and one-hot targets back into integer class ids.
predicted_probabilities = model.predict(X_test)
y_pred = np.argmax(predicted_probabilities, axis=1)
y_true = np.argmax(y_test, axis=1)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true, y_pred)

# Annotated heatmap of the raw counts.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()