In [13]:
import pyshark
import pandas as pd
import os
import glob
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import joblib

The dataset files required by this notebook (CICIDS2017) can be downloaded from: https://www.kaggle.com/datasets/cicdataset/cicids2017

## User Input Cell for data aggregation function

In [2]:
#Format of the input files to aggregate: 'CSV' or 'PCAP'
PCAP_OR_CSV = 'CSV'
#Absolute path of the folder where all input file(s) are stored
#NOTE(review): this is a machine-specific path; point it at your own data folder before running
file_path = r'C:\Users\17272\Desktop\ML_Datasets'
#Optional: if you'd like each row to reference the name of the file it came from, give the
#position of the file name among the components of the full file path
#NOTE(review): the aggregation call visible in this notebook does not pass this value - confirm whether it should
split_file_name = 5

## Function for aggregating CSV or PCAP data and outputting a DataFrame

In [3]:
def _load_csv_files(file_path, str_split='N/A'):
    """Read every ``*.csv`` file in ``file_path`` into one DataFrame."""
    # os.path.join / os.sep keep the Windows behaviour and also work elsewhere;
    # sorting makes the row order independent of the file system's listing order.
    csv_files = sorted(glob.glob(os.path.join(file_path, '*.csv')))
    frames = []
    for csv_file in csv_files:
        frame = pd.read_csv(csv_file)
        if str_split != 'N/A':
            # Tag each row with the path component selected by the caller
            frame['File_Name'] = csv_file.split(os.sep)[str_split]
        frames.append(frame)
    if not frames:
        # Nothing matched: return an empty frame instead of failing on .str.strip()
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per file
    combined = pd.concat(frames, ignore_index=True)
    combined.columns = combined.columns.str.strip()
    return combined


def _load_pcap_files(file_path):
    """Extract basic per-packet fields from every ``*.pcap`` file in ``file_path``."""
    records = []
    for pcap_file in sorted(glob.glob(os.path.join(file_path, '*.pcap'))):
        cap = pyshark.FileCapture(pcap_file)
        try:
            for packet in cap:
                try:
                    records.append({
                        'timestamp': packet.sniff_timestamp,
                        'source_ip': packet.ip.src,
                        'destination_ip': packet.ip.dst,
                        'protocol': packet.transport_layer,
                        'length': packet.length,
                        # Add other fields as needed
                    })
                except AttributeError:
                    # Skip packets that lack one of the attributes (e.g. no IP layer)
                    continue
        finally:
            # Close the capture only once the whole file has been iterated
            cap.close()
    # Built after ALL files are processed, so no file is silently dropped
    return pd.DataFrame(records)


def pcap_or_csv_to_pandas(file_type,file_path,str_split = 'N/A'):
    """Aggregate all CSV or PCAP files found in a folder into one DataFrame.

    Parameters
    ----------
    file_type : str
        'CSV' to read ``*.csv`` files, 'PCAP' to read ``*.pcap`` files.
    file_path : str
        Folder that contains the input files.
    str_split : int or 'N/A', optional
        CSV only. Index of the path component (the file path split on the OS
        path separator) to store in a ``File_Name`` column. The default
        'N/A' omits that column.

    Returns
    -------
    pandas.DataFrame or None
        CSV: the concatenated rows of all files with whitespace stripped from
        the column names. PCAP: one row per packet with the columns
        timestamp, source_ip, destination_ip, protocol and length.
        An empty DataFrame is returned when no file matches. ``None`` is
        returned (after printing a message) for an unknown ``file_type``.
    """
    if file_type == 'CSV':
        return _load_csv_files(file_path, str_split)
    if file_type == 'PCAP':
        return _load_pcap_files(file_path)
    print("Please enter valid file type: CSV/PCAP")
    return None

In [4]:
def clean_dataset(df):
    """Return the rows of ``df`` whose numeric columns hold only finite values.

    A row is dropped when any int/float column contains NaN, +inf or -inf.
    Non-numeric columns are ignored, so a missing value in a text column does
    not remove the row. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to filter.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index labels.
    """
    # Only numeric columns can hold NaN/inf in the sense checked here
    numeric = df[df.select_dtypes(include=[np.number]).columns]

    # Vectorized check; replaces the element-wise applymap(lambda ...), which is
    # deprecated since pandas 2.1 and very slow on frames with millions of rows
    invalid = numeric.isna() | numeric.isin([np.inf, -np.inf])

    # Keep rows where no numeric column is flagged
    return df[~invalid.any(axis=1)]

In [5]:
#Aggregate every input file found under file_path into a single DataFrame
PCAP_DF = pcap_or_csv_to_pandas(file_type=PCAP_OR_CSV, file_path=file_path)
#Discard the rows whose numeric columns contain NaN or infinite values
PCAP_DF_Cleaned = clean_dataset(df=PCAP_DF)

In [10]:
#Rows labelled exactly 'BENIGN' stay benign; every other label counts as an attack
benign_rows = PCAP_DF_Cleaned['Label'] == 'BENIGN'
#Build a new frame (the cleaned dataset is left untouched) that carries the binary
#'BENIGN/ATTACK' column in place of the original multi-class 'Label' column
PCAP_DF_Simplifed = (
    PCAP_DF_Cleaned
    .assign(**{'BENIGN/ATTACK': np.where(benign_rows, 'BENIGN', 'ATTACK')})
    .drop(columns=['Label'])
)

In [1]:
#input dataset for training by dropping 'BENIGN/ATTACK'
#X = PCAP_DF_Simplifed.drop(columns = ['BENIGN/ATTACK'])

#Values/output from dataset is 'BENIGN/ATTACK'
#y = PCAP_DF_Simplifed['BENIGN/ATTACK']

#splitting dataset into 2, 80% is to train the model and 20% is to test the predictions of the model
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

#Using a decision tree machine learning model and fitting it using the X and y training sets
#model = DecisionTreeClassifier()
#model.fit(X_train,y_train)

#Testing the predictions of the model
#predictions = model.predict(X_test)
#score = accuracy_score(y_test, predictions)
#score

In [2]:
#After training the model I export it so I don't have to continually retrain it and can just load it to make predictions
#joblib.dump(model, 'CYBER-ATTACK_PREDICTIONS.joblib')

#This loads the exported model above, I can continue to aggregate datasets and make predictions based on the trained model 
#as long as all columns of dataset loaded are the same 
#model = joblib.load('CYBER-ATTACK_PREDICTIONS.joblib')