In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
# Import libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
directory = 'C:\\Users\\ktv07101\\Desktop\\BHNI Anomaly Detection Related\\DDoS Training Data\\CIC_DDoS2019\\publicCSV\\CSV-01-12\\01-12'

train tree based on just one table and test on output

In [39]:
filename = "DrDoS_DNS.csv"
drDos_DNSdf = pd.read_csv(os.path.join(directory, filename), low_memory=False, dtype={'Flow Bytes/s':"float64", ' Flow Packets/s':"float64"})

In [40]:
drDos_DNSdf.head()

Unnamed: 0.1,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
0,425,172.16.0.5-192.168.50.1-634-60495-17,172.16.0.5,634,192.168.50.1,60495,17,2018-12-01 10:51:39.813448,28415,97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
1,430,172.16.0.5-192.168.50.1-60495-634-17,192.168.50.1,634,172.16.0.5,60495,17,2018-12-01 10:51:39.820842,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,DrDoS_DNS
2,1654,172.16.0.5-192.168.50.1-634-46391-17,172.16.0.5,634,192.168.50.1,46391,17,2018-12-01 10:51:39.852499,48549,200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
3,2927,172.16.0.5-192.168.50.1-634-11894-17,172.16.0.5,634,192.168.50.1,11894,17,2018-12-01 10:51:39.890213,48337,200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS
4,694,172.16.0.5-192.168.50.1-634-27878-17,172.16.0.5,634,192.168.50.1,27878,17,2018-12-01 10:51:39.941151,32026,200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_DNS


In [41]:
categorical_col = [" Label", " Source IP", " Destination IP", ' Source Port', ' Destination Port', " Timestamp", "Flow ID", "SimillarHTTP"]
numerical_col = [col for col in drDos_DNSdf.columns if col not in categorical_col]

In [42]:
y = drDos_DNSdf[' Label']
X = drDos_DNSdf[numerical_col]

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=16, test_size =0.2)

In [44]:
dtc = DecisionTreeClassifier()

In [45]:
X_train.columns

Index(['Unnamed: 0', ' Protocol', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Varian

In [46]:
# Get the 'Unnamed: 0' column and any columns containing inf values
filtered_df = X_train[['Unnamed: 0'] + list(X_train.columns[X_train.isin([np.inf, -np.inf]).any()])]

# Filter for rows where any value is inf
filtered_df = filtered_df[filtered_df.isin([np.inf, -np.inf]).any(axis=1)]

print(filtered_df)

         Unnamed: 0  Flow Bytes/s   Flow Packets/s
4451475       16168           inf              inf
202336        25223           inf              inf
2316933        2924           inf              inf
4437819       24947           inf              inf
164473        15430           inf              inf
...             ...           ...              ...
3248210       14900           inf              inf
2090467       10034           inf              inf
1518754       19917           inf              inf
1331697       27084           inf              inf
1060795       12233           inf              inf

[129997 rows x 3 columns]


In [47]:

# Access by label using .loc
selected_df = drDos_DNSdf.loc[4451475, ['Flow Bytes/s', ' Flow Packets/s']]
print(selected_df)


Flow Bytes/s       inf
 Flow Packets/s    inf
Name: 4451475, dtype: object


In [48]:
X_train[['Flow Bytes/s',   ' Flow Packets/s']]

Unnamed: 0,Flow Bytes/s,Flow Packets/s
1460118,2.896000e+09,2000000.0
2678584,2.944000e+09,2000000.0
3261322,2.944000e+09,2000000.0
331338,2.928000e+09,2000000.0
4028702,2.944000e+09,2000000.0
...,...,...
2920900,2.944000e+09,2000000.0
98939,2.944000e+09,2000000.0
4293189,1.472000e+09,1000000.0
2079982,2.944000e+09,2000000.0


In [11]:
y_train

1460118    DrDoS_DNS
2678584    DrDoS_DNS
3261322    DrDoS_DNS
331338     DrDoS_DNS
4028702    DrDoS_DNS
             ...    
2920900    DrDoS_DNS
98939      DrDoS_DNS
4293189    DrDoS_DNS
2079982    DrDoS_DNS
2726569    DrDoS_DNS
Name:  Label, Length: 4059530, dtype: object

In [32]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [12]:
def fit_dtc_in_chunks(X_train, y_train, chunksize):
    for i in range(0, len(X_train), chunksize):
        X_chunk = X_train[i:i+chunksize]
        y_chunk = y_train[i:i+chunksize]

        try:
            dtc.fit(X_chunk, y_chunk)
        except ValueError as e:
            if "too large" in str(e):
                print("i value is:", i)
                print("Violating DataFrame:")
                violating_indices = np.where(np.isinf(X_chunk) | np.isneginf(X_chunk))
                print(X_chunk.iloc[violating_indices])  # Use .iloc for integer-based indexing
                print(X_chunk)
            else:
                raise e

# Set chunk size and fit the model in chunks
chunksize = 5000  # Adjust chunk size based on memory availability
fit_dtc_in_chunks(X_train, y_train, chunksize)

i value is: 0
Violating DataFrame:
         Flow Bytes/s   Flow Packets/s  Flow Bytes/s   Flow Packets/s  \
4451475           inf              inf           inf              inf   
4451475           inf              inf           inf              inf   
202336            inf              inf           inf              inf   
202336            inf              inf           inf              inf   
2316933           inf              inf           inf              inf   
...               ...              ...           ...              ...   
2223879           inf              inf           inf              inf   
468185            inf              inf           inf              inf   
468185            inf              inf           inf              inf   
809006            inf              inf           inf              inf   
809006            inf              inf           inf              inf   

         Flow Bytes/s   Flow Packets/s  Flow Bytes/s   Flow Packets/s  \
4451475        

ValueError: Input X contains NaN.
DecisionTreeClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

loop through all the CSVs and combine them - not working, will do later if needed

In [5]:
# Specify the directory containing the CSV files
directory = 'C:\\Users\\ktv07101\\Desktop\\BHNI Anomaly Detection Related\\DDoS Training Data\\CIC_DDoS2019\\publicCSV\\CSV-01-12\\01-12'

# List of CSV file names
file_names = [
    "DrDoS_DNS.csv",
    "DrDoS_LDAP.csv",
    "DrDoS_MSSQL.csv",
    "DrDoS_NetBIOS.csv",
    "DrDoS_NTP.csv",
    "DrDoS_SNMP.csv",
    "DrDoS_SSDP.csv",
    "DrDoS_UDP.csv",
    "Syn.csv",
    "TFTP.csv",
    "UDPLag.csv"
]

# Read CSV files in chunks to reduce memory usage
def read_csv_chunkwise(filename):
    chunksize = 10000  # Adjust chunk size as needed
    reader = pd.read_csv(os.path.join(directory, filename), chunksize=chunksize, low_memory=False)
    return pd.concat(reader, ignore_index=True)

# Combine dataframes using chunkwise reading
combined_df = pd.concat([read_csv_chunkwise(file) for file in file_names])

print(combined_df)

MemoryError: Unable to allocate 3.90 GiB for an array with shape (26, 20107827) and data type float64

Use RandomForestRegressor from sklearn to calculate the importance of each feature in the dataset.