In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt

In [None]:
 # !pip install scikit-learn

In [None]:
# Attach Google Drive to the Colab runtime at the path the later cells read from.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:

# Load the dataset
# The CSV location lives in one named constant so it is easy to find and change.
DATASET_PATH = '/content/drive/MyDrive/Colab Notebooks/Intrusion Detection/datasets/header_UNSW-NB15_1.csv'

# NOTE(review): the saved output of this cell echoes the read_csv line, which is
# how Python prints a warning -- presumably pandas' DtypeWarning about columns
# with mixed types; confirm on a fresh run.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up holding mixed types.
df = pd.read_csv(DATASET_PATH, low_memory=False)

  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Intrusion Detection/datasets/header_UNSW-NB15_1.csv')


In [None]:
# Preview the first three rows. Leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of print()'s wrapped plain text.
df.head(3)

        srcip  sport          dstip dsport proto state       dur  sbytes  \
0  59.166.0.0  33661  149.171.126.9   1024   udp   CON  0.036133     528   
1  59.166.0.6   1464  149.171.126.7     53   udp   CON  0.001119     146   
2  59.166.0.5   3593  149.171.126.5     53   udp   CON  0.001209     132   

   dbytes  sttl  ...  Ct_ftp_cmd  Ct_srv_src  Ct_srv_dst Ct_dst_itm  \
0     304    31  ...           0           2           4          2   
1     178    31  ...           0          12           8          1   
2     164    31  ...           0           6           9          1   

   Ct_src_itm  Ct_src_dsport_it  Ct_dst_sport_itm  Ct_dst_sport_itm.1  \
0           3                 1                 1                   2   
1           2                 2                 1                   1   
2           1                 1                 1                   1   

   Ct_dst_sport_itm.2  Label  
0                 NaN      0  
1                 NaN      0  
2                 NaN   

In [None]:
# Fill NaN values in the target column with a specific label for normal instances
# ('Ct_dst_sport_itm.2' is the column used as the class label in this notebook).
# The labels are also stripped of surrounding whitespace: the raw data contains
# ' Fuzzers' with a leading space, which would otherwise show up in every
# per-class listing and report.
df['Ct_dst_sport_itm.2'] = df['Ct_dst_sport_itm.2'].fillna('Normal').str.strip()

# Skip columns you want to exclude
# NOTE: columns_to_skip is not used by the drop below; the target column it
# lists must stay in df because the following cells split on it.
columns_to_skip = ['sport', 'dsport', 'Ct_dst_sport_itm.2']  # Replace with the names of columns to skip
# errors='ignore' keeps this cell re-runnable: a second run finds the port
# columns already gone and would otherwise raise KeyError.
df = df.drop(['sport', 'dsport'], axis=1, errors='ignore')

In [None]:
# Encoding categorical columns
# Each object-dtype feature column is replaced by integer codes, and the fitted
# encoder for every column is stored in label_encoders under the column name.
# (LabelEncoder comes from the import cell at the top of the notebook.)
label_encoders = {}
# Exclude the target column from the list of columns to encode
categorical_columns = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name != 'Ct_dst_sport_itm.2'
]
for col in categorical_columns:
    print(col)
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

srcip
dstip
proto
state
Service


In [None]:
# Separate the dataset into normal and attack samples
# A single boolean mask drives both selections: rows labelled 'Normal' go to
# normal_data and every other row goes to attack_data.
is_normal = df['Ct_dst_sport_itm.2'] == 'Normal'
normal_data = df[is_normal]
attack_data = df[~is_normal]

In [None]:
# Report both shapes explicitly. A notebook cell only echoes its LAST bare
# expression, so normal_data.shape on a line of its own was never displayed.
print('normal_data shape:', normal_data.shape)
print('attack_data shape:', attack_data.shape)

(22215, 47)

In [None]:
#########
# Splitting the dataset such that training set contains 80% samples from each class
# Each attack class is split on its own so that every class contributes 80% of
# its rows to training. The per-class pieces are collected in lists and
# concatenated once at the end, which avoids re-copying the growing frame on
# every iteration and avoids concatenating onto an empty DataFrame.
train_parts = []
test_parts = []

unique_classes = attack_data['Ct_dst_sport_itm.2'].unique()

for cls in unique_classes:
    # Every cls value comes from attack_data, so selecting from it yields the
    # same rows as selecting from df.
    class_data = attack_data[attack_data['Ct_dst_sport_itm.2'] == cls]
    print(cls)
    print(class_data.shape)
    class_train, class_test = train_test_split(class_data, train_size=0.8, random_state=42)
    train_parts.append(class_train)
    test_parts.append(class_test)
    # Show this class's own training share; previously the running total of
    # all classes processed so far was printed under this label.
    print(f"80 percent number: {class_train.shape}")

# Fall back to empty frames when there are no attack classes at all, because
# pd.concat raises on an empty list.
train_data = pd.concat(train_parts) if train_parts else pd.DataFrame()
test_data = pd.concat(test_parts) if test_parts else pd.DataFrame()



Exploits
(5409, 47)
80 percent number: (4327, 47)
Reconnaissance
(1759, 47)
80 percent number: (5734, 47)
DoS
(1167, 47)
80 percent number: (6667, 47)
Generic
(7522, 47)
80 percent number: (12684, 47)
Shellcode
(223, 47)
80 percent number: (12862, 47)
 Fuzzers
(5051, 47)
80 percent number: (16902, 47)
Worms
(24, 47)
80 percent number: (16921, 47)
Backdoors
(534, 47)
80 percent number: (17348, 47)
Analysis
(526, 47)
80 percent number: (17768, 47)


In [None]:
# Splitting into X and y
# Features are every column except the target; the target column becomes y.
X_train_attack = train_data.drop(columns=['Ct_dst_sport_itm.2'])
y_train_attack = train_data['Ct_dst_sport_itm.2']
X_test_attack = test_data.drop(columns=['Ct_dst_sport_itm.2'])
y_test_attack = test_data['Ct_dst_sport_itm.2']

# Print the dimensions of each piece as a quick sanity check.
for caption, part in (
    ('X_train shape:', X_train_attack),
    ('y_train shape:', y_train_attack),
    ('X_test shape:', X_test_attack),
    ('y_test shape:', y_test_attack),
):
    print(caption, part.shape)

#####

X_train shape: (17768, 46)
y_train shape: (17768,)
X_test shape: (4447, 46)
y_test shape: (4447,)


In [None]:
# Split the normal samples into training and test sets (80% train, 20% test)
# The label column is pulled out first; the remaining columns form the features.
y_normal = normal_data['Ct_dst_sport_itm.2']
X_normal = normal_data.drop(columns=['Ct_dst_sport_itm.2'])
(
    X_train_normal,
    X_test_normal,
    y_train_normal,
    y_test_normal,
) = train_test_split(X_normal, y_normal, train_size=0.8, random_state=42)


In [None]:
# Combine the training and test sets
# In every concatenation the attack rows come first and the normal rows second,
# so features and labels stay in matching row order.
X_train, y_train = (
    pd.concat([X_train_attack, X_train_normal], axis=0),
    pd.concat([y_train_attack, y_train_normal], axis=0),
)
X_test, y_test = (
    pd.concat([X_test_attack, X_test_normal], axis=0),
    pd.concat([y_test_attack, y_test_normal], axis=0),
)

In [None]:
# Shuffle the combined training data
# Features and labels must be reordered by the SAME permutation. One permutation
# of row positions is drawn and applied to both objects, instead of relying on
# two separate sample() calls happening to produce identical orderings.
if len(X_train) != len(y_train):
    raise ValueError("X_train and y_train must have the same number of rows before shuffling")

# Sampling a 0..n-1 range with frac=1 yields a seeded permutation of positions.
shuffle_order = pd.Series(range(len(X_train))).sample(frac=1, random_state=42).to_numpy()
X_train = X_train.iloc[shuffle_order].reset_index(drop=True)
y_train = y_train.iloc[shuffle_order].reset_index(drop=True)