## Libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering


## Dataset importing

In [10]:
df = pd.read_csv(r'D:\Project Phase II\Dataset\completedataset.csv',encoding='cp1252')
df

  df = pd.read_csv(r'D:\Project Phase II\Dataset\completedataset.csv',encoding='cp1252')


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.1,18247,149.171.126.4,7662,tcp,FIN,0.119596,4550,68342,31,...,,6,2,2,5,1,1,2,,0
1,59.166.0.3,54771,149.171.126.2,27709,tcp,FIN,0.650574,8928,320,31,...,,3,5,2,4,1,1,4,,0
2,59.166.0.8,13289,149.171.126.9,5190,tcp,FIN,0.007980,2158,2464,31,...,,3,5,1,1,1,1,3,,0
3,149.171.126.18,1043,175.45.176.3,53,udp,INT,0.000005,264,0,60,...,,19,19,19,19,19,19,19,,0
4,149.171.126.18,1043,175.45.176.3,53,udp,INT,0.000005,264,0,60,...,,19,19,19,19,19,19,19,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540042,59.166.0.5,33094,149.171.126.7,43433,tcp,FIN,0.087306,320,1828,31,...,,1,2,3,3,1,1,3,,0
2540043,59.166.0.7,20848,149.171.126.4,21,tcp,CON,0.365058,456,346,31,...,2,2,2,2,2,2,2,2,,0
2540044,59.166.0.3,21511,149.171.126.9,21,tcp,CON,6.335154,1802,2088,31,...,2,2,2,4,2,2,2,2,,0
2540045,59.166.0.9,35433,149.171.126.0,80,tcp,CON,2.200934,3498,166054,31,...,,1,1,2,4,2,2,2,,0


## Cleaning dataset

Drop is_sm_ips_ports

In [11]:
#Cleaning df by removing rows with is_sm_ips_ports = 1
df = df[df['is_sm_ips_ports']==0]
#Dropping is_sm_ips_ports column
df.drop(['is_sm_ips_ports'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['is_sm_ips_ports'], axis=1, inplace=True)


In [4]:
#Find out unique values in df['attack_cat']
df['attack_cat'].unique()

array([nan, 'Exploits', ' Fuzzers ', 'DoS', 'Generic', ' Shellcode ',
       'Backdoor', ' Reconnaissance ', 'Worms', 'Analysis',
       'Reconnaissance', 'Shellcode', ' Fuzzers', 'Backdoors'],
      dtype=object)

In [12]:
#Remove trailing white spaces from the dataset
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [6]:
df['attack_cat'].unique()

array([nan, 'Exploits', 'Fuzzers', 'DoS', 'Generic', 'Shellcode',
       'Backdoor', 'Reconnaissance', 'Worms', 'Analysis', 'Backdoors'],
      dtype=object)

In [13]:
#Replace 'Backdoor' with 'Backdoors'
df['attack_cat'] = df['attack_cat'].replace('Backdoor', 'Backdoors')

In [14]:
#Fill NaN values with 0 for int and float columns and None for object columns
for col in df.columns:
    if df[col].dtype == 'int64' or df[col].dtype == 'float64':
        df[col] = df[col].fillna(0)
    elif df[col].dtype == 'object':
        df[col] = df[col].fillna('None')

In [15]:
df['attack_cat'].unique()

array(['None', 'Exploits', 'Fuzzers', 'DoS', 'Generic', 'Shellcode',
       'Backdoors', 'Reconnaissance', 'Worms', 'Analysis'], dtype=object)

In [20]:
#Replace None in sport and dport with 0
df['sport'] = df['sport'].replace('None', '0')
df['dsport'] = df['dsport'].replace('None', '0')

In [38]:
df.is_ftp_login.unique()

array([0, 1], dtype=int64)

In [34]:
#Convert is_ftp_login to int
df['is_ftp_login'] = df['is_ftp_login'].astype(int)

In [37]:
#Convert all is_ftp_login values > 0 to 1
df['is_ftp_login'] = df['is_ftp_login'].apply(lambda x: 1 if x > 0 else x)

In [40]:
df.ct_dst_src_ltm .unique()

array([ 2,  4,  3, 19,  1,  5,  7,  6, 25, 24, 10,  9, 18, 22, 17, 32, 54,
       15, 33, 48, 40, 37, 16, 42, 34, 39, 31, 36, 11, 30, 21, 27, 29, 35,
       43, 59, 23, 47, 20, 14, 38, 56, 45,  8, 51, 26, 41, 28, 49, 12, 46,
       13, 44, 53, 50, 52, 57, 58, 55, 60, 66, 61, 65, 67, 63],
      dtype=int64)

In [41]:
#Convert to csv and store in a new file
df.to_csv(r'D:\Project Phase II\Dataset\phase2cleaneddataset.csv', index=False)

In [11]:
df.drop(['attack_cat'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['attack_cat'],axis=1,inplace=True)


## Label Encoding

In [42]:
#label encoding for non numerical features
labelencoder = LabelEncoder()
from sklearn import preprocessing 

# label_encoder object knows  
# how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

df['proto'] = labelencoder.fit_transform(df['proto'])
df['service'] = labelencoder.fit_transform(df['service'])
df['state'] = labelencoder.fit_transform(df['state'])
df['attack_cat'] = labelencoder.fit_transform(df['attack_cat'])



In [5]:
df2 = pd.read_csv(r'D:\Project Phase II\Dataset\phase2cleaneddataset.csv',encoding='cp1252')
df2

  df2 = pd.read_csv(r'D:\Project Phase II\Dataset\phase2cleaneddataset.csv',encoding='cp1252')


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.1,0,149.171.126.4,7662,tcp,FIN,0.119596,4550,68342,31,...,,6,2,2,5,1,1,2,,0
1,59.166.0.3,0,149.171.126.2,27709,tcp,FIN,0.650574,8928,320,31,...,,3,5,2,4,1,1,4,,0
2,59.166.0.8,0,149.171.126.9,5190,tcp,FIN,0.007980,2158,2464,31,...,,3,5,1,1,1,1,3,,0
3,149.171.126.18,0,175.45.176.3,53,udp,INT,0.000005,264,0,60,...,,19,19,19,19,19,19,19,,0
4,149.171.126.18,0,175.45.176.3,53,udp,INT,0.000005,264,0,60,...,,19,19,19,19,19,19,19,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2535847,59.166.0.5,0,149.171.126.7,0,tcp,FIN,0.087306,320,1828,31,...,,1,2,3,3,1,1,3,,0
2535848,59.166.0.7,0,149.171.126.4,0,tcp,CON,0.365058,456,346,31,...,2.0,2,2,2,2,2,2,2,,0
2535849,59.166.0.3,0,149.171.126.9,0,tcp,CON,6.335154,1802,2088,31,...,2.0,2,2,4,2,2,2,2,,0
2535850,59.166.0.9,0,149.171.126.0,0,tcp,CON,2.200934,3498,166054,31,...,,1,1,2,4,2,2,2,,0


In [6]:

#Replace '' and 'None' with 0 in ct_ftp_cmd
df2['ct_ftp_cmd'] = df2['ct_ftp_cmd'].replace('None', '0')
df2['ct_ftp_cmd'] = df2['ct_ftp_cmd'].replace('', '0')

In [8]:
df2.ct_ftp_cmd.fillna(0, inplace=True)

In [9]:
df2['ct_ftp_cmd'] = df2['ct_ftp_cmd'].astype(int)

In [10]:
df2['ct_ftp_cmd'].unique()

array([0, 1, 4, 2, 6])

In [11]:
#Convert to csv and store in a new file
df2.to_csv(r'D:\Project Phase II\Dataset\phase2cleaneddataset.csv', index=False)

In [49]:
#convert ct_ftp_cmd to int
df['ct_ftp_cmd'] = df['ct_ftp_cmd'].astype(int)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2535852 entries, 0 to 2540046
Data columns (total 48 columns):
 #   Column            Dtype  
---  ------            -----  
 0   srcip             object 
 1   sport             object 
 2   dstip             object 
 3   dsport            object 
 4   proto             int32  
 5   state             int32  
 6   dur               float64
 7   sbytes            int64  
 8   dbytes            int64  
 9   sttl              int64  
 10  dttl              int64  
 11  sloss             int64  
 12  dloss             int64  
 13  service           int32  
 14  Sload             float64
 15  Dload             float64
 16  Spkts             int64  
 17  Dpkts             int64  
 18  swin              int64  
 19  dwin              int64  
 20  stcpb             int64  
 21  dtcpb             int64  
 22  smeansz           int64  
 23  dmeansz           int64  
 24  trans_depth       int64  
 25  res_bdy_len       int64  
 26  Sjit          

In [51]:
#Convert to csv and store in a new file
df.to_csv(r'D:\Project Phase II\Dataset\phase2labelencodeddataset.csv', index=False)

Filling in null values

In [5]:
#Fill nullvalues
#Filling Nan values with 0
df.fillna(0, inplace=True)
#Filling '' with 0
df.replace('', 0, inplace=True)
df.replace(' ', 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace('', 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace(' ', 0, inplace=True)


In [12]:
df

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label
0,59.166.0.1,18247,149.171.126.4,7662,114,5,0.119596,4550,68342,31,...,0.0,0,6,2,2,5,1,1,2,0
1,59.166.0.3,54771,149.171.126.2,27709,114,5,0.650574,8928,320,31,...,0.0,0,3,5,2,4,1,1,4,0
2,59.166.0.8,13289,149.171.126.9,5190,114,5,0.007980,2158,2464,31,...,0.0,0,3,5,1,1,1,1,3,0
3,149.171.126.18,1043,175.45.176.3,53,120,6,0.000005,264,0,60,...,0.0,0,19,19,19,19,19,19,19,0
4,149.171.126.18,1043,175.45.176.3,53,120,6,0.000005,264,0,60,...,0.0,0,19,19,19,19,19,19,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540042,59.166.0.5,33094,149.171.126.7,43433,114,5,0.087306,320,1828,31,...,0.0,0,1,2,3,3,1,1,3,0
2540043,59.166.0.7,20848,149.171.126.4,21,114,2,0.365058,456,346,31,...,2.0,2,2,2,2,2,2,2,2,0
2540044,59.166.0.3,21511,149.171.126.9,21,114,2,6.335154,1802,2088,31,...,2.0,2,2,2,4,2,2,2,2,0
2540045,59.166.0.9,35433,149.171.126.0,80,114,2,2.200934,3498,166054,31,...,0.0,0,1,1,2,4,2,2,2,0


Correlation with label

In [14]:
#Correlation with output variable
cor = df.corr()
cor_target = abs(cor["Label"])
cor_target

proto               0.011832
state               0.338312
dur                 0.002055
sbytes              0.010134
dbytes              0.075698
sttl                0.904512
dttl                0.134569
sloss               0.043423
dloss               0.096054
service             0.009492
Sload               0.192121
Dload               0.220163
Spkts               0.121179
Dpkts               0.116195
swin                0.316271
dwin                0.315079
stcpb               0.234160
dtcpb               0.234252
smeansz             0.065710
dmeansz             0.273102
trans_depth         0.029195
res_bdy_len         0.027398
Sjit                0.020903
Djit                0.054562
Stime               0.275647
Ltime               0.275647
Sintpkt             0.012086
Dintpkt             0.010668
tcprtt              0.143044
synack              0.122190
ackdat              0.143386
ct_state_ttl        0.880216
ct_flw_http_mthd    0.026690
is_ftp_login        0.031908
ct_srv_src    

In [15]:
sorted(cor_target,reverse=True)

[1.0,
 0.9045123713273255,
 0.8802159911646181,
 0.4397582040181459,
 0.41894609595431975,
 0.3966171141155974,
 0.3862785180682943,
 0.38276987066060003,
 0.34300251941699716,
 0.33923406199653117,
 0.3383115509958634,
 0.31627126136549794,
 0.31507938791330836,
 0.2756467745810105,
 0.2756467361934781,
 0.2731022353548177,
 0.23425165967570538,
 0.23415972437915167,
 0.22016301877421282,
 0.19212078068924934,
 0.14338606748651497,
 0.14304379120018665,
 0.1345687229012209,
 0.12218956387082323,
 0.12117868665702373,
 0.11619454701110364,
 0.09605449315010073,
 0.07569769679289255,
 0.06571002635878419,
 0.054562098246944746,
 0.043423355305197676,
 0.03190817000838083,
 0.02919529475181047,
 0.02739821746445534,
 0.02668985508780704,
 0.020902958909795925,
 0.01208573079157584,
 0.011832106432080278,
 0.010667773256950744,
 0.010133696212686666,
 0.009492262382872188,
 0.002055001885478537]

In [None]:
#Information Gain of features
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

X = df.iloc[:,0:42]  #independent columns
y = df.iloc[:,-1]    #target column i.e Label


In [16]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid
from torch_geometric.data import DataLoader

In [18]:
#SAmple the dataset
# Randomly sample 1000 rows
dfsample = df.sample(n=1000, random_state=42)

# Extract features and labels
src_encoder = LabelEncoder()
x = src_encoder.fit_transform(x)
dst_encoder = LabelEncoder()
y = dst_encoder.fit_transform(data['Dst IP Addr'])

x = torch.tensor(dfsample[['srcip', 'dstip']].values, dtype=torch.long)
y = torch.tensor(dfsample['Label'].values, dtype=torch.long)

features = dfsample.drop(columns=['srcip', 'dstip','sport','dsport', 'Label'])
scaler = StandardScaler()
features = scaler.fit_transform(features)

#Sampling
# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Construct edge indices
edge_index = torch.tensor([dfsample['srcip'].values, dfsample['dstip'].values], dtype=torch.long)
edge_attr = torch.tensor(features, dtype=torch.ListType) 

class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        return x

# Instantiate the model
model = GCN(input_dim=2, hidden_dim=64, output_dim=2)  # input_dim=2 since we have 2 features (source and destination IPs), output_dim=2 for binary edge classification

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Training
def train_model(model, optimizer, criterion, x_train, edge_index, edge_attr, y_train, epochs=100):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(x_train, edge_index, edge_attr)
        loss = criterion(out, y_train)
        loss.backward()
        optimizer.step()
        print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

# Convert edge labels to one-hot encoding
y_train_onehot = F.one_hot(y_train)

# Train the model
train_model(model, optimizer, criterion, x_train, edge_index, edge_attr, y_train, epochs=10)

TypeError: tensor(): argument 'dtype' must be torch.dtype, not pybind11_type

In [None]:
# Testing
def test_model(model, x_test, edge_index, edge_attr, y_test):
    model.eval()
    with torch.no_grad():
        out = model(x_test, edge_index, edge_attr)
        pred = out.argmax(dim=1)
        f1 = f1_score(y_test, pred, average='macro')
        print(f'F1 Score: {f1}')

# Test the model
test_model(model, x_test, edge_index, edge_attr, y_test)