In [16]:
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from torch_geometric.data import Data

In [2]:
data = pd.read_csv('data/ALLFLOWMETER_HIKARI2022.csv', index_col=0)

In [9]:
data.head()

Unnamed: 0,uid,originh,originp,responh,responp,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,...,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,bwd_last_window_size,attack_category,Label
0,Cmu9v81jToQyRF1gbk,184.0.48.168,38164,184.0.48.150,50443,0 days 00:00:00.000060,1,1,0,0,...,0.0,0.0,0.0,0.0,64240,0,64240,0,Benign,0
1,CO21hl3TWkuXTOgajk,184.0.48.169,43068,184.0.48.150,50443,0 days 00:00:00.000083,1,1,0,0,...,0.0,0.0,0.0,0.0,64240,0,64240,0,Benign,0
2,CBLJ6L19FP0MfYX7Oh,184.0.48.124,5678,255.255.255.255,5678,0 days 00:01:59.996602,3,0,3,0,...,59999120.0,119996600.0,59998300.0,1156.846698,0,0,0,0,Benign,0
3,ChTG451zJ7hUYOcqje,184.0.48.124,5678,255.255.255.255,5678,0 days 00:00:59.996909,2,0,2,0,...,59996910.0,59996910.0,59996910.0,0.0,0,0,0,0,Benign,0
4,Cn9y6E2KVxzQbs5wjc,184.0.48.124,5678,255.255.255.255,5678,0 days 00:00:59.992130,2,0,2,0,...,59992130.0,59992130.0,59992130.0,0.0,0,0,0,0,Benign,0


In [3]:
data.columns

Index(['uid', 'originh', 'originp', 'responh', 'responp', 'flow_duration',
       'fwd_pkts_tot', 'bwd_pkts_tot', 'fwd_data_pkts_tot',
       'bwd_data_pkts_tot', 'fwd_pkts_per_sec', 'bwd_pkts_per_sec',
       'flow_pkts_per_sec', 'down_up_ratio', 'fwd_header_size_tot',
       'fwd_header_size_min', 'fwd_header_size_max', 'bwd_header_size_tot',
       'bwd_header_size_min', 'bwd_header_size_max', 'flow_FIN_flag_count',
       'flow_SYN_flag_count', 'flow_RST_flag_count', 'fwd_PSH_flag_count',
       'bwd_PSH_flag_count', 'flow_ACK_flag_count', 'fwd_URG_flag_count',
       'bwd_URG_flag_count', 'flow_CWR_flag_count', 'flow_ECE_flag_count',
       'fwd_pkts_payload.min', 'fwd_pkts_payload.max', 'fwd_pkts_payload.tot',
       'fwd_pkts_payload.avg', 'fwd_pkts_payload.std', 'bwd_pkts_payload.min',
       'bwd_pkts_payload.max', 'bwd_pkts_payload.tot', 'bwd_pkts_payload.avg',
       'bwd_pkts_payload.std', 'flow_pkts_payload.min',
       'flow_pkts_payload.max', 'flow_pkts_payload.tot',
   

In [4]:
features = ['flow_duration',
       'fwd_pkts_tot', 'bwd_pkts_tot', 'fwd_data_pkts_tot',
       'bwd_data_pkts_tot', 'fwd_pkts_per_sec', 'bwd_pkts_per_sec',
       'flow_pkts_per_sec', 'down_up_ratio', 'fwd_header_size_tot',
       'fwd_header_size_min', 'fwd_header_size_max', 'bwd_header_size_tot',
       'bwd_header_size_min', 'bwd_header_size_max', 'flow_FIN_flag_count',
       'flow_SYN_flag_count', 'flow_RST_flag_count', 'fwd_PSH_flag_count',
       'bwd_PSH_flag_count', 'flow_ACK_flag_count', 'fwd_URG_flag_count',
       'bwd_URG_flag_count', 'flow_CWR_flag_count', 'flow_ECE_flag_count',
       'fwd_pkts_payload.min', 'fwd_pkts_payload.max', 'fwd_pkts_payload.tot',
       'fwd_pkts_payload.avg', 'fwd_pkts_payload.std', 'bwd_pkts_payload.min',
       'bwd_pkts_payload.max', 'bwd_pkts_payload.tot', 'bwd_pkts_payload.avg',
       'bwd_pkts_payload.std', 'flow_pkts_payload.min',
       'flow_pkts_payload.max', 'flow_pkts_payload.tot',
       'flow_pkts_payload.avg', 'flow_pkts_payload.std', 'fwd_iat.min',
       'fwd_iat.max', 'fwd_iat.tot', 'fwd_iat.avg', 'fwd_iat.std',
       'bwd_iat.min', 'bwd_iat.max', 'bwd_iat.tot', 'bwd_iat.avg',
       'bwd_iat.std', 'flow_iat.min', 'flow_iat.max', 'flow_iat.tot',
       'flow_iat.avg', 'flow_iat.std', 'payload_bytes_per_second',
       'fwd_subflow_pkts', 'bwd_subflow_pkts', 'fwd_subflow_bytes',
       'bwd_subflow_bytes', 'fwd_bulk_bytes', 'bwd_bulk_bytes',
       'fwd_bulk_packets', 'bwd_bulk_packets', 'fwd_bulk_rate',
       'bwd_bulk_rate', 'active.min', 'active.max', 'active.tot', 'active.avg',
       'active.std', 'idle.min', 'idle.max', 'idle.tot', 'idle.avg',
       'idle.std', 'fwd_init_window_size', 'bwd_init_window_size',
       'fwd_last_window_size', 'bwd_last_window_size']

In [5]:
len(features)

80

In [6]:
unique_originh = list(set(data['originh']))
unique_responh = list(set(data['responh']))
unique_all = list(set(unique_originh + unique_responh))

In [12]:
node_ip_to_id = {}
for i, ip in enumerate(unique_all):
    node_ip_to_id[ip] = i


In [17]:
# sources = [node_ip_to_id[x] for x in data['originh'].values]
# targets = [node_ip_to_id[x] for x in data['responh'].values]
# G.add_edges_from([(s, t) for s, t in zip(sources, targets)])

In [7]:
G = nx.Graph()

In [8]:
for i, row in tqdm(data.iterrows(), total=len(data)):
    G.add_edge(row['originh'], row['responh'])

100%|████████████████████████████████████████████████████████████████████████| 228253/228253 [00:11<00:00, 19512.07it/s]


In [9]:
sample_node = np.random.choice(unique_originh, 1)[0]

In [21]:
sample_node

'36.72.215.210'

In [27]:
node_ip_to_id[sample_node]

1191

In [10]:
hop = 5
nodes = [sample_node]
frontier = [sample_node]
visited = [sample_node]
for h in range(hop):
    new_frontier = []
    for f in frontier:
        neighbours = [x for x in list(G.adj[f]) if x not in visited][:5]
        if len(neighbours) == 0:
            continue
        nodes += neighbours
        new_frontier += neighbours
        visited.append(f)
    frontier = new_frontier
        

In [11]:
nodes

['36.72.215.210',
 '184.0.48.169',
 '184.0.48.150',
 '140.213.164.207',
 '103.247.15.246',
 '119.2.52.53',
 '182.2.40.15',
 '184.0.48.168',
 '184.0.48.53',
 '184.0.48.171',
 '184.0.48.27',
 '184.0.48.23',
 '172.17.0.2',
 '172.18.0.6',
 '185.199.108.133',
 '185.199.110.133',
 '185.199.109.133']

In [12]:
G.subgraph(nodes).edges

EdgeView([('184.0.48.53', '184.0.48.150'), ('36.72.215.210', '184.0.48.169'), ('184.0.48.150', '184.0.48.168'), ('184.0.48.150', '184.0.48.169'), ('184.0.48.150', '184.0.48.171'), ('184.0.48.150', '184.0.48.27'), ('184.0.48.150', '184.0.48.23'), ('184.0.48.150', '185.199.108.133'), ('184.0.48.150', '185.199.110.133'), ('184.0.48.150', '185.199.109.133'), ('185.199.109.133', '172.17.0.2'), ('184.0.48.168', '172.17.0.2'), ('184.0.48.168', '184.0.48.169'), ('103.247.15.246', '184.0.48.169'), ('140.213.164.207', '184.0.48.169'), ('182.2.40.15', '184.0.48.169'), ('119.2.52.53', '184.0.48.169'), ('184.0.48.169', '184.0.48.171'), ('172.18.0.6', '184.0.48.171'), ('185.199.108.133', '172.17.0.2'), ('185.199.110.133', '172.17.0.2')])

In [13]:
subgraph = G.subgraph(nodes)

In [15]:
subgraph.nodes

NodeView(('184.0.48.53', '36.72.215.210', '184.0.48.150', '185.199.109.133', '184.0.48.168', '184.0.48.23', '103.247.15.246', '140.213.164.207', '182.2.40.15', '184.0.48.27', '119.2.52.53', '184.0.48.169', '172.18.0.6', '185.199.108.133', '185.199.110.133', '184.0.48.171', '172.17.0.2'))

In [54]:
for s, t in subgraph.edges:
    temp_df = data[(data['originh'] == s) & (data['responh'] ==t)]
    temp_features = temp_df[features]
    if 'flow_duration' in features:
        temp_features['flow_duration'] = temp_features['flow_duration'].apply(lambda x: flow_duration_to_float(x))
    # pandas.DataFrame.sample
    break

In [28]:
np.unique(data['attack_category'])

array(['Benign', 'Bruteforce', 'Bruteforce-XML', 'XMRIGCC CryptoMiner'],
      dtype=object)

In [33]:
data.attack_category.value_counts()

Benign                 214904
XMRIGCC CryptoMiner      7595
Bruteforce-XML           3650
Bruteforce               2104
Name: attack_category, dtype: int64

In [44]:
temp_df[features].iloc[0]['flow_duration']

6.1e-05

In [45]:
def flow_duration_to_float(flow):
    days = float(flow.split('days', 1)[0].strip())
    hh = days * 24
    remain = flow.split('days', 1)[1]
    hh += float(remain.split(':', 1)[0].strip())
    mm = hh * 60
    remain = remain.split(':', 1)[1]
    mm += float(remain.split(':', 1)[0].strip())
    remain = remain.split(':', 1)[1]
    ss = mm * 60
    ss += float(remain)
    return ss

In [56]:
temp_features['flow_duration'] = temp_features['flow_duration'].apply(lambda x: flow_duration_to_float(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_features['flow_duration'] = temp_features['flow_duration'].apply(lambda x: flow_duration_to_float(x))
