## Graph Construction from Data:

In [2]:
import pandas as pd
import networkx as nx
import pickle

# File path
file_path = 'Data/02-20-2018.csv'

# Chunk size (adjust based on available RAM)
chunk_size = 1000000

# Create an empty directed graph
G = nx.DiGraph()

# Stream CSV in chunks
for chunk in pd.read_csv(file_path, usecols=["Src IP", "Dst IP", "TotLen Fwd Pkts", "TotLen Bwd Pkts", "Label"], 
                         chunksize=chunk_size, low_memory=False):

    # Drop rows with missing IPs
    chunk.dropna(subset=["Src IP", "Dst IP"], inplace=True)

    # Iterate over rows
    for _, row in chunk.iterrows():
        try:
            src = row["Src IP"]
            dst = row["Dst IP"]
            fwd_bytes = float(row["TotLen Fwd Pkts"])
            bwd_bytes = float(row["TotLen Bwd Pkts"])
            
            # Forward direction: src -> dst
            if G.has_edge(src, dst):
                G[src][dst]["weight"] += fwd_bytes
            else:
                G.add_edge(src, dst, weight=fwd_bytes)
            
            # Backward direction: dst -> src
            if G.has_edge(dst, src):
                G[dst][src]["weight"] += bwd_bytes
            else:
                G.add_edge(dst, src, weight=bwd_bytes)

        except (KeyError, ValueError):
            continue  # Skip problematic rows

# Graph stats
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Save the graph
with open("network_graph.pkl", "wb") as f:
    pickle.dump(G, f)

# Optional: Save a lightweight graph summary
summary = {
    "nodes": G.number_of_nodes(),
    "edges": G.number_of_edges(),
}
with open("graph_summary.txt", "w") as f:
    f.write(str(summary))


Number of nodes: 33176
Number of edges: 804322


### Computing egonet features

In [2]:
import pandas as pd
import networkx as nx
import pickle

In [3]:
with open("network_graph.pkl", "rb") as f:
    G = pickle.load(f)

In [None]:
import numpy as np

egonet_features = {}

for node in G.nodes():
    neighbors = list(G.successors(node)) + list(G.predecessors(node))
    egonet_nodes = set(neighbors + [node])
    subgraph = G.subgraph(egonet_nodes).copy()

    N_i = len(subgraph.nodes)
    E_i = len(subgraph.edges)
    W_i = sum([d["weight"] for u, v, d in subgraph.edges(data=True)])
    
    # Weighted adjacency matrix and its top eigenvalue
    W_matrix = nx.to_numpy_array(subgraph, weight="weight")
    if W_matrix.shape[0] > 1:
        lambda_w = max(np.linalg.eigvals(W_matrix)).real
    else:
        lambda_w = 0

    # Store features
    egonet_features[node] = {
        "N_i": N_i,
        "E_i": E_i,
        "W_i": W_i,
        "lambda_w": lambda_w,
    }

# Save the dictionary
with open("egonet_features.pkl", "wb") as f:
    pickle.dump(egonet_features, f)



### Fitting power laws

In [None]:
from scipy.optimize import curve_fit

def power_law(x, C, alpha):
    return C * np.power(x, alpha)

def fit_powerlaw(x_vals, y_vals):
    x = np.array(x_vals)
    y = np.array(y_vals)
    valid = (x > 2) & (y > 0)
    x = x[valid]
    y = y[valid]
    popt, _ = curve_fit(power_law, x, y)
    return popt  # returns C, alpha

def outlier_score(y, y_hat):
    return max(y, y_hat) / min(y, y_hat) * np.log(abs(y - y_hat) + 1)


### Counting outlier scores

In [None]:

Ni = [v["N_i"] for v in egonet_features.values()]
Ei = [v["E_i"] for v in egonet_features.values()]
C_edpl, alpha_edpl = fit_powerlaw(Ni, Ei)

scores_edpl = {}
for node, feats in egonet_features.items():
    x = feats["N_i"]
    y = feats["E_i"]
    y_hat = power_law(x, C_edpl, alpha_edpl)
    score = outlier_score(y, y_hat)
    scores_edpl[node] = score


# Save the dictionary
with open("score_edpl.pkl", "wb") as f:
    pickle.dump(scores_edpl, f)



In [None]:
# Extract values
E_vals = [v["E_i"] for v in egonet_features.values()]
W_vals = [v["W_i"] for v in egonet_features.values()]

# Fit power law
C_ewpl, alpha_ewpl = fit_powerlaw(E_vals, W_vals)

# Compute outlier scores
scores_ewpl = {}
for node, feats in egonet_features.items():
    x = feats["E_i"]
    y = feats["W_i"]
    y_hat = power_law(x, C_ewpl, alpha_ewpl)
    score = outlier_score(y, y_hat)
    scores_ewpl[node] = score

with open("score_ewpl.pkl", "wb") as f:
    pickle.dump(scores_ewpl, f)


  return max(y, y_hat) / min(y, y_hat) * np.log(abs(y - y_hat) + 1)


In [None]:
W_vals = [v["W_i"] for v in egonet_features.values()]
L_vals = [v["lambda_w"] for v in egonet_features.values()]

# Fit power law
C_elwpl, alpha_elwpl = fit_powerlaw(W_vals, L_vals)

# Compute outlier scores
scores_elwpl = {}
for node, feats in egonet_features.items():
    x = feats["W_i"]
    y = feats["lambda_w"]
    y_hat = power_law(x, C_elwpl, alpha_elwpl)
    score = outlier_score(y, y_hat)
    scores_elwpl[node] = score

with open("score_elwpl.pkl", "wb") as f:
    pickle.dump(scores_elwpl, f)


  return max(y, y_hat) / min(y, y_hat) * np.log(abs(y - y_hat) + 1)
  return max(y, y_hat) / min(y, y_hat) * np.log(abs(y - y_hat) + 1)


### Combine with LOF

In [None]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler

# Prepare features for LOF
X_lof = np.array([[v["N_i"], v["E_i"], v["W_i"], v["lambda_w"]] for v in egonet_features.values()])
lof_model = LocalOutlierFactor(n_neighbors=20, metric="euclidean")
lof_scores = -lof_model.fit_predict(X_lof)  # higher means more outlier

# Normalize
def safe_normalize(scores_dict):
    vals = np.array(list(scores_dict.values()))
    vals = np.nan_to_num(vals, nan=0.0, posinf=0.0, neginf=0.0)
    return MinMaxScaler().fit_transform(vals.reshape(-1, 1)).flatten()

edpl_scores_norm = safe_normalize(scores_edpl)
ewpl_scores_norm = safe_normalize(scores_ewpl)
elwpl_scores_norm = safe_normalize(scores_elwpl)
lof_scores_norm = safe_normalize(dict(zip(egonet_features.keys(), lof_scores)))


# Combine
combined_scores = {
    node: edpl_scores_norm[i] + lof_scores_norm[i] + ewpl_scores_norm[i] + elwpl_scores_norm[i]
    for i, node in enumerate(egonet_features.keys())
}




In [13]:
from collections import defaultdict, Counter

ip_labels = defaultdict(list)

for chunk in pd.read_csv(file_path, usecols=["Src IP", "Dst IP", "Label"], chunksize=chunk_size, low_memory=False):
    chunk.dropna(subset=["Src IP", "Dst IP"], inplace=True)
    for _, row in chunk.iterrows():
        ip_labels[row["Src IP"]].append(row["Label"])
        ip_labels[row["Dst IP"]].append(row["Label"])

# Final label per IP: use most common label
ip_majority_label = {ip: Counter(labels).most_common(1)[0][0] for ip, labels in ip_labels.items()}


In [45]:
with open("ip_majority_label.pkl", "wb") as f:
    pickle.dump(ip_majority_label, f)

In [83]:

# Sort and view top 10
sorted_outliers = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
top_outliers = sorted_outliers[:10]
for i, (node, score) in enumerate(top_outliers):
    print(f"{i+1}. Node: {node}, Combined Outlier Score: {score:.4f}")

1. Node: 185.92.73.85, Combined Outlier Score: 2.6585
2. Node: 23.36.32.43, Combined Outlier Score: 2.4592
3. Node: 121.8.141.138, Combined Outlier Score: 2.3405
4. Node: 210.206.216.138, Combined Outlier Score: 2.3405
5. Node: 121.8.141.142, Combined Outlier Score: 2.3405
6. Node: 58.63.230.158, Combined Outlier Score: 2.3405
7. Node: 18.219.32.43, Combined Outlier Score: 2.0740
8. Node: 52.14.136.135, Combined Outlier Score: 2.0700
9. Node: 18.219.9.1, Combined Outlier Score: 2.0695
10. Node: 18.216.200.189, Combined Outlier Score: 2.0678


In [84]:
outlier_nodes = [node for node, _ in sorted_outliers[:17]]  


true_positives = [ip for ip in outlier_nodes if ip_majority_label.get(ip, "Benign") != "Benign"]
false_positives = [ip for ip in outlier_nodes if ip_majority_label.get(ip, "Benign") == "Benign"]

print(f"True Positives: {len(true_positives)}")
print(f"False Positives: {len(false_positives)}")


True Positives: 9
False Positives: 8


In [85]:
# Get all IPs labeled malicious
malicious_ips = {ip for ip, label in ip_majority_label.items() if label != "Benign"}

tp = len(set(outlier_nodes) & malicious_ips)
fp = len(set(outlier_nodes) - malicious_ips)
fn = len(malicious_ips - set(outlier_nodes))

precision = tp / (tp + fp + 1e-6)
recall = tp / (tp + fn + 1e-6)
f1 = 2 * precision * recall / (precision + recall + 1e-6)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


Precision: 0.5294
Recall:    0.8182
F1 Score:  0.6429


In [86]:
for ip in top_outliers:
    label = ip_majority_label.get(ip[0], "UNKNOWN")
    print(f"Outlier: {ip[0]} - Label: {label}")


Outlier: 185.92.73.85 - Label: Benign
Outlier: 23.36.32.43 - Label: Benign
Outlier: 121.8.141.138 - Label: Benign
Outlier: 210.206.216.138 - Label: Benign
Outlier: 121.8.141.142 - Label: Benign
Outlier: 58.63.230.158 - Label: Benign
Outlier: 18.219.32.43 - Label: DDoS attacks-LOIC-HTTP
Outlier: 52.14.136.135 - Label: DDoS attacks-LOIC-HTTP
Outlier: 18.219.9.1 - Label: DDoS attacks-LOIC-HTTP
Outlier: 18.216.200.189 - Label: DDoS attacks-LOIC-HTTP
