In [17]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix

# Load the dataset

df = pd.read_csv('CloudWatch_Traffic_Web_Attack.csv')

df.head(10)


Unnamed: 0,bytes_in,bytes_out,creation_time,end_time,src_ip,src_ip_country_code,protocol,response.code,dst_port,dst_ip,rule_names,observation_name,source.meta,source.name,time,detection_types
0,5602,12990,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,147.161.161.82,AE,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
1,30912,18186,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.33.6,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
2,28506,13468,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.212.255,CA,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
3,30546,14278,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,136.226.64.114,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
4,6526,13892,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.240.79,NL,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
5,3906,3488,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,136.226.77.103,CA,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
6,17748,29208,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.26.101,DE,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
7,4767917,291520,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,155.91.45.242,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
8,10538,15514,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.209.4,CA,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
9,9656,6380,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,147.161.131.1,AT,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule


In [18]:

# Data preprocessing
print("Preprocessing the data...")
df['creation_time'] = pd.to_datetime(df['creation_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
df['duration'] = (df['end_time'] - df['creation_time']).dt.total_seconds()
print("Data preprocessing completed.")


Preprocessing the data...
Data preprocessing completed.


In [19]:

# Extract relevant features
print("Extracting relevant features...")
features = ['bytes_in', 'bytes_out', 'duration']
X = df[features].values
print("Feature extraction completed.")


Extracting relevant features...
Feature extraction completed.


In [20]:

# Normalize the features
print("Normalizing the features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Feature normalization completed.")

# Perform anomaly detection using Isolation Forest
print("Performing anomaly detection using Isolation Forest...")
clf = IsolationForest(contamination='auto')
y_pred = clf.fit_predict(X_scaled)
print("Anomaly detection completed.")


Normalizing the features...
Feature normalization completed.
Performing anomaly detection using Isolation Forest...
Anomaly detection completed.


In [21]:

# Identify anomalies
print("Identifying anomalies...")
anomalies = df.loc[y_pred == -1]
print("Anomaly identification completed.")

# Print the anomalies
print("Detected anomalies:")
print(anomalies[['creation_time', 'end_time', 'src_ip', 'dst_ip', 'bytes_in', 'bytes_out', 'duration']])


Identifying anomalies...
Anomaly identification completed.
Detected anomalies:
                creation_time                  end_time          src_ip  \
7   2024-04-25 23:00:00+00:00 2024-04-25 23:10:00+00:00   155.91.45.242   
13  2024-04-25 23:10:00+00:00 2024-04-25 23:20:00+00:00  136.226.64.114   
15  2024-04-25 23:10:00+00:00 2024-04-25 23:20:00+00:00  165.225.240.79   
18  2024-04-25 23:10:00+00:00 2024-04-25 23:20:00+00:00   155.91.45.242   
36  2024-04-25 23:30:00+00:00 2024-04-25 23:40:00+00:00   155.91.45.242   
43  2024-04-25 23:40:00+00:00 2024-04-25 23:50:00+00:00  165.225.240.79   
47  2024-04-25 23:40:00+00:00 2024-04-25 23:50:00+00:00   155.91.45.242   
49  2024-04-25 23:50:00+00:00 2024-04-26 00:00:00+00:00    165.225.33.6   
54  2024-04-25 23:50:00+00:00 2024-04-26 00:00:00+00:00   155.91.45.242   
64  2024-04-26 00:00:00+00:00 2024-04-26 00:10:00+00:00   155.91.45.242   
70  2024-04-26 00:10:00+00:00 2024-04-26 00:20:00+00:00  165.225.240.79   
73  2024-04-26 00:10:

In [22]:

# Evaluate the anomaly detection using a confusion matrix
print("Evaluating the anomaly detection performance...")
y_true = [0 if label == 1 else 1 for label in y_pred]
cm = confusion_matrix(y_true, y_pred)
print("Confusion matrix:")
print(cm)

tn = cm[0, 0]
fp = cm[0, 1]
fn = cm[1, 0]
tp = cm[1, 1]

print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")


Evaluating the anomaly detection performance...
Confusion matrix:
[[  0   0   0]
 [  0   0 239]
 [ 43   0   0]]
True Negatives: 0
False Positives: 0
False Negatives: 0
True Positives: 0


In [23]:

# Visualize the results using Plotly
print("Visualizing the results using Plotly...")
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['creation_time'], y=df['bytes_in'], mode='markers', marker=dict(color=y_pred, colorscale='Viridis')))
fig.add_trace(go.Scatter(x=anomalies['creation_time'], y=anomalies['bytes_in'], mode='markers', marker=dict(color='red', size=10)))
fig.update_layout(title='Anomaly Detection in Network Traffic', xaxis_title='Creation Time', yaxis_title='Bytes In')
fig.show()


Visualizing the results using Plotly...
