In [None]:
import pandas as pd
import numpy as np

# Synthetic network-traffic dataset generator.
# Draws independent random features, derives `bytes_sent` from them as a sum of
# additive effects plus Gaussian noise, scales a random subset of rows into
# outliers, and writes everything to a CSV file.

# Seed the global NumPy RNG so every run yields the same data
np.random.seed(42)

# Number of rows to generate
n_rows = 1000

# Destination file, defined once so the save call and the message cannot drift apart
output_path = 'network_traffic_dataset.csv'

# Features (the order of the random draws is significant for reproducibility)
duration_sec = np.random.uniform(1, 20, n_rows)  # Flow duration in seconds
packet_count = np.random.randint(50, 150, n_rows)  # Number of packets (upper bound exclusive)
latency_ms = np.random.uniform(10, 100, n_rows)  # Latency in ms
protocol = np.random.choice(['TCP', 'UDP', 'ICMP'], n_rows, p=[0.6, 0.3, 0.1])  # Protocol type
device_type = np.random.choice(['IoT', 'Server', 'Router', 'Workstation'], n_rows, p=[0.4, 0.3, 0.2, 0.1])  # Device type
location = np.random.choice(['Data Center', 'Remote'], n_rows, p=[0.5, 0.5])  # Location
event_type = np.random.choice(['No Event', 'Login Fail', 'Malware Detect', 'Firewall Block'], n_rows, p=[0.7, 0.1, 0.1, 0.1])  # Security event
# Severity is sampled independently of event_type; 'None' here is a string label
severity = np.random.choice(['None', 'Low', 'Medium', 'High'], n_rows, p=[0.7, 0.1, 0.1, 0.1])  # Severity level

# Target: bytes_sent, influenced by features
# Base: linear in duration, packets and latency, plus Gaussian noise (sd = 500)
bytes_sent = (duration_sec * 300) + (packet_count * 50) + (latency_ms * 10) + np.random.normal(0, 500, n_rows)

# Additive offsets for selected categories (e.g., TCP higher bytes, High severity spikes)
bytes_sent += np.where(protocol == 'TCP', 1000, 0)
bytes_sent += np.where(protocol == 'UDP', 500, 0)
bytes_sent += np.where(device_type == 'Server', 1500, 0)
bytes_sent += np.where(severity == 'High', 2000, 0)
bytes_sent += np.where(event_type == 'Malware Detect', 1000, 0)

# Outliers: each row is flagged with probability 0.05 and multiplied by a factor in [5, 10)
outlier_mask = np.random.choice([True, False], n_rows, p=[0.05, 0.95])
# ndarray.sum() counts the flagged rows without iterating element by element in Python
bytes_sent[outlier_mask] *= np.random.uniform(5, 10, outlier_mask.sum())

# Create DataFrame
df = pd.DataFrame({
    'duration_sec': duration_sec,
    'packet_count': packet_count,
    'latency_ms': latency_ms,
    'protocol': protocol,
    'device_type': device_type,
    'location': location,
    'event_type': event_type,
    'severity': severity,
    'bytes_sent': bytes_sent
})

# Save to CSV (without the index column) and report the actual file name and row count
df.to_csv(output_path, index=False)
print(f"Dataset saved as '{output_path}' with {len(df)} rows.")

In [None]:
# Show the current working directory (IPython `pwd` magic; relies on automagic since there is no `%` prefix)
pwd

In [None]:
# Machine learning: load the dataset, explore it, and fit a regression model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from the CSV file into `df`
df = pd.read_csv('network_traffic_dataset.csv')

In [None]:
# Preview the first three rows
df.head(3)

In [None]:
# Count missing values in each column
df.isna().sum()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Frequency of each severity label
df['severity'].value_counts()

In [None]:
# Replace missing `severity` values with the literal label 'None'.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df.severity`: that form is a chained in-place
# modification, which pandas warns about and which leaves `df` unchanged when
# Copy-on-Write is enabled.
# NOTE(review): the missing values presumably come from read_csv treating the
# string 'None' as an NA marker — confirm for the installed pandas version.
df['severity'] = df['severity'].fillna('None')

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().transpose()

In [None]:
# Line plot of packet_count in row order
plt.plot(df['packet_count'])

In [None]:
# Boxen (letter-value) plot of the target column
# NOTE(review): the Series is passed positionally; how seaborn interprets a
# positional argument (and hence the plot orientation) may differ between
# versions — consider passing it explicitly as x= or y=.
sns.boxenplot(df['bytes_sent'])

In [None]:
# List the column names
df.columns

In [None]:
# Scatter of bytes sent against packet count, drawn through an explicit Axes
fig, ax = plt.subplots()
ax.scatter(df['packet_count'], df['bytes_sent'])
ax.set_xlabel('Packets')
ax.set_ylabel('Bytes sent')
plt.show()

In [None]:
# bytes_sent per event type, with one box per protocol inside each event type
sns.boxplot(data=df, x='event_type', y='bytes_sent', hue='protocol')

In [None]:
# Assume a detailed EDA (univariate and bivariate) has already been done

In [None]:
# Preview the first two rows
df.head(2)

In [None]:
# Distribution of the target variable

In [None]:
# Histogram of the target with a KDE overlay.
# `sns.distplot` is deprecated; `histplot` with `kde=True` and density
# normalisation is its documented replacement for this kind of plot.
sns.histplot(df['bytes_sent'], kde=True, stat='density')

In [None]:
# Box plot of the target column
# NOTE(review): the Series is passed positionally; how seaborn interprets a
# positional argument (and hence the plot orientation) may differ between
# versions — consider passing it explicitly as x= or y=.
sns.boxplot(df.bytes_sent)

In [None]:
# Linear regression

In [None]:
# Assumption 1: every x variable should have a linear relationship with the y variable

In [None]:
# List the column names (reference for the column selection in the next cell)
df.columns

In [None]:
# Pairwise correlation of the numeric features and the target.
# Columns are selected by name rather than by position so the selection
# stays correct if the column order ever changes.
df[['duration_sec', 'packet_count', 'latency_ms', 'bytes_sent']].corr()

In [None]:
# Annotated heatmap of the correlations between the numeric features and the target.
# Columns are selected by name rather than by position so the selection
# stays correct if the column order ever changes.
sns.heatmap(df[['duration_sec', 'packet_count', 'latency_ms', 'bytes_sent']].corr(), annot=True)

In [None]:
# y = mx+c

In [None]:
# List the column names (reference for the feature/target split in the next cells)
df.columns

In [None]:
# Feature matrix: every column except the target.
# Dropping the target by name avoids relying on it being the 9th column.
x = df.drop(columns='bytes_sent')

In [None]:
# Target vector
y = df['bytes_sent']

In [None]:
# List the column names (reference for the grouping and encoding cells below)
df.columns

In [None]:
# Mean, min, max and standard deviation of bytes_sent for each protocol
df.groupby('protocol')[['bytes_sent']].agg(['mean', 'min', 'max', 'std'])

In [None]:
# One-hot encode the categorical features; drop_first removes the first level
# of each column so the remaining indicators are measured against it.
categorical_cols = ['protocol', 'device_type', 'location', 'event_type', 'severity']
x = pd.get_dummies(x, columns=categorical_cols, drop_first=True)

In [None]:
# Preview the first two rows of the encoded feature matrix
x.head(2)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Preview the first three training rows
xtrain.head(3)

In [None]:
# Box plot of the target values in the test split
# NOTE(review): the Series is passed positionally; how seaborn interprets a
# positional argument (and hence the plot orientation) may differ between
# versions — consider passing it explicitly as x= or y=.
sns.boxplot(ytest)

In [None]:
# (rows, columns) of the training feature matrix
xtrain.shape

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator created with its default settings
lr = LinearRegression()

In [None]:
# Fit on the training split; the object returned by fit is kept as lr_model
lr_model = lr.fit(X=xtrain, y=ytrain)

In [None]:
# Fitted coefficients of the model
lr_model.coef_

In [None]:
# Preview the first two training rows (column order for reading the coefficients)
xtrain.head(2)

In [None]:
# Scatter of bytes sent against flow duration
plt.scatter(x=df['duration_sec'], y=df['bytes_sent'])

In [None]:
# Predictions for the held-out test features
pred = lr_model.predict(xtest)

In [None]:
# Actual and predicted values side by side, indexed like ytest
error = ytest.to_frame(name='Actual').assign(Predicted=pred)

In [None]:
# Replace the index carried over from ytest with a fresh 0..n-1 index
error = error.reset_index(drop=True)

In [None]:
# Line plot of the actual and predicted values.
# The size is passed to DataFrame.plot directly: a preceding
# plt.figure(figsize=...) only creates a separate empty figure, because
# DataFrame.plot opens its own figure when no Axes is supplied.
error.plot(figsize=(15, 15))

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Preview the first five actual/predicted pairs
error.head(5)