<a href="https://colab.research.google.com/github/RochesterYin/CSE60749/blob/main/SVR_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Load and pre processing

Credits here to the RON Dataset by MIT - http://nms.csail.mit.edu/ron/data/

To use this dataset in the notebook, you will need to install the zip files from the above mentioned website and copy the latency and bandwidth data files onto your drive under the folder named 'SVR'. After following these steps the notebook should work as intended.
These files are also included in the attachments on Canvas.

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import exp
plt.rcParams["figure.figsize"] = (10, 6)
import csv

# Download the dataset
# I saved my data in a fold named "SVR" in my Google drive
# Then I load the data by the following functions
import zipfile
from google.colab import drive
drive.mount('/content/gdrive')
data_path = "/content/gdrive/My Drive/SVR/"

# Load the data with specified header fields
bw_data = pd.read_csv(data_path+"ron1-bw", delimiter='\t', header=None, encoding='utf-8', quoting=csv.QUOTE_NONE)
bw_data.columns = ["source", "dest", "ron", "Time", "bytes", "Duration", "Dur_32", "Dur_64", "Dur_128"]


# Adding more data to see performance imporvments
bw_data_ron2 = pd.read_csv(data_path+"ron2-bw", delimiter='\t', header=None, encoding='utf-8', quoting=csv.QUOTE_NONE)
bw_data_ron2.columns = ["source", "dest", "ron", "Time", "bytes", "Duration", "Dur_32", "Dur_64", "Dur_128"]

# Combine the data from ron1-bw and ron2-bw
bw_data = pd.concat([bw_data, bw_data_ron2], ignore_index=True)

bw_data.head(10)

Mounted at /content/gdrive


Unnamed: 0,source,dest,ron,Time,bytes,Duration,Dur_32,Dur_64,Dur_128
0,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347
1,204.168.181.39,128.84.154.59,0,985276771,1048576,1.57,0.161472,0.216169,0.306361
2,204.168.181.39,128.84.154.59,1,985276772,1048576,1.602,0.155499,0.199846,0.329906
3,155.101.132.8,208.246.45.8,0,985277336,1048576,10.508,0.774487,1.42849,2.25999
4,155.101.132.8,208.246.45.8,1,985277347,1048576,11.625,0.656028,1.20546,1.89997
5,155.101.132.8,208.246.45.8,0,985277359,1048576,10.512,0.7755,1.43,2.26952
6,155.101.132.8,208.246.45.8,1,985277369,1048576,9.924,0.761437,1.40844,2.17494
7,128.2.181.105,128.84.154.59,0,985277414,1048576,1.587,0.19776,0.264579,0.356296
8,128.2.181.105,128.84.154.59,1,985277415,1048576,1.577,0.191822,0.241681,0.34573
9,128.2.181.105,128.84.154.59,0,985277417,1048576,1.566,0.19506,0.249121,0.350314


In [None]:
# Load the latency data with specified header fields
lat_data = pd.read_csv(data_path+"ron1-latency", delimiter='\t', header=None, encoding='utf-8', quoting=csv.QUOTE_NONE)
lat_data.columns = ["source", "dest", "ron", "send1_time", "rec1_time", "send2_time", "rec2_time"]

# Add more data to imporve performance
lat_data_ron2 = pd.read_csv(data_path+"ron2-latency", delimiter='\t', header=None, encoding='utf-8', quoting=csv.QUOTE_NONE)
lat_data_ron2.columns = ["source", "dest", "ron", "send1_time", "rec1_time", "send2_time", "rec2_time"]

# Combine the data from ron1-latency and ron2-latency
lat_data = pd.concat([lat_data, lat_data_ron2], ignore_index=True)

lat_data.head(10)

Unnamed: 0,source,dest,ron,send1_time,rec1_time,send2_time,rec2_time
0,18.31.0.144,128.2.181.105,2,985164100.0,,,
1,204.168.181.39,209.131.33.137,2,985164100.0,,,
2,204.168.181.39,18.31.0.144,1,985164100.0,985164100.0,985164100.0,985164100.0
3,128.84.154.59,128.2.181.105,2,985164100.0,,,
4,204.168.181.39,18.31.0.144,0,985164100.0,985164100.0,985164100.0,985164100.0
5,128.84.154.59,18.31.0.144,1,985164100.0,985164100.0,985164100.0,985164100.0
6,204.168.181.39,18.31.0.144,2,985164100.0,985164100.0,985164100.0,985164100.0
7,18.31.0.144,128.84.154.59,2,985164100.0,985164100.0,985164100.0,985164100.0
8,128.84.154.59,18.31.0.144,0,985164100.0,985164100.0,985164100.0,985164100.0
9,130.37.30.16,18.31.0.144,2,985164100.0,985164100.0,985164100.0,985164100.0


In [None]:
# Calculate TCP Throughput in bits per second
bw_data['TCP_Throughput_bps'] = (bw_data['bytes'] * 8) / bw_data['Duration']

# Convert bits per second to Megabits per second (Mbps)
bw_data['TCP_Throughput_Mbps'] = bw_data['TCP_Throughput_bps'] / 1e6

# Alternatively, you can add it as Kbps (Kilobits per second)
# data['TCP_Throughput_Kbps'] = data['TCP_Throughput_bps'] / 1e3
bw_data.head(10)

Unnamed: 0,source,dest,ron,Time,bytes,Duration,Dur_32,Dur_64,Dur_128,TCP_Throughput_bps,TCP_Throughput_Mbps
0,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242
1,204.168.181.39,128.84.154.59,0,985276771,1048576,1.57,0.161472,0.216169,0.306361,5343062.0,5.343062
2,204.168.181.39,128.84.154.59,1,985276772,1048576,1.602,0.155499,0.199846,0.329906,5236335.0,5.236335
3,155.101.132.8,208.246.45.8,0,985277336,1048576,10.508,0.774487,1.42849,2.25999,798306.8,0.798307
4,155.101.132.8,208.246.45.8,1,985277347,1048576,11.625,0.656028,1.20546,1.89997,721600.7,0.721601
5,155.101.132.8,208.246.45.8,0,985277359,1048576,10.512,0.7755,1.43,2.26952,798003.0,0.798003
6,155.101.132.8,208.246.45.8,1,985277369,1048576,9.924,0.761437,1.40844,2.17494,845285.0,0.845285
7,128.2.181.105,128.84.154.59,0,985277414,1048576,1.587,0.19776,0.264579,0.356296,5285827.0,5.285827
8,128.2.181.105,128.84.154.59,1,985277415,1048576,1.577,0.191822,0.241681,0.34573,5319346.0,5.319346
9,128.2.181.105,128.84.154.59,0,985277417,1048576,1.566,0.19506,0.249121,0.350314,5356710.0,5.35671


In [None]:
# Analysing some ron values to figure out strategies to join the data.
ron_value_counts_bw = bw_data['ron'].value_counts()
ron_value_counts_lat = lat_data['ron'].value_counts()

# Display the counts
print("bw_data:")
print(ron_value_counts_bw)

print("\nlat_data:")
print(ron_value_counts_lat)

bw_data:
1    8216
0    8183
Name: ron, dtype: int64

lat_data:
2    1884993
1    1884783
0    1883943
Name: ron, dtype: int64


In [None]:
import pandas as pd

# Merging in chunks because of computation limitations
# Create an empty DataFrame for the merged data
merged_data = pd.DataFrame()

chunk_size = 10000

# declaring columns used to join the dataframes
# Here 'ron' can be removed if we wanna combine data just based on source and destination
common_columns = ['source', 'dest', 'ron']

# Calculate the number of chunks needed
num_chunks = min(len(bw_data) // chunk_size + 1, len(lat_data) // chunk_size + 1)

# Iterate over chunks of data in both DataFrames
for i in range(num_chunks):
    chunk1 = bw_data.iloc[i * chunk_size: (i + 1) * chunk_size]
    chunk2 = lat_data.iloc[i * chunk_size: (i + 1) * chunk_size]

    # # Filter rows where 'ron' is 0 or 1. We can uncomment this if we need to just select data where ron == 0 or 1
    # filtered_chunk1 = chunk1[chunk1['ron'] == 1]
    # filtered_chunk2 = chunk2[chunk2['ron'] == 1]

    # Merge the filtered chunks based on common columns
    merged_chunk = pd.merge(chunk1, chunk2, on=common_columns, how='inner')

    # Append the merged chunk to the result DataFrame
    merged_data = pd.concat([merged_data, merged_chunk], ignore_index=True)

# The 'merged_data' DataFrame now contains the merged data
merged_data.head(10)


Unnamed: 0,source,dest,ron,Time,bytes,Duration,Dur_32,Dur_64,Dur_128,TCP_Throughput_bps,TCP_Throughput_Mbps,send1_time,rec1_time,send2_time,rec2_time
0,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164100.0,985164100.0,985164100.0,985164100.0
1,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164400.0,985164400.0,985164400.0,985164400.0
2,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164500.0,985164500.0,985164500.0,985164500.0
3,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0
4,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0
5,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0
6,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0
7,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0
8,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0
9,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164700.0,985164700.0,985164700.0,985164700.0


In [None]:
# Calculate round-trip latency
merged_data['round_trip_latency'] = (merged_data['rec1_time'] - merged_data['send1_time']) + (merged_data['rec2_time'] - merged_data['send2_time'])
merged_data.head(10)

Unnamed: 0,source,dest,ron,Time,bytes,Duration,Dur_32,Dur_64,Dur_128,TCP_Throughput_bps,TCP_Throughput_Mbps,send1_time,rec1_time,send2_time,rec2_time,round_trip_latency
0,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164100.0,985164100.0,985164100.0,985164100.0,0.008672
1,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164400.0,985164400.0,985164400.0,985164400.0,0.008965
2,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164500.0,985164500.0,985164500.0,985164500.0,0.008796
3,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0,0.008866
4,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0,0.009149
5,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0,0.008936
6,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0,0.00889
7,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0,0.009042
8,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164600.0,985164600.0,985164600.0,985164600.0,0.009046
9,204.168.181.39,128.84.154.59,1,985276769,1048576,1.953,0.497296,0.578922,0.669347,4295242.0,4.295242,985164700.0,985164700.0,985164700.0,985164700.0,0.009364


In [None]:
# List of columns to keep
columns_to_keep = ['source', 'dest', 'ron', 'bytes', 'Duration', 'TCP_Throughput_Mbps', 'round_trip_latency']

# Create a new DataFrame with only the selected columns
selected_data = merged_data[columns_to_keep]

# Rename the columns
selected_data = selected_data.rename(columns={'ron_x': 'ron_bw', 'ron_y': 'ron_lat'})

# 'selected_data' now contains only the specified columns
selected_data.head(10)

#print details of selected_data
total_rows = selected_data.shape[0]
print("Total number of rows:", total_rows)


# Print data types of each column
print("Data types:")
print(selected_data.dtypes)

# Print memory usage
print("\nMemory usage:")
print(selected_data.memory_usage(deep=True))


Total number of rows: 257811
Data types:
source                  object
dest                    object
ron                      int64
bytes                    int64
Duration               float64
TCP_Throughput_Mbps    float64
round_trip_latency     float64
dtype: object

Memory usage:
Index                       128
source                 18063623
dest                   18062966
ron                     2062488
bytes                   2062488
Duration                2062488
TCP_Throughput_Mbps     2062488
round_trip_latency      2062488
dtype: int64


In [None]:
import pandas as pd

# some more analysis
distinct_ron_values = selected_data['ron'].unique()
ron_value_counts = selected_data['ron'].value_counts()

print("Distinct RON values:", distinct_ron_values)
print("RON value counts:")
print(ron_value_counts)

Distinct RON values: [1 0]
RON value counts:
1    129235
0    128576
Name: ron, dtype: int64


# SVR Model Training


In [None]:
from sklearn.model_selection import train_test_split

X = selected_data.drop('TCP_Throughput_Mbps', axis=1)  # Features
y = selected_data['TCP_Throughput_Mbps']  # Target variable

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40000)

In [None]:
from sklearn.preprocessing import StandardScaler

# Extract numerical columns for scaling
numerical_columns = ['bytes', 'Duration', 'round_trip_latency']

# Use the StandardScaler only on the numerical columns
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])


In [None]:
# Perform one-hot encoding on categorical features
X_train_encoded = pd.get_dummies(X_train, columns=['source', 'dest'])
X_test_encoded = pd.get_dummies(X_test, columns=['source', 'dest'])

# Removing rows with NaN values from training and testing data as they were causing issues
X_train_encoded = X_train_encoded.dropna()
y_train = y_train[X_train_encoded.index]
X_test_encoded = X_test_encoded.dropna()
y_test = y_test[X_test_encoded.index]

from sklearn.svm import SVR

# Initialize the SVR model
svr_model = SVR(kernel='linear', C=1.0, epsilon=0.1)

# Define the chunk size
chunk_size = 1000

# Loop through your data in chunks
for start in range(0, len(X_train_encoded), chunk_size):
    end = start + chunk_size
    X_chunk = X_train_encoded[start:end]
    y_chunk = y_train[start:end]

    # Train the SVR model on the current chunk
    svr_model.fit(X_chunk, y_chunk)


In [None]:
# Make predictions on the testing set
y_pred = svr_model.predict(X_test_encoded)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)


Mean Absolute Error (MAE): 0.37119629050564723
Mean Squared Error (MSE): 0.7174049968227616
R-squared (R2): 0.4422607493207078


# Ensemble apprach

This approach tries to train two SVR models - one model based on data from ordinary internet traffic and another based on RON optimized paths. The predictions are then averaged between the two models.

In [None]:
import pandas as pd

# Create an empty DataFrame for the merged data
merged_data = pd.DataFrame()

chunk_size = 10000

# merge columns
common_columns = ['source', 'dest']

# Calculate the number of chunks needed
num_chunks = min(len(bw_data) // chunk_size + 1, len(lat_data) // chunk_size + 1)

# Iterate over chunks of data in both DataFrames
for i in range(num_chunks):
    chunk1 = bw_data.iloc[i * chunk_size: (i + 1) * chunk_size]
    chunk2 = lat_data.iloc[i * chunk_size: (i + 1) * chunk_size]

    # Filter rows where 'ron' is 0
    filtered_chunk1 = chunk1[chunk1['ron'] == 0]
    filtered_chunk2 = chunk2[chunk2['ron'] == 0]

    # Merge the filtered chunks based on common columns
    merged_chunk = pd.merge(filtered_chunk1, filtered_chunk2, on=common_columns, how='inner')

    # Append the merged chunk to the result DataFrame
    df1_merged = pd.concat([merged_data, merged_chunk], ignore_index=True)

# The 'merged_data' DataFrame now contains the merged data where 'ron' is 0 in both datasets
df1_merged.head(10)


Unnamed: 0,source,dest,ron_x,Time,bytes,Duration,Dur_32,Dur_64,Dur_128,TCP_Throughput_bps,TCP_Throughput_Mbps,ron_y,send1_time,rec1_time,send2_time,rec2_time
0,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165500.0,985165500.0,985165500.0,985165500.0
1,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165500.0,985165500.0,985165500.0,985165500.0
2,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165600.0,985165600.0,985165600.0,985165600.0
3,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165600.0,985165600.0,985165600.0,985165600.0
4,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165600.0,985165600.0,985165600.0,985165600.0
5,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165600.0,985165600.0,985165600.0,985165600.0
6,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165700.0,985165700.0,985165700.0,985165700.0
7,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165700.0,985165700.0,985165700.0,985165700.0
8,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165700.0,985165700.0,985165700.0,985165700.0
9,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165800.0,985165800.0,985165800.0,985165800.0


In [None]:
import pandas as pd

# Create an empty DataFrame for the merged data
merged_data = pd.DataFrame()

chunk_size = 10000

# merge columns
common_columns = ['source', 'dest']

# Calculate the number of chunks needed
num_chunks = min(len(bw_data) // chunk_size + 1, len(lat_data) // chunk_size + 1)

# Iterate over chunks of data in both DataFrames
for i in range(num_chunks):
    chunk1 = bw_data.iloc[i * chunk_size: (i + 1) * chunk_size]
    chunk2 = lat_data.iloc[i * chunk_size: (i + 1) * chunk_size]

    # Filter rows where 'ron' is 0
    filtered_chunk1 = chunk1[chunk1['ron'] == 1]
    filtered_chunk2 = chunk2[chunk2['ron'] == 1]

    # Merge the filtered chunks based on common columns
    merged_chunk = pd.merge(filtered_chunk1, filtered_chunk2, on=common_columns, how='inner')

    # Append the merged chunk to the result DataFrame
    df2_merged = pd.concat([merged_data, merged_chunk], ignore_index=True)

# The 'merged_data' DataFrame now contains the merged data where 'ron' is 0 in both datasets
df2_merged.head(10)


Unnamed: 0,source,dest,ron_x,Time,bytes,Duration,Dur_32,Dur_64,Dur_128,TCP_Throughput_bps,TCP_Throughput_Mbps,ron_y,send1_time,rec1_time,send2_time,rec2_time
0,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165400.0,985165400.0,985165400.0,985165400.0
1,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165600.0,985165600.0,985165600.0,985165600.0
2,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165600.0,985165600.0,985165600.0,985165600.0
3,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165700.0,985165700.0,985165700.0,985165700.0
4,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165800.0,985165800.0,985165800.0,985165800.0
5,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165800.0,985165800.0,985165800.0,985165800.0
6,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165800.0,985165800.0,985165800.0,985165800.0
7,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165800.0,985165800.0,985165800.0,985165800.0
8,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165900.0,985165900.0,985165900.0,985165900.0
9,128.2.181.105,206.197.119.141,1,989435774,1048576,5.077,0.440827,0.664577,1.05987,1652277.0,1.652277,1,985165900.0,985165900.0,985165900.0,985165900.0


In [None]:
# Calculate round-trip latency
df1_merged['round_trip_latency'] = (df1_merged['rec1_time'] - df1_merged['send1_time']) + (df1_merged['rec2_time'] - df1_merged['send2_time'])
df2_merged['round_trip_latency'] = (df2_merged['rec1_time'] - df2_merged['send1_time']) + (df2_merged['rec2_time'] - df2_merged['send2_time'])
# df2.head(10)
df1_merged.head(10)

Unnamed: 0,source,dest,ron_x,Time,bytes,Duration,Dur_32,Dur_64,Dur_128,TCP_Throughput_bps,TCP_Throughput_Mbps,ron_y,send1_time,rec1_time,send2_time,rec2_time,round_trip_latency
0,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165500.0,985165500.0,985165500.0,985165500.0,0.060213
1,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165500.0,985165500.0,985165500.0,985165500.0,0.061272
2,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165600.0,985165600.0,985165600.0,985165600.0,0.059715
3,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165600.0,985165600.0,985165600.0,985165600.0,0.059344
4,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165600.0,985165600.0,985165600.0,985165600.0,0.0641
5,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165600.0,985165600.0,985165600.0,985165600.0,0.059705
6,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165700.0,985165700.0,985165700.0,985165700.0,0.061354
7,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165700.0,985165700.0,985165700.0,985165700.0,0.060385
8,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165700.0,985165700.0,985165700.0,985165700.0,0.060188
9,128.2.181.105,206.197.119.141,0,989435769,1048576,5.154,0.458592,0.685864,1.07193,1627592.0,1.627592,0,985165800.0,985165800.0,985165800.0,985165800.0,0.06914


In [None]:
# List of columns to keep
columns_to_keep = ['source', 'dest', 'ron_x', 'bytes', 'Duration', 'TCP_Throughput_Mbps', 'round_trip_latency']

# Create a new DataFrame with only the selected columns
df1 = df1_merged[columns_to_keep]
df2 = df2_merged[columns_to_keep]

# Rename the columns
df1 = df1.rename(columns={'ron_x': 'ron_type'})
df2 = df2.rename(columns={'ron_x': 'ron_type'})

# 'selected_data' now contains only the specified columns
df1.head(10)

Unnamed: 0,source,dest,ron_type,bytes,Duration,TCP_Throughput_Mbps,round_trip_latency
0,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.060213
1,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.061272
2,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.059715
3,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.059344
4,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.0641
5,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.059705
6,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.061354
7,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.060385
8,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.060188
9,128.2.181.105,206.197.119.141,0,1048576,5.154,1.627592,0.06914


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Training the first SVR model
X = df1.drop('TCP_Throughput_Mbps', axis=1)  # Features
y = df1['TCP_Throughput_Mbps']  # Target variable

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=40000)

# Extract numerical columns for scaling
numerical_columns = ['bytes', 'Duration', 'round_trip_latency']

# Use the StandardScaler only on the numerical columns
scaler = StandardScaler()
X_train1[numerical_columns] = scaler.fit_transform(X_train1[numerical_columns])
X_test1[numerical_columns] = scaler.transform(X_test1[numerical_columns])

# Perform one-hot encoding on categorical features
X_train1_encoded = pd.get_dummies(X_train1, columns=['source', 'dest'])
X_test1_encoded = pd.get_dummies(X_test1, columns=['source', 'dest'])

# Remove rows with NaN values from training and testing data as they were causing issues
X_train1_encoded = X_train1_encoded.dropna()
y_train1 = y_train1[X_train1_encoded.index]
X_test1_encoded = X_test1_encoded.dropna()
y_test1 = y_test1[X_test1_encoded.index]

from sklearn.svm import SVR

# Initialize the SVR model
svr_model1 = SVR(kernel='linear', C=1.0, epsilon=0.1)
chunk_size = 1000

# Loop through your data in chunks
for start in range(0, len(X_train1_encoded), chunk_size):
    end = start + chunk_size
    X_chunk = X_train1_encoded[start:end]
    y_chunk = y_train1[start:end]
    svr_model1.fit(X_chunk, y_chunk)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Training the second SVR model
X = df2.drop('TCP_Throughput_Mbps', axis=1)  # Features
y = df2['TCP_Throughput_Mbps']  # Target variable

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=40000)

# Extract numerical columns for scaling
numerical_columns = ['bytes', 'Duration', 'round_trip_latency']

# Use the StandardScaler only on the numerical columns
scaler = StandardScaler()
X_train2[numerical_columns] = scaler.fit_transform(X_train2[numerical_columns])
X_test2[numerical_columns] = scaler.transform(X_test2[numerical_columns])

# Perform one-hot encoding on categorical features
X_train2_encoded = pd.get_dummies(X_train2, columns=['source', 'dest'])
X_test2_encoded = pd.get_dummies(X_test2, columns=['source', 'dest'])

# Remove rows with NaN values from training and testing data as they were causing issues
X_train2_encoded = X_train2_encoded.dropna()
y_train2 = y_train2[X_train2_encoded.index]
X_test2_encoded = X_test2_encoded.dropna()
y_test2 = y_test2[X_test2_encoded.index]

from sklearn.svm import SVR

# Initialize the SVR model
svr_model2 = SVR(kernel='linear', C=1.0, epsilon=0.1)

# Define the chunk size
chunk_size = 1000

# Loop through your data in chunks
for start in range(0, len(X_train2_encoded), chunk_size):
    end = start + chunk_size
    X_chunk = X_train2_encoded[start:end]
    y_chunk = y_train2[start:end]
    svr_model2.fit(X_chunk, y_chunk)


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Combine the test sets and true labels
X_test_combined = np.concatenate([X_test1_encoded, X_test2_encoded])
y_test_combined = np.concatenate([y_test1, y_test2])

# Make predictions on the combined testing set
y_pred1 = svr_model1.predict(X_test_combined)
y_pred2 = svr_model2.predict(X_test_combined)

weight_model1 = 0.6 # Giving this more weight as it is having more significance
weight_model2 = 0.4

# Create ensemble prediction
ensemble_pred = (weight_model1 * y_pred1) + (weight_model2 * y_pred2)

# Calculate metrics
ensemble_mse = mean_squared_error(ensemble_pred, y_test_combined)
ensemble_mae = mean_absolute_error(ensemble_pred, y_test_combined)
ensemble_r2 = r2_score(y_test_combined, ensemble_pred)

print(f"Ensemble Mean Squared Error: {ensemble_mse}")
print(f"Ensemble Mean Absolute Error: {ensemble_mae}")
print(f"Ensemble R-squared: {ensemble_r2}")




Ensemble Mean Squared Error: 0.5984432697361073
Ensemble Mean Absolute Error: 0.33866780197903346
Ensemble R-squared: 0.47665148513847844


# Future Work


Different methods can be built on top of the existing ensemble method in order to make better predictions such as -  
Ensemble with diverse ML models
SVC models for path classification