In [1]:
# Load the libraries
import pandas as pd
import numpy  as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import warnings
import platform
import sys
# Suppress DeprecationWarning, then install a blanket "ignore" filter;
# simplefilter() with no category applies to every Warning subclass.
# NOTE(review): the blanket filter also hides pandas/sklearn warnings - confirm that is intended.
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.simplefilter("ignore")

# Render figures inline and use a wide 16x5 default figure size
%matplotlib inline
plt.rcParams["figure.figsize"] = [16, 5]

In [2]:
# Report the platform and the library versions this notebook was executed with
environment_report = [
    f"Python Platform: {platform.platform()}",
    f"Python {sys.version}",
    f"Pandas {pd.__version__}",
    f"Scikit-Learn {sk.__version__}",
]
print("\n".join(environment_report))

Python Platform: macOS-14.2.1-arm64-arm-64bit
Python 3.10.11 (main, Apr  7 2023, 07:24:53) [Clang 14.0.0 (clang-1400.0.29.202)]
Pandas 2.0.1
Scikit-Learn 1.4.0


In [3]:
# Load the pickled latency measurements used for the experiments
latency_df = pd.read_pickle('latency_withprobe.pickle')

# Flag rows with a missing last_rtt once, keep them aside in nan_rows for
# inspection, and continue with the rows that do have a value
rtt_missing = latency_df['last_rtt'].isna()
nan_rows = latency_df[rtt_missing]
latency_df = latency_df[~rtt_missing]

# Number of rows that were dropped because last_rtt was null
len(nan_rows)

0

In [4]:
# Keep only the rows whose last_rtt is strictly below 100 (values of 100 and above are dropped)
below_cutoff = latency_df['last_rtt'].lt(100)
latency_df = latency_df.loc[below_cutoff]
# Row count remaining after the cutoff
latency_df.shape[0]

1238890

In [5]:
# Take an independent (deep) copy so the modelling steps do not modify latency_df
model_df = latency_df.copy(deep=True)
# List the available columns
model_df.columns

Index(['fw', 'lts', 'proto', 'af', 'size', 'paris_id', 'prb_id', 'msm_name',
       'destination_ip_responded', 'dst_id', 'src_names', 'distance',
       'source_longitude', 'source_latitude', 'destination_longitude',
       'destination_latitude', 'date', 'last_rtt', 'hop_count', 'ASN_source',
       'CountryCode_source', 'source_status', 'Anchor_source',
       'Latitude_source', 'Longitude_source', 'Public_source',
       'Uptime(days)_source', 'ASN_destination', 'CountryCode_destination',
       'destination_status', 'Anchor_destination', 'Latitude_destination',
       'Longitude_destination', 'Public_destination',
       'Uptime(days)_destination', 'day_of_week', 'hour_of_day',
       'minute_of_hour', 'source_status_change(days)',
       'destination_status_change(days)', 'norm_timestamp',
       'norm_storedtimestamp', 'src_Octet1', 'src_Octet2', 'src_Octet3',
       'src_Octet4', 'src_Mask', 'dst_Octet1', 'dst_Octet2', 'dst_Octet3',
       'dst_Octet4', 'dst_Mask'],
      dtype

In [6]:
# Split the data by (prb_id, dst_id) pair: a random sample of pairs is held out
# for testing and every remaining pair is used for training.
import itertools
import random

# Number of (prb_id, dst_id) pairs held out for the test set.
N_TEST_PAIRS = 10
# Seed for the pair selection so that Restart & Run All reproduces the same
# split; set to None to draw a different split on every run.
SPLIT_SEED = 42

# Not populated in this cell; retained so any existing reference keeps working.
test_indices = []
train_indices = []


def _pair_slices(frame, id_pairs):
    """Return one sub-frame of ``frame`` per (prb_id, dst_id) pair.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``prb_id`` and ``dst_id``.
    id_pairs : iterable of (prb_id, dst_id) tuples
        Pairs to select, in the order the slices should be returned.

    Returns
    -------
    list of pandas.DataFrame
        The rows matching each pair; a slice is empty when the pair has no rows.
    """
    return [
        frame.loc[(frame['prb_id'] == probe_id) & (frame['dst_id'] == dest_id)]
        for probe_id, dest_id in id_pairs
    ]


array1 = model_df['prb_id'].unique()
array2 = model_df['dst_id'].unique()

# Every probe/destination combination (a combination may have no rows in model_df)
pairs = list(itertools.product(array1, array2))

# Use a dedicated generator so seeding does not disturb the global `random` state.
# sample() raises ValueError when fewer than N_TEST_PAIRS pairs exist.
selected_pairs = random.Random(SPLIT_SEED).sample(pairs, N_TEST_PAIRS)

# Drop the held-out pairs from the candidate list, preserving the original order
held_out = set(selected_pairs)
pairs = [pair for pair in pairs if pair not in held_out]

# Creating separate lists
selected_list = selected_pairs
remaining_list = pairs

# Gather the per-pair slices, then concatenate each split exactly once
train_dfs = _pair_slices(model_df, remaining_list)
test_dfs = _pair_slices(model_df, selected_list)

train_df = pd.concat(train_dfs)
# The test frame must be built from the held-out slices (test_dfs); building it
# from train_dfs would make the test set a copy of the training data.
test_df = pd.concat(test_dfs)

In [7]:
# Display the (prb_id, dst_id) pairs that were randomly sampled in the previous cell
selected_pairs

[(15118, 15632),
 (55787, 15632),
 (51265, 1004200),
 (51265, 30381),
 (55787, 14866),
 (51265, 26072),
 (55787, 1004200),
 (61357, 1004200),
 (51265, 15632),
 (15118, 1004200)]

In [8]:
# Display the (prb_id, dst_id) pairs left after the sampled ones were removed
remaining_list

[(15118, 14866),
 (15118, 30381),
 (15118, 26072),
 (33627, 14866),
 (33627, 1004200),
 (33627, 15632),
 (33627, 30381),
 (33627, 26072),
 (51265, 14866),
 (55787, 30381),
 (55787, 26072),
 (61357, 14866),
 (61357, 15632),
 (61357, 30381),
 (61357, 26072)]

In [9]:
# Z-score statistics are computed on the training split only and then reused
# for the test split, so both splits are scaled with the same mean and std
train_rtt = train_df['last_rtt']
trainrtt_mean, trainrtt_std = train_rtt.mean(), train_rtt.std()

train_distance = train_df['distance']
traindist_mean, traindist_std = train_distance.mean(), train_distance.std()

# Add the normalised rtt and distance columns to each split
for split_df in (train_df, test_df):
    split_df['normalizzed_rtt'] = split_df['last_rtt'].sub(trainrtt_mean).div(trainrtt_std)
    split_df['normalizzed_distance'] = split_df['distance'].sub(traindist_mean).div(traindist_std)


In [10]:
# Check for NaN values in the normalised rtt AND distance columns.
# A row is reported when either normalised column is null (for example when a
# source value was missing or a training std of 0 produced 0/0).
normalised_cols = ['normalizzed_rtt', 'normalizzed_distance']
nan_train = train_df[train_df[normalised_cols].isnull().any(axis=1)]
nan_test = test_df[test_df[normalised_cols].isnull().any(axis=1)]
# Prints the number of affected rows in the train and test splits
print(len(nan_train),len(nan_test))

0 0


In [11]:
# Persist both splits as pickle files in the current working directory
for split_frame, pickle_path in (
    (train_df, 'train_df.pickle'),
    (test_df, 'test_df.pickle'),
):
    split_frame.to_pickle(pickle_path)