In [1]:
# In this notebook we add all the Probe data to the dataset
# https://atlas.ripe.net/docs/apis/rest-api-reference/#probes

# Load the libraries
import pandas as pd
import numpy  as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import warnings
import platform
import sys
# Silence DeprecationWarning messages raised by the imported libraries.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# NOTE: simplefilter("ignore") is called without a category, so it suppresses
# every warning class (which makes the DeprecationWarning filter above redundant).
# Real problems reported as warnings will not be shown in this notebook.
warnings.simplefilter("ignore")

# Render figures inline and use a wide 16x5 default figure size for all plots.
%matplotlib inline
plt.rcParams["figure.figsize"] = [16, 5]

In [2]:
# Record the platform, interpreter and key library versions used for this run
environment_report = (
    f"Python Platform: {platform.platform()}",
    f"Python {sys.version}",
    f"Pandas {pd.__version__}",
    f"Scikit-Learn {sk.__version__}",
)
print("\n".join(environment_report))

Python Platform: macOS-14.2.1-arm64-arm-64bit
Python 3.10.11 (main, Apr  7 2023, 07:24:53) [Clang 14.0.0 (clang-1400.0.29.202)]
Pandas 2.0.1
Scikit-Learn 1.4.0


##### What are your features?
### Traceroute Measurement Result
- **af**: 4 or 6 (integer)
- **dst_addr**: IP address of the destination (string)
- **dst_name**: Name of the destination (string)
- **endtime**: Unix timestamp for the end of measurement (int)
- **from**: IP address of the probe as known by the controller (string)
- **msm_id**: Measurement identifier (int)
- **paris_id**: Variation for the Paris mode of traceroute (int)
- **prb_id**: Source probe ID (int)
- **proto**: "UDP" or "ICMP" (string)
- **result**: List of hop elements (array)

#### Hop Element
- **hop**: Hop number (int)
- **error**: [Optional] Error description when an error occurs trying to send a packet. No result structure in this case. (string)
- **result**: Variable content, depending on the type of response (array)

##### Case: Timeout
- **x**: "*"

##### Case: Reply
- **err**: (Optional) Error ICMP: "N" (network unreachable), "H" (destination unreachable), "A" (administratively prohibited), "P" (protocol unreachable), "p" (port unreachable) (string)
- **from**: IPv4 or IPv6 source address in reply (string)
- **ittl**: (Optional) Time-to-live in the packet that triggered the error ICMP. Omitted if equal to 1 (int)
- **late**: (Optional) Number of packets a reply is late; in this case, RTT is not present (int)
- **mtu**: (Optional) Path MTU from a packet too big ICMP (int)
- **rtt**: Round-trip-time of the reply, not present when the response is late (float)
- **size**: Size of the reply (int)
- **ttl**: Time-to-live in the reply (int)
- **icmpext**: [Optional] Information when an ICMP header is found in the reply (object)
  - **version**: RFC4884 version (int)
  - **rfc4884**: "1" if length indication is present, "0" otherwise (int)
  - **obj**: Elements of the object (array)
    - **class**: RFC4884 class (int)
    - **type**: RFC4884 type (int)

- **size**: Packet size (int)
- **src_addr**: Source address used by the probe (string)
- **timestamp**: Unix timestamp for the start of measurement (int)
- **type**: "traceroute" (string)
- **fw**: Firmware version of the probe (int)
- **mver**: Version of measurement code. Format: "x.y.z" (string)
- **lts**: Last time synchronized. How long ago (in seconds) was the probe’s clock in sync with that of a controller. Value -1 indicates the probe doesn't know if it's in sync (int)
- **msm_name**: Measurement type, "Traceroute" for these results (string)
- **stored_timestamp**: Time when the measurement results were stored or recorded by RIPE Atlas servers.




In [3]:
# Load the two pickled dataframes used in this notebook:
#   latency_df - traceroute rows without probe metadata (includes last_rtt, dst_id, distance)
#   probe_df   - per-probe metadata that is joined onto the traceroute rows further down
# NOTE: unpickling can execute arbitrary code; only load pickle files you created or trust.
latency_path, probe_path = 'latency_noprobe.pickle', 'probe_data.pickle'
latency_df = pd.read_pickle(latency_path)
probe_df = pd.read_pickle(probe_path)

In [4]:
# Set aside the rows with a missing 'last_rtt' in nan_rows (kept for inspection),
# then continue with only the rows that have a value.
rtt_missing = latency_df['last_rtt'].isnull()
nan_rows = latency_df[rtt_missing]
latency_df = latency_df[~rtt_missing]

In [5]:
# Keep only the rows whose last_rtt is strictly below 100 (milliseconds, per the
# notebook's notes); rows at or above the cutoff are discarded.
rtt_below_cutoff = latency_df['last_rtt'].lt(100)
latency_df = latency_df[rtt_below_cutoff]
latency_df.columns

Index(['fw', 'mver', 'lts', 'endtime', 'dst_name', 'dst_addr', 'src_addr',
       'proto', 'af', 'size', 'paris_id', 'result', 'msm_id', 'prb_id',
       'timestamp', 'msm_name', 'from', 'type', 'group_id', 'stored_timestamp',
       'destination_ip_responded', 'new_time', 'dst_id', 'dst_names',
       'src_names', 'Hour', 'distance', 'hop', 'hop_ip', 'rtt', 'unique_ips',
       'avg_rtt', 'source_longitude', 'source_latitude',
       'destination_longitude', 'destination_latitude', 'date', 'last_rtt'],
      dtype='object')

In [6]:
# hop_count: for every group of rows sharing the same index label, broadcast the
# last (non-null) 'hop' value of that group to all of its rows.
# (presumably one index label corresponds to one traceroute - verify upstream)
last_hop_per_label = latency_df['hop'].groupby(latency_df.index).transform('last')
latency_df['hop_count'] = last_hop_per_label

In [7]:
# Spot-check: show hop and hop_count for the rows carrying index label 2
q = latency_df.loc[latency_df.index == 2]
q.loc[:, ['hop', 'hop_count']]

Unnamed: 0,hop,hop_count
2,1,8
2,2,8
2,3,8
2,4,8
2,5,8
2,6,8
2,7,8
2,8,8


In [8]:
# Remove the per-hop helper columns that were only needed to derive last_rtt;
# the traceroute RTT used from here on lives in the 'last_rtt' column.
latency_df = latency_df.drop(
    columns=[
        'result', 'Hour', 'hop', 'hop_ip', 'rtt',
        'avg_rtt', 'unique_ips', 'dst_names', 'type', 'dst_name',
    ]
)

In [9]:
# Attach the source probe's metadata: left-join probe_df on prb_id == ProbeID so
# every traceroute row is kept, then label the probe's Status as the source status.
source_merged_df = (
    latency_df
    .merge(probe_df, how='left', left_on='prb_id', right_on='ProbeID')
    .rename(columns={'Status': 'source_status'})
)
source_merged_df.columns

Index(['fw', 'mver', 'lts', 'endtime', 'dst_addr', 'src_addr', 'proto', 'af',
       'size', 'paris_id', 'msm_id', 'prb_id', 'timestamp', 'msm_name', 'from',
       'group_id', 'stored_timestamp', 'destination_ip_responded', 'new_time',
       'dst_id', 'src_names', 'distance', 'source_longitude',
       'source_latitude', 'destination_longitude', 'destination_latitude',
       'date', 'last_rtt', 'hop_count', 'ProbeID', 'ASN', 'CountryCode',
       'IPAddress', 'source_status', 'Anchor', 'Latitude', 'Longitude', 'Tags',
       'Public', 'Since', 'Uptime', 'FirstConnected', 'Prefix_v4',
       'LastConnected', 'Uptime(days)'],
      dtype='object')

In [10]:
# Attach the destination probe's metadata.
# dst_id is cast to int64 first (presumably to match the dtype of ProbeID - confirm).
source_merged_df = source_merged_df.astype({'dst_id': 'int64'})

# Left-join probe_df a second time, now on dst_id == ProbeID. Probe columns that
# clash with the ones added by the source join get the _source/_destination suffixes;
# 'Status' no longer clashes (the source one was renamed), so it is renamed explicitly.
destination_merged_df = (
    source_merged_df
    .merge(
        probe_df,
        how='left',
        left_on='dst_id',
        right_on='ProbeID',
        suffixes=('_source', '_destination'),
    )
    .rename(columns={'Status': 'destination_status'})
)
destination_merged_df.columns

Index(['fw', 'mver', 'lts', 'endtime', 'dst_addr', 'src_addr', 'proto', 'af',
       'size', 'paris_id', 'msm_id', 'prb_id', 'timestamp', 'msm_name', 'from',
       'group_id', 'stored_timestamp', 'destination_ip_responded', 'new_time',
       'dst_id', 'src_names', 'distance', 'source_longitude',
       'source_latitude', 'destination_longitude', 'destination_latitude',
       'date', 'last_rtt', 'hop_count', 'ProbeID_source', 'ASN_source',
       'CountryCode_source', 'IPAddress_source', 'source_status',
       'Anchor_source', 'Latitude_source', 'Longitude_source', 'Tags_source',
       'Public_source', 'Since_source', 'Uptime_source',
       'FirstConnected_source', 'Prefix_v4_source', 'LastConnected_source',
       'Uptime(days)_source', 'ProbeID_destination', 'ASN_destination',
       'CountryCode_destination', 'IPAddress_destination',
       'destination_status', 'Anchor_destination', 'Latitude_destination',
       'Longitude_destination', 'Tags_destination', 'Public_destination

In [11]:
# Inventory of column dtypes, starting with the object columns:
# these are the ones that need encoding or removal before modelling.
df_subset = destination_merged_df.select_dtypes(include=['object'])
object_columns = df_subset.columns
object_columns

Index(['mver', 'dst_addr', 'src_addr', 'proto', 'msm_name', 'from',
       'destination_ip_responded', 'src_names', 'date', 'CountryCode_source',
       'IPAddress_source', 'source_status', 'Tags_source', 'Prefix_v4_source',
       'CountryCode_destination', 'IPAddress_destination',
       'destination_status', 'Tags_destination', 'Prefix_v4_destination'],
      dtype='object')

In [12]:
# int64 columns - already numeric, so no encoding is needed for them
df_subset2 = destination_merged_df.select_dtypes(include=['int64'])
int_columns = df_subset2.columns
int_columns

Index(['fw', 'lts', 'endtime', 'af', 'size', 'paris_id', 'msm_id', 'prb_id',
       'timestamp', 'group_id', 'stored_timestamp', 'dst_id', 'hop_count',
       'ProbeID_source', 'ASN_source', 'Uptime_source', 'ProbeID_destination',
       'ASN_destination', 'Uptime_destination'],
      dtype='object')

In [13]:
# Columns selected by the 'float' dtype filter
df_subset3 = destination_merged_df.select_dtypes(include=['float'])
float_columns = df_subset3.columns
float_columns

Index(['distance', 'source_longitude', 'source_latitude',
       'destination_longitude', 'destination_latitude', 'last_rtt',
       'Latitude_source', 'Longitude_source', 'Uptime(days)_source',
       'Latitude_destination', 'Longitude_destination',
       'Uptime(days)_destination'],
      dtype='object')

In [14]:
# Columns selected by the 'bool' dtype filter
df_subset4 = destination_merged_df.select_dtypes(include=['bool'])
bool_columns = df_subset4.columns
bool_columns

Index(['Anchor_source', 'Public_source', 'Anchor_destination',
       'Public_destination'],
      dtype='object')

In [15]:
# Columns selected by the 'datetime' dtype filter
df_subset5 = destination_merged_df.select_dtypes(include=['datetime'])
datetime_columns = df_subset5.columns
datetime_columns

Index(['new_time', 'Since_source', 'FirstConnected_source',
       'LastConnected_source', 'Since_destination',
       'FirstConnected_destination', 'LastConnected_destination'],
      dtype='object')

In [16]:
# Derive integer calendar features from the measurement time in 'new_time'
measured_at = destination_merged_df['new_time'].dt

# day of the week (pandas convention: Monday=0 ... Sunday=6)
destination_merged_df['day_of_week'] = measured_at.dayofweek

# hour of the day
destination_merged_df['hour_of_day'] = measured_at.hour

# minute of the hour
destination_merged_df['minute_of_hour'] = measured_at.minute

In [17]:
# Date columns: the first/last connected timestamps of both probes are not used
# in the experiments, so they are removed here.
cols_to_drop = [
    'FirstConnected_source', 'LastConnected_source',
    'FirstConnected_destination', 'LastConnected_destination',
]
destination_merged_df = destination_merged_df.drop(columns=cols_to_drop)

In [18]:
# Reference instant for the "days since status change" features: the most recent
# measurement time in the dataset.
# max() is used instead of the value in the final row so the reference does not
# depend on row order and cannot be NaT just because the last row is missing
# (which would turn every derived value into NaN).
last_new_time = destination_merged_df['new_time'].max()

# Time elapsed between the reference instant and the source probe's 'Since' value
# (per the notebook's notes, the time of its last connection status change),
# expressed in fractional days.
source_elapsed = last_new_time - destination_merged_df['Since_source']
destination_merged_df['source_status_change(days)'] = source_elapsed.dt.total_seconds() / (24 * 60 * 60)

# The raw datetime column is no longer needed once the numeric feature exists
destination_merged_df = destination_merged_df.drop(columns=['Since_source'])

In [19]:
# Same feature for the destination probe: time between the reference instant
# (last_new_time, computed in the previous cell) and 'Since_destination',
# expressed in fractional days.
destination_elapsed = last_new_time - destination_merged_df['Since_destination']
destination_merged_df['destination_status_change(days)'] = (
    destination_elapsed.dt.total_seconds() / (24 * 60 * 60)
)

# The raw datetime column is no longer needed once the numeric feature exists
destination_merged_df = destination_merged_df.drop(columns=['Since_destination'])

In [20]:
# 'new_time' has served its purpose (calendar features above); per the notebook's
# notes the same information is carried by the 'timestamp' column, so remove it.
destination_merged_df = destination_merged_df.drop(columns='new_time')

In [21]:

# Convert the integer Unix timestamps (seconds since the epoch) to datetimes.
# unit='s' is required: for integer input pd.to_datetime assumes nanoseconds by
# default, which would make every interval below 1e9 times too small.
destination_merged_df['timestamp'] = pd.to_datetime(destination_merged_df['timestamp'], unit='s')

# Earliest measurement start in the dataset, used as the zero point
initial_timestamp = destination_merged_df['timestamp'].min()

# norm_timestamp: seconds elapsed since the earliest measurement (float)
destination_merged_df['norm_timestamp'] = (
    destination_merged_df['timestamp'] - initial_timestamp
).dt.total_seconds()

In [22]:

# Convert the integer Unix 'stored_timestamp' values (seconds since the epoch) to
# datetimes. unit='s' is required: for integer input pd.to_datetime assumes
# nanoseconds by default, which would make every interval below 1e9 times too small.
destination_merged_df['stored_timestamp'] = pd.to_datetime(destination_merged_df['stored_timestamp'], unit='s')

# Earliest storage time in the dataset, used as the zero point
initial_timestamp = destination_merged_df['stored_timestamp'].min()

# norm_storedtimestamp: seconds elapsed since the earliest stored result (float)
destination_merged_df['norm_storedtimestamp'] = (
    destination_merged_df['stored_timestamp'] - initial_timestamp
).dt.total_seconds()

In [23]:
# The raw timestamp columns are replaced by their normalised counterparts
raw_timestamp_cols = ['timestamp', 'stored_timestamp']
destination_merged_df = destination_merged_df.drop(columns=raw_timestamp_cols)

In [24]:
# IP-related columns: src_addr, dst_addr, from, IPAddress_source, IPAddress_destination,
# Prefix_v4_source, Prefix_v4_destination.
# The individual addresses are removed; only the two prefix columns are kept.
destination_merged_df = destination_merged_df.drop(
    columns=[
        'src_addr',
        'dst_addr',
        'from',
        'IPAddress_source',
        'IPAddress_destination',
    ]
)

In [25]:
# Helper that breaks a prefix string such as "a.b.c.d/len" into its parts
def split_prefix(prefix):
    """Split a prefix string into its dot-separated address parts and its mask.

    Args:
        prefix: String of the form "<address>/<mask>", e.g. "192.168.1.0/24".

    Returns:
        A list of strings: the dot-separated pieces of the address followed by
        the mask, e.g. ['192', '168', '1', '0', '24'].

    Raises:
        ValueError: If the string does not contain exactly one '/'.
        AttributeError: If ``prefix`` has no ``split`` method (e.g. a NaN float).
    """
    address, mask_length = prefix.split('/')
    return [*address.split('.'), mask_length]

In [26]:
# Expand 'Prefix_v4_source' into four octet columns plus a mask column (integers).
# The split parts are assembled into one DataFrame in a single step; the previous
# .apply(pd.Series) built a separate Series object for every row, which is very
# slow on a frame of this size while producing the same values.
src_prefix_cols = ['src_Octet1', 'src_Octet2', 'src_Octet3', 'src_Octet4', 'src_Mask']
src_prefix_parts = pd.DataFrame(
    destination_merged_df['Prefix_v4_source'].apply(split_prefix).tolist(),
    index=destination_merged_df.index,
    columns=src_prefix_cols,
)
# split_prefix returns strings, so convert to integers before storing
destination_merged_df[src_prefix_cols] = src_prefix_parts.astype(int)

In [27]:
# Expand 'Prefix_v4_destination' into four octet columns plus a mask column (integers).
# The split parts are assembled into one DataFrame in a single step; the previous
# .apply(pd.Series) built a separate Series object for every row, which is very
# slow on a frame of this size while producing the same values.
dst_prefix_cols = ['dst_Octet1', 'dst_Octet2', 'dst_Octet3', 'dst_Octet4', 'dst_Mask']
dst_prefix_parts = pd.DataFrame(
    destination_merged_df['Prefix_v4_destination'].apply(split_prefix).tolist(),
    index=destination_merged_df.index,
    columns=dst_prefix_cols,
)
# split_prefix returns strings, so convert to integers before storing
destination_merged_df[dst_prefix_cols] = dst_prefix_parts.astype(int)

In [28]:
# The textual prefixes are now represented by the octet/mask columns
prefix_text_cols = ['Prefix_v4_source', 'Prefix_v4_destination']
destination_merged_df = destination_merged_df.drop(columns=prefix_text_cols)

In [29]:
# Object-dtype handling: the probe tag columns brought in by the merges are not
# needed for the experiments, so they are removed.
cols_to_drop = ['Tags_source', 'Tags_destination']
destination_merged_df = destination_merged_df.drop(columns=cols_to_drop)

In [30]:
# Integer-encode the categorical features of interest
from sklearn.preprocessing import LabelEncoder

# Work on a copy of destination_merged_df so the merged frame keeps its original values
model_df = destination_merged_df.copy()

categorical_cols = [
    'mver', 'proto', 'msm_name', 'destination_ip_responded',
    'CountryCode_source', 'CountryCode_destination',
    'source_status', 'destination_status',
]

# One fitted encoder per column is kept in label_encoders so the
# label -> integer mapping can be inspected afterwards.
# Missing values are not imputed here; they are passed to the encoder as-is.
label_encoders = {}
for col in categorical_cols:
    le = label_encoders[col] = LabelEncoder()
    model_df[col] = le.fit_transform(model_df[col])

# Compare the shape before the probe merges with the shape of the final frame
print(latency_df.shape, model_df.shape)

(1238890, 29) (1238890, 60)


In [31]:
# Show, for every encoded column, which integer each original label was mapped to
# (a label's code is its position in the fitted encoder's classes_ array)
for col, encoder in label_encoders.items():
    classes = encoder.classes_
    label_mapping = dict(zip(classes, range(len(classes))))
    print(f"Label Mapping for {col}:\n{label_mapping}")

Label Mapping for mver:
{'2.6.2': 0, nan: 1}
Label Mapping for proto:
{'ICMP': 0}
Label Mapping for msm_name:
{'Traceroute': 0}
Label Mapping for destination_ip_responded:
{False: 0, True: 1, nan: 2}
Label Mapping for CountryCode_source:
{'ES': 0}
Label Mapping for CountryCode_destination:
{'ES': 0}
Label Mapping for source_status:
{'Connected': 0, 'Disconnected': 1}
Label Mapping for destination_status:
{'Connected': 0}


In [32]:
# Number of columns in the encoded frame
model_df.shape[1]

60

In [33]:
# List the columns of the encoded frame before the final round of column removal
model_df.columns

Index(['fw', 'mver', 'lts', 'endtime', 'proto', 'af', 'size', 'paris_id',
       'msm_id', 'prb_id', 'msm_name', 'group_id', 'destination_ip_responded',
       'dst_id', 'src_names', 'distance', 'source_longitude',
       'source_latitude', 'destination_longitude', 'destination_latitude',
       'date', 'last_rtt', 'hop_count', 'ProbeID_source', 'ASN_source',
       'CountryCode_source', 'source_status', 'Anchor_source',
       'Latitude_source', 'Longitude_source', 'Public_source', 'Uptime_source',
       'Uptime(days)_source', 'ProbeID_destination', 'ASN_destination',
       'CountryCode_destination', 'destination_status', 'Anchor_destination',
       'Latitude_destination', 'Longitude_destination', 'Public_destination',
       'Uptime_destination', 'Uptime(days)_destination', 'day_of_week',
       'hour_of_day', 'minute_of_hour', 'source_status_change(days)',
       'destination_status_change(days)', 'norm_timestamp',
       'norm_storedtimestamp', 'src_Octet1', 'src_Octet2', 'src_O

In [34]:
# Remove the identifier/bookkeeping columns that are not used in the experiments
cols_to_drop = [
    'mver', 'endtime', 'msm_id', 'group_id',
    'ProbeID_source', 'ProbeID_destination',
    'Uptime_destination', 'Uptime_source',
]
model_df = model_df.drop(columns=cols_to_drop)

In [35]:
# Persist the final feature frame (traceroute rows enriched with source and
# destination probe data) as a pickle file in the current working directory
model_df.to_pickle('latency_withprobe.pickle')