In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew

In [3]:
def precise_diff(float_array):
    # Truncate down to 4 digits on the left, 6 digits on the right, and convert to int
    float_array = np.apply_along_axis(lambda x: (x % 10000)*1000000, axis=-1, arr=float_array)
    
    int_array = float_array.astype(np.int64)
    #print(int_array)

    # Calculate differences, then return to floating point value
    result = np.diff(int_array)
    #print(np.where(result < 0))
    #print(min(result))
    #print(max(result))
    result = np.apply_along_axis(lambda x: x/1000000, axis=-1, arr=result)
    #print(result)
    
    return(result)

In [4]:
def convert_to_2d_array(array_of_arrays):
    # Determine the number of rows
    num_rows = len(array_of_arrays)
    
    # Determine the number of columns (assuming all inner arrays have the same length)
    num_cols = len(array_of_arrays[0]) if num_rows > 0 else 0

    # Initialize a 2D NumPy array with the correct shape
    matrix = np.empty((num_rows, num_cols), dtype=float)

    # Populate the 2D NumPy array
    for i in range(num_rows):
        for j in range(num_cols):
            matrix[i, j] = float(array_of_arrays[i][j])
    
    return matrix

In [18]:
# Function to calculate statistics for each row
def calculate_statistics(row):
    # Convert inner arrays to numpy arrays for numerical operations
    #packets = np.array(row['Data'], dtype=float) #PROBLEM
    
    # Extract packet data and convert inner arrays to numpy arrays for numerical operations
    packets = convert_to_2d_array(row['Data'])

    # Extract the sizes
    sizes = packets[:, 2].astype(float)
    
    # Extract the timestamps
    timestamps = packets[:, 0].astype(float)
    
    # Calculate inter-arrival times as differences between consecutive timestamps
    inter_arrival_times = np.diff(timestamps)
    
    # Calculate inter-arrival time statistics
    iat_min = np.min(inter_arrival_times)
    iat_max = np.max(inter_arrival_times)
    iat_mean = np.mean(inter_arrival_times)
    iat_std = np.std(inter_arrival_times)
    iat_skew = skew(inter_arrival_times)

    # Calculate size statistics
    size_min = np.min(sizes)
    size_max = np.max(sizes)
    size_mean = np.mean(sizes)
    size_std = np.std(sizes)
    size_skew = skew(sizes)

    # Calculate general statistics
    total_packets = len(packets)
    total_bytes = np.sum(sizes)
    duration = timestamps[-1] - timestamps[0] if total_packets > 1 else 0
    packets_per_sec = total_packets / duration if duration > 0 else 0
    bytes_per_sec = total_bytes / duration if duration > 0 else 0
    
    # Return all calculated statistics as a dictionary
    return {
        'iat_min': iat_min, 'iat_max': iat_max, 'iat_mean': iat_mean, 'iat_std': iat_std, 'iat_skew': iat_skew,
        'size_min': size_min, 'size_max': size_max, 'size_mean': size_mean, 'size_std': size_std, 'size_skew': size_skew,
        'total_packets': total_packets, 'total_bytes': total_bytes, 'packets_per_sec': packets_per_sec, 'bytes_per_sec': bytes_per_sec
    }

In [35]:
# Function to calculate statistics for each row based on a given direction
def calculate_statistics_by_direction(row, direction=0):
    # Convert inner arrays to numpy arrays for numerical operations
    #packets = np.array(row['Data'], dtype=float) #problem
    
    # Extract packet data and convert inner arrays to numpy arrays for numerical operations
    packets = convert_to_2d_array(row['Data'])
    
    # Filter packets by direction
    filtered_packets = packets[packets[:, 3] == direction]
    print(packets[:,3])
    print(type(packets[:,3]))
    print(direction)
    print(type(direction))
    
    print(len(filtered_packets))

    # Extract the timestamps and sizes for filtered packets
    timestamps = filtered_packets[:, 0].astype(float)
    sizes = filtered_packets[:, 2].astype(float)

    # Calculate inter-arrival times as differences between consecutive timestamps
    inter_arrival_times = np.diff(timestamps)
    
    print(len(inter_arrival_times))

    # Calculate inter-arrival time statistics
    iat_min = np.min(inter_arrival_times) if len(inter_arrival_times) > 0 else np.nan
    iat_max = np.max(inter_arrival_times) if len(inter_arrival_times) > 0 else np.nan
    iat_mean = np.mean(inter_arrival_times) if len(inter_arrival_times) > 0 else np.nan
    iat_std = np.std(inter_arrival_times) if len(inter_arrival_times) > 0 else np.nan
    iat_skew = skew(inter_arrival_times) if len(inter_arrival_times) > 0 else np.nan

    # Calculate size statistics
    size_min = np.min(sizes)
    size_max = np.max(sizes)
    size_mean = np.mean(sizes)
    size_std = np.std(sizes)
    size_skew = skew(sizes)

    # Calculate general statistics
    total_packets = len(filtered_packets)
    total_bytes = np.sum(sizes)
    duration = timestamps[-1] - timestamps[0] if total_packets > 1 else 0
    packets_per_sec = total_packets / duration if duration > 0 else 0
    bytes_per_sec = total_bytes / duration if duration > 0 else 0

    # Return all calculated statistics as a dictionary
    return {
        'iat_min': iat_min, 'iat_max': iat_max, 'iat_mean': iat_mean, 'iat_std': iat_std, 'iat_skew': iat_skew,
        'size_min': size_min, 'size_max': size_max, 'size_mean': size_mean, 'size_std': size_std, 'size_skew': size_skew,
        'total_packets': total_packets, 'total_bytes': total_bytes, 'packets_per_sec': packets_per_sec, 'bytes_per_sec': bytes_per_sec
    }

In [10]:
def create_statistics_matrix_numpy(row):
    # Call the statistics functions
    stats_general = calculate_statistics(row)
    stats_dir0 = calculate_statistics_by_direction(row, 0)
    stats_dir1 = calculate_statistics_by_direction(row, 1)
    
    # Convert dictionaries to lists
    stats_general_list = list(stats_general.values())
    stats_dir0_list = list(stats_dir0.values())
    stats_dir1_list = list(stats_dir1.values())
    
    # Combine the lists into a single (3, 14) numpy array
    statistics_matrix = np.array([stats_general_list, stats_dir0_list, stats_dir1_list])
    
    return statistics_matrix

In [36]:
def create_statistics_matrix(row):
    # Call the statistics functions
    stats_general = calculate_statistics(row)
    stats_dir0 = calculate_statistics_by_direction(row, 0)
    stats_dir1 = calculate_statistics_by_direction(row, 1)
    
    print(stats_dir0)
    
    # Convert dictionaries to lists
    stats_general_list = list(stats_general.values())
    stats_dir0_list = list(stats_dir0.values())
    stats_dir1_list = list(stats_dir1.values())
    
    print(stats_dir0_list)
    
    '''
    # Ensure each list has the correct size (14 in this case)
    # If not, pad with zeros or truncate as needed
    target_size = 14
    
    def pad_or_truncate(lst, size):
        if len(lst) < size:
            return lst + [0] * (size - len(lst))
        return lst[:size]

    # Pad or truncate each list to the target size
    stats_general_list = pad_or_truncate(stats_general_list, target_size)
    stats_dir0_list = pad_or_truncate(stats_dir0_list, target_size)
    stats_dir1_list = pad_or_truncate(stats_dir1_list, target_size)
    '''

    # Combine the lists into a single list of lists
    statistics_list_of_lists = [stats_general_list, stats_dir0_list, stats_dir1_list]
    
    return statistics_list_of_lists

In [38]:
# Load DataFrame
df = pd.read_parquet('quic_text.parquet')

df

Unnamed: 0,Label,Data
0,Youtube,"[[1522933033.872636000, 0, 1412, 1], [15229330..."
1,Youtube,"[[1522986010.180674000, 0, 1412, 1], [15229860..."
2,Youtube,"[[1522998647.328589000, 0, 1412, 1], [15229986..."
3,Youtube,"[[1522952270.580016000, 0, 1412, 1], [15229522..."
4,Youtube,"[[1522963305.433367000, 0, 1412, 1], [15229633..."
5,Youtube,"[[1522984486.132539000, 0, 1412, 1], [15229844..."
6,Youtube,"[[1522923877.285549000, 0, 1412, 1], [15229238..."
7,Google Doc,"[[1527761242.418825000, 0, 295, 1], [152776124..."
8,Google Doc,"[[1527939536.102296000, 0, 896, 1], [152793953..."
9,Google Doc,"[[1528078338.496653000, 0, 294, 1], [152807833..."


In [39]:
# Initialize an empty list to store the data for the new DataFrame
data_for_new_df = []

# Iterate over each row of the original dataframe
for index, row in df.iterrows():
    # Extract the label
    label = row['Label']
    
    print(row)
    
    # Compute the statistics matrix for the current row
    statistics_matrix = create_statistics_matrix(row)
    
    # Append the label and matrix as a tuple to the list
    data_for_new_df.append({'Label': label, 'Matrix': statistics_matrix})

# Create a new DataFrame from the list of dictionaries
new_df = pd.DataFrame(data_for_new_df)

# Print the new dataframe to see the result
print(new_df)

Label                                              Youtube
Data     [[1522933033.872636000, 0, 1412, 1], [15229330...
Name: 0, dtype: object
[1. 1. 0. ... 0. 1. 0.]
<class 'numpy.ndarray'>
0
<class 'int'>
22653
22652
[1. 1. 0. ... 0. 1. 0.]
<class 'numpy.ndarray'>
1
<class 'int'>
3247
3246
{'iat_min': 9.5367431640625e-07, 'iat_max': 5.992645025253296, 'iat_mean': 0.0027196397213849126, 'iat_std': 0.10185753631456705, 'iat_skew': 47.76948023385541, 'size_min': 89.0, 'size_max': 1412.0, 'size_mean': 1401.388160508542, 'size_std': 94.846296582863, 'size_skew': -10.196688837338659, 'total_packets': 22653, 'total_bytes': 31745646.0, 'packets_per_sec': 367.71199447807965, 'bytes_per_sec': 515307.23553856317}
[9.5367431640625e-07, 5.992645025253296, 0.0027196397213849126, 0.10185753631456705, 47.76948023385541, 89.0, 1412.0, 1401.388160508542, 94.846296582863, -10.196688837338659, 22653, 31745646.0, 367.71199447807965, 515307.23553856317]
Label                                              You

[1. 1. 0. ... 0. 1. 0.]
<class 'numpy.ndarray'>
0
<class 'int'>
8763
8762
[1. 1. 0. ... 0. 1. 0.]
<class 'numpy.ndarray'>
1
<class 'int'>
1309
1308
{'iat_min': 9.5367431640625e-07, 'iat_max': 4.421344041824341, 'iat_mean': 0.001006455025262578, 'iat_std': 0.05533788759034251, 'iat_skew': 71.27460160084522, 'size_min': 92.0, 'size_max': 1412.0, 'size_mean': 1399.0775989957776, 'size_std': 100.55653489623123, 'size_skew': -8.700710963166737, 'total_packets': 8763, 'total_bytes': 12260117.0, 'packets_per_sec': 993.6997720621687, 'bytes_per_sec': 1390263.0912193905}
[9.5367431640625e-07, 4.421344041824341, 0.001006455025262578, 0.05533788759034251, 71.27460160084522, 92.0, 1412.0, 1399.0775989957776, 100.55653489623123, -8.700710963166737, 8763, 12260117.0, 993.6997720621687, 1390263.0912193905]
Label                                         Google Drive
Data     [[1522756094.978806000, 0, 213, 1], [152275609...
Name: 14, dtype: object
[1. 0. 0. ... 0. 0. 1.]
<class 'numpy.ndarray'>
0
<clas

In [41]:
# Save DataFrame to a Parquet file using pyarrow
new_df.to_parquet('stnn_features.parquet', engine='pyarrow')

# Save DataFrame to CSV file
new_df.to_csv("stnn_features.csv", index=False)

### Problem solving

In [37]:
matrix = create_statistics_matrix(df.iloc[0,:])
print(matrix)

[1. 1. 0. ... 0. 1. 0.]
<class 'numpy.ndarray'>
0
<class 'int'>
22653
22652
[1. 1. 0. ... 0. 1. 0.]
<class 'numpy.ndarray'>
1
<class 'int'>
3247
3246
{'iat_min': 9.5367431640625e-07, 'iat_max': 5.992645025253296, 'iat_mean': 0.0027196397213849126, 'iat_std': 0.10185753631456705, 'iat_skew': 47.76948023385541, 'size_min': 89.0, 'size_max': 1412.0, 'size_mean': 1401.388160508542, 'size_std': 94.846296582863, 'size_skew': -10.196688837338659, 'total_packets': 22653, 'total_bytes': 31745646.0, 'packets_per_sec': 367.71199447807965, 'bytes_per_sec': 515307.23553856317}
[9.5367431640625e-07, 5.992645025253296, 0.0027196397213849126, 0.10185753631456705, 47.76948023385541, 89.0, 1412.0, 1401.388160508542, 94.846296582863, -10.196688837338659, 22653, 31745646.0, 367.71199447807965, 515307.23553856317]
[[-1.9073486328125e-06, 5.972417116165161, 0.0023794474668579033, 0.09481284554922456, 51.131058003398614, 89.0, 1412.0, 1242.074942084942, 430.56725058861224, -2.1757881672659827, 25900, 3216974

In [34]:
statistics_col = calculate_statistics_by_direction(df.iloc[0,:],1)
print(statistics_col)

[1. 1. 0. ... 0. 1. 0.]
<class 'numpy.ndarray'>
1
<class 'int'>
3247
3246
{'iat_min': 3.2901763916015625e-05, 'iat_max': 5.990344047546387, 'iat_mean': 0.018976704837801422, 'iat_std': 0.26815998283153986, 'iat_skew': 18.018628122161758, 'size_min': 97.0, 'size_max': 1412.0, 'size_mean': 130.61133353865105, 'size_std': 59.8635330903881, 'size_skew': 10.00483540120466, 'total_packets': 3247, 'total_bytes': 424095.0, 'packets_per_sec': 52.712421888966574, 'bytes_per_sec': 6884.839716969905}


In [20]:
def contains_nan(list_of_lists):
    # Flatten the list of lists and convert to a numpy array
    flattened_array = np.array([item for sublist in list_of_lists for item in sublist])
    
    # Check for NaNs
    return np.isnan(flattened_array).any()

In [21]:
def check_nan_in_dataset(df):
    # Apply the `contains_nan` function to each row's matrix
    df['Has_NaN'] = df['Matrix'].apply(contains_nan)
    return df

# Example usage
df_with_nan_check = check_nan_in_dataset(new_df)
print(df_with_nan_check)

           Label                                             Matrix  Has_NaN
0        Youtube  [[-1.9073486328125e-06, 5.972417116165161, 0.0...     True
1        Youtube  [[0.0, 9.354390859603882, 0.003731038111465003...     True
2        Youtube  [[-1.1920928955078125e-05, 7.244797945022583, ...     True
3        Youtube  [[-1.9073486328125e-06, 8.704989910125732, 0.0...     True
4        Youtube  [[0.0, 10.431927919387817, 0.00555849588458725...     True
5        Youtube  [[0.0, 15.403187990188599, 0.01212164598790228...     True
6        Youtube  [[-3.0994415283203125e-06, 5.99469780921936, 0...     True
7     Google Doc  [[-1.9073486328125e-06, 4.341797828674316, 0.0...     True
8     Google Doc  [[1.0967254638671875e-05, 3.5862808227539062, ...     True
9     Google Doc  [[2.002716064453125e-05, 5.092674016952515, 0....     True
10    Google Doc  [[1.9073486328125e-06, 9.54417109489441, 0.062...     True
11    Google Doc  [[9.5367431640625e-07, 3.384873151779175, 0.03...     True

In [None]:
# Check for NaNs
df_with_nan_check = check_nan_in_dataset(new_df)
print(df_with_nan_check)

In [9]:
matrix = new_df.iloc[0,1]
print(type(matrix))

<class 'numpy.ndarray'>


In [34]:
# Sample array of arrays from a dataframe row
sample_data = [
    ['1522933033.872636000', '0', '1412', '1'],
    ['1522933033.872846000', '0.000209808', '350', '1'],
    ['1522933034.743008000', '0.870372', '930', '0']
]

# Convert the sample data to a 2D NumPy array
my_row = df.iloc[0]
my_array = df.iloc[0,1]

matrix = convert_to_2d_array(my_array)
matrix

array([[1.52293303e+09, 0.00000000e+00, 1.41200000e+03, 1.00000000e+00],
       [1.52293303e+09, 2.09808000e-04, 3.50000000e+02, 1.00000000e+00],
       [1.52293303e+09, 2.00310000e-02, 1.41200000e+03, 0.00000000e+00],
       ...,
       [1.52293310e+09, 6.15970000e+01, 1.27700000e+03, 0.00000000e+00],
       [1.52293310e+09, 6.15984000e+01, 1.23000000e+02, 1.00000000e+00],
       [1.52293310e+09, 6.16253000e+01, 1.49000000e+02, 0.00000000e+00]])

In [41]:
# Extract the timestamps
timestamps = matrix[:, 0].astype(float)
print(timestamps)

[1.52293303e+09 1.52293303e+09 1.52293303e+09 ... 1.52293310e+09
 1.52293310e+09 1.52293310e+09]


In [43]:
# Calculate inter-arrival times as differences between consecutive timestamps
inter_arrival_times = np.diff(timestamps)

print(inter_arrival_times)
print(np.min(inter_arrival_times))

[0.00020981 0.01982117 0.00114489 ... 0.00069284 0.00135708 0.02692604]
-1.9073486328125e-06


In [26]:
# Compute the statistics matrix for the current row
statistics_matrix = create_statistics_matrix(my_row)
statistics_matrix

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [25]:
# Extract the sizes
sizes = matrix[:, 2].astype(float)

print(sizes)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [68]:
packets = convert_to_2d_array(df.iloc[0,1])
print(packets[:,0])

diffs = precise_diff(packets[:,0])
print(min(diffs))
print(max(diffs))

[1.52293303e+09 1.52293303e+09 1.52293303e+09 ... 1.52293310e+09
 1.52293310e+09 1.52293310e+09]
[3033872636 3033872845 3033892667 ... 3095469662 3095471019 3095497946]
(array([22132], dtype=int64),)
5972418
[0.000209 0.019822 0.001144 ... 0.000692 0.001357 0.026927]
-1e-06
5.972418
