In [24]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

from ._graphdapp import GraphDAppModality
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
def convert_to_2d_array(array_of_arrays):
    """Copy a sequence of sequences into a 2-D float NumPy array.

    The number of columns is taken from the first inner sequence, and every
    element is passed through ``float()`` individually. Inner sequences longer
    than the first are cut to that width; shorter ones raise ``IndexError``.

    Parameters
    ----------
    array_of_arrays : indexable sequence of indexable sequences
        Outer container supporting ``len()`` and integer indexing.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(rows, cols)``; ``(0, 0)`` for an empty input.
    """
    row_count = len(array_of_arrays)
    # Width comes from the first row only; an empty input has no columns.
    col_count = len(array_of_arrays[0]) if row_count else 0

    result = np.empty((row_count, col_count), dtype=float)

    # Fill one row at a time, converting each cell explicitly.
    for r in range(row_count):
        result[r] = [float(array_of_arrays[r][c]) for c in range(col_count)]

    return result

In [22]:
class Graph:
    """Undirected graph held as a dense adjacency matrix plus a node feature matrix."""

    def __init__(self, n, features=None, f=None):
        """Create a graph with ``n`` nodes and no edges.

        Parameters
        ----------
        n : int
            Number of nodes; the adjacency matrix is ``n x n``.
        features : optional
            Node feature matrix. Stored as given (not copied or validated).
        f : int, optional
            Number of feature columns for a zero-filled feature matrix, used
            only when ``features`` is omitted. When both are omitted the
            feature matrix has zero columns.
        """
        # adj[i, j] == 1 means nodes i and j are connected (kept symmetric by connect()).
        self.adj = np.zeros((n, n), dtype=np.int8)
        if features is not None:
            self.features = features
        else:
            # Previously f=None reached np.zeros((n, None)) and raised an
            # opaque TypeError; treat "no feature width given" as zero columns.
            self.features = np.zeros((n, 0 if f is None else f))

    def connect(self, v1, v2):
        """Add an undirected edge between nodes ``v1`` and ``v2``."""
        self.adj[v1, v2] = 1
        self.adj[v2, v1] = 1

    def connect_inner_burst(self, burst):
        """Chain the nodes of ``burst`` together: each id is linked to the next one."""
        # For a single-node burst the pairing below is empty, so no guard is needed.
        for node_id1, node_id2 in zip(burst.ids[:-1], burst.ids[1:]):
            self.connect(node_id1, node_id2)

    def connect_bursts(self, burst1, burst2):
        """Link two bursts: first node to first node and last node to last node.

        When a burst holds a single node its first and last node coincide, so
        these two edges also cover the mixed-length cases (one-node burst next
        to a longer one).
        """
        self.connect(burst1.first(), burst2.first())
        self.connect(burst1.last(), burst2.last())

    def get_graph_data(self):
        """Returns the adjacency matrix and the node feature matrix."""
        return self.adj, self.features


class Burst:
    """A group of values paired with the ids assigned to them."""

    def __init__(self, values, ids):
        # Both containers are stored as given; no copy is made.
        self.values, self.ids = values, ids

    def length(self):
        """Number of values in the burst."""
        return len(self.values)

    def first(self):
        """Id at the start of the burst."""
        return self.ids[0]

    def last(self):
        """Id at the end of the burst."""
        return self.ids[-1]

    def __repr__(self):
        """Render the burst as a list of ``(value, id)`` pairs."""
        pairs = [(value, node_id) for value, node_id in zip(self.values, self.ids)]
        return str(pairs)

    def __str__(self):
        """Same text as ``__repr__``."""
        return self.__repr__()

In [43]:
def extract_values(flow, max_packets=32):
    """Return the signed packet sizes of the leading packets of a flow.

    Each packet in ``flow['Data']`` is read as a sequence whose element 2 is
    the packet size and element 3 the direction. The size is negated when the
    direction is 0 and kept as-is otherwise.

    Parameters
    ----------
    flow : mapping-like
        Anything supporting ``flow['Data']`` (for example a DataFrame row),
        where ``'Data'`` is a sliceable sequence of packets.
    max_packets : int or None, optional
        Number of leading packets to keep (default 32). ``None`` keeps all.

    Returns
    -------
    numpy.ndarray
        1-D float array with one signed size per kept packet; empty when the
        flow has no packets.
    """
    # Slice BEFORE converting: a flow can hold far more packets than are kept,
    # and the conversion helper visits every cell in a Python loop.
    head = flow['Data'][:max_packets]
    if len(head) == 0:
        # Nothing to index into; avoid an IndexError on the column lookups below.
        return np.empty(0, dtype=float)

    packets = convert_to_2d_array(head)
    sizes, directions = packets[:, 2], packets[:, 3]

    # Direction 0 -> negative size, any other direction -> positive size.
    return np.where(directions == 0, -sizes, sizes)

In [4]:
def generate_simple_TIG(values):
    """Build a ``Graph`` with one node per entry of ``values``.

    Each node's single feature is its value. Nodes inside a burst (as returned
    by ``get_bursts``) are connected via ``Graph.connect_inner_burst``, and
    every pair of neighbouring bursts via ``Graph.connect_bursts``.

    Returns
    -------
    Graph
        The populated graph.
    """
    node_count = len(values)
    # Column vector of shape (node_count, 1): one feature per node.
    node_features = np.concatenate((np.array(values).reshape(-1, 1),))
    tig = Graph(node_count, features=node_features)

    burst_list = get_bursts(values)

    for burst in burst_list:
        tig.connect_inner_burst(burst)

    # Pairing neighbours yields nothing when there are fewer than two bursts.
    for left, right in zip(burst_list[:-1], burst_list[1:]):
        tig.connect_bursts(left, right)

    return tig


def get_bursts(data, ids=None):
    """Split ``data`` into bursts: runs of consecutive values sharing a sign.

    The sign is taken with ``np.sign``, so a zero value has its own sign (0)
    and does not join a neighbouring positive or negative run.

    Parameters
    ----------
    data : sequence of numbers
        Values to group, in order.
    ids : sequence, optional
        Id for each position of ``data``. Defaults to ``0 .. len(data) - 1``.

    Returns
    -------
    list of Burst
        One ``Burst`` per run, holding the run's values (as a list) and the
        matching slice of ``ids``. Empty when ``data`` is empty.
    """
    # `is None`, not `== None`: with a NumPy array the latter compares
    # element-wise and the `if` then fails on an ambiguous truth value.
    if ids is None:
        ids = np.arange(len(data))

    if len(data) == 0:
        # No values -> no bursts (and data[0] below would not exist).
        return []

    bursts = []
    current = []   # values of the burst being collected
    start = 0      # position in `ids` where the current burst begins
    current_sign = np.sign(data[0])

    for val in data:
        sign = np.sign(val)
        if sign != current_sign:
            # Sign flipped: close the running burst and start a new one.
            bursts.append(Burst(current, ids[start:start + len(current)]))
            start += len(current)
            current = []
            current_sign = sign
        current.append(val)

    # The final run is never closed inside the loop.
    bursts.append(Burst(current, ids[start:start + len(current)]))

    return bursts

In [53]:
# Load the flow table from the local Parquet file into `df`.
# The preview below shows its columns: Label, Data (a list of packets per flow)
# and num_of_packets.
df = pd.read_parquet('quic_text.parquet')

df

Unnamed: 0,Label,Data,num_of_packets
0,Youtube,"[[1522982445.960145000, 0, 1412, 1], [15229824...",7288
1,Youtube,"[[1522939930.491890000, 0, 1412, 1], [15229399...",22529
2,Youtube,"[[1522994330.247572000, 0, 1412, 1], [15229943...",17938
3,Youtube,"[[1522908947.040100000, 0, 1412, 1], [15229089...",22709
4,Youtube,"[[1523000044.440979000, 0, 86, 1], [1523000044...",26621
...,...,...,...
3927,Google Drive,"[[1522841219.402012000, 0, 161, 1], [152284121...",11682
3928,Google Drive,"[[1522835303.978634000, 0, 566, 1], [152283530...",10763
3929,Google Drive,"[[1522782530.426202000, 0, 86, 1], [1522782530...",10133
3930,Google Drive,"[[1522763269.225023000, 0, 75, 1], [1522763269...",15547


In [54]:
# Collects one record per flow; turned into a DataFrame once the loop is done.
data_for_new_df = []

for index, row in df.iterrows():
    label = row['Label']

    # Build the graph for this flow, then pull out its adjacency and feature matrices.
    graph = generate_simple_TIG(extract_values(row))
    adj_matrix, features = graph.get_graph_data()

    # Nested Python lists, so each matrix fits into a single DataFrame cell.
    list_matrix, list_feat = adj_matrix.tolist(), features.tolist()

    data_for_new_df.append(dict(Label=label, Adj_Matrix=list_matrix, Features=list_feat))

# One row per flow with columns Label, Adj_Matrix, Features.
new_df = pd.DataFrame(data=data_for_new_df)

# Plain-text preview of the result.
print(new_df)

             Label                                         Adj_Matrix  \
0          Youtube  [[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
1          Youtube  [[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
2          Youtube  [[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
3          Youtube  [[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
4          Youtube  [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
...            ...                                                ...   
3927  Google Drive  [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
3928  Google Drive  [[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
3929  Google Drive  [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
3930  Google Drive  [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
3931  Google Drive  [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   

                                               Features  
0     [[1412.0], [950.0], [668.0], [-1412.0], [103.0...  
1     [

In [55]:
# Write the per-flow graph table to Parquet (pyarrow engine).
new_df.to_parquet('tig_features.parquet', engine='pyarrow')

# Also write it as CSV, without the index column.
# NOTE(review): Adj_Matrix and Features hold nested lists; CSV presumably stores
# them as text, so confirm the type you get when reading this file back.
new_df.to_csv("tig_features.csv", index=False)

In [50]:
# Read the CSV export back into `csv_df`.
# NOTE(review): the Adj_Matrix / Features cells were written from list-valued
# columns and presumably come back as strings here - confirm before using them numerically.
csv_df = pd.read_csv("tig_features.csv")

In [51]:
# Display the table loaded from tig_features.csv.
csv_df

Unnamed: 0,Label,Adj_Matrix,Features
0,Youtube,"[[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[1412.0], [350.0], [-1412.0], [103.0], [97.0]..."
1,Youtube,"[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[1412.0], [965.0], [669.0], [-1412.0], [103.0..."
2,Youtube,"[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[1412.0], [911.0], [731.0], [-1412.0], [103.0..."
3,Youtube,"[[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[1412.0], [970.0], [-1412.0], [103.0], [97.0]..."
4,Youtube,"[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[1412.0], [908.0], [722.0], [-1412.0], [103.0..."
5,Youtube,"[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[1412.0], [350.0], [136.0], [-1412.0], [-93.0..."
6,Youtube,"[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[1412.0], [351.0], [136.0], [-1412.0], [-1412..."
7,Google Doc,"[[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[295.0], [-87.0], [-415.0], [-1412.0], [104.0..."
8,Google Doc,"[[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[896.0], [-87.0], [-328.0], [101.0], [-1412.0..."
9,Google Doc,"[[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[294.0], [-87.0], [-325.0], [-1412.0], [-1412..."


In [52]:
# First row of the reloaded table: column 1 is Adj_Matrix, column 2 is Features.
adj, feat = csv_df.iloc[0, 1], csv_df.iloc[0, 2]

# Print each value on its own line.
print(adj, feat, sep="\n")

[[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 