# Import Libraries

In [None]:
import os
import csv
import requests
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_rows', None)

# Get Packet Fields

In [None]:
def get_nested_field_names(field_element):
    """Build the flat list of column names for all ``<field>`` descendants.

    Walks the ``<field>`` children of ``field_element`` depth-first, in
    document order. For every field with a non-empty ``name`` attribute, dots
    in the name are replaced by underscores and six column names are emitted
    (``_name``, ``_showname``, ``_size``, ``_pos``, ``_show``, ``_value``),
    immediately followed by the columns of that field's own nested fields.
    A field whose name is the empty string is skipped together with its
    whole subtree.

    Args:
        field_element: XML element whose ``<field>`` children are inspected.

    Returns:
        List of column-name strings in depth-first (pre-order) order.
    """
    suffixes = ('_name', '_showname', '_size', '_pos', '_show', '_value')
    collected = []
    # A stack of child iterators reproduces the recursive pre-order walk:
    # the top iterator is always the deepest level still being visited.
    pending = [iter(field_element.findall('field'))]
    while pending:
        child = next(pending[-1], None)
        if child is None:
            # This level is exhausted; resume the parent level.
            pending.pop()
            continue
        base = child.get('name').replace(".", "_")
        if not base:
            # Unnamed field: neither it nor its descendants produce columns.
            continue
        collected.extend(base + suffix for suffix in suffixes)
        pending.append(iter(child.findall('field')))
    return collected

In [None]:
def get_packet_field_names(packet):
    """Return the flattened column names of a packet's LTE RRC layer.

    An ``lte_rrc`` ``<proto>`` is looked for in two places below each
    top-level ``mac-lte`` ``<proto>`` of the packet:

    * directly under ``mac-lte``;
    * under ``mac-lte`` > ``rlc-lte`` > ``pdcp-lte``.

    When several such layers exist, the one encountered last in document
    order is used. Its fields are flattened by ``get_nested_field_names``,
    so the naming and skipping rules are defined in exactly one place.

    Args:
        packet: ``<packet>`` XML element.

    Returns:
        List of column-name strings, or an empty list when the packet has no
        matching ``lte_rrc`` layer.
    """
    rrc_layers = []
    for outer in packet.findall('proto'):
        if outer.get('name') != 'mac-lte':
            continue
        for layer in outer.findall('proto'):
            layer_name = layer.get('name')
            if layer_name == 'lte_rrc':
                rrc_layers.append(layer)
            elif layer_name == 'rlc-lte':
                rrc_layers.extend(
                    rrc
                    for pdcp in layer.findall('proto')
                    if pdcp.get('name') == 'pdcp-lte'
                    for rrc in pdcp.findall('proto')
                    if rrc.get('name') == 'lte_rrc'
                )
    if not rrc_layers:
        return []
    # Only the last layer's columns are ever returned, so only that layer is
    # flattened.
    return get_nested_field_names(rrc_layers[-1])

# Align Packet Features

In [None]:
def align_lists(list1, list2, list2_values):
    """Reorder ``list2_values`` so that it follows the key order of ``list1``.

    ``list2`` and ``list2_values`` are parallel lists (key at index ``i``
    belongs to value at index ``i``). For every key of ``list1`` the result
    holds the value paired with the first occurrence of that key in
    ``list2``, or ``''`` when the key does not occur in ``list2``.

    Args:
        list1: Target key order; keys must be hashable.
        list2: Keys describing ``list2_values``; may contain duplicates.
        list2_values: Values parallel to ``list2``.

    Returns:
        List with ``len(list1)`` entries aligned to ``list1``.

    Raises:
        IndexError: If ``list2_values`` is shorter than the position of a
            requested key in ``list2``.
    """
    # Map each key to its first position once, instead of scanning list2
    # twice for every key of list1. setdefault keeps the first occurrence,
    # matching list.index().
    first_position = {}
    for position, key in enumerate(list2):
        first_position.setdefault(key, position)
    return [
        list2_values[first_position[key]] if key in first_position else ''
        for key in list1
    ]

# Make Dataframe from Packet Features

In [None]:
def get_nested_fields(field_element):
    """Flatten the attribute values of all ``<field>`` descendants.

    Visits the ``<field>`` children of ``field_element`` depth-first, in
    document order. Each visited field contributes six entries: its
    ``name``, ``showname``, ``size``, ``pos``, ``show`` and ``value``
    attributes (``None`` for an absent attribute), followed by the entries
    of its own nested fields. A field whose ``name`` is the empty string is
    skipped together with its subtree.

    Args:
        field_element: XML element whose ``<field>`` children are read.

    Returns:
        Flat list of attribute values in depth-first (pre-order) order.
    """
    attribute_order = ('name', 'showname', 'size', 'pos', 'show', 'value')
    flattened = []
    for child in field_element.findall('field'):
        if child.get('name') != '':
            flattened += [child.get(attribute) for attribute in attribute_order]
            # Descendants follow their parent directly (pre-order).
            flattened += get_nested_fields(child)
    return flattened

In [None]:
def _find_rrc_proto(packet):
    """Return the last ``lte_rrc`` ``<proto>`` of ``packet``, or ``None``.

    The layer is searched below every top-level ``mac-lte`` proto, either
    directly or nested as ``rlc-lte`` > ``pdcp-lte`` > ``lte_rrc``.
    """
    found = None
    for outer in packet.findall('proto'):
        if outer.get('name') != 'mac-lte':
            continue
        for layer in outer.findall('proto'):
            layer_name = layer.get('name')
            if layer_name == 'lte_rrc':
                found = layer
            elif layer_name == 'rlc-lte':
                for pdcp in layer.findall('proto'):
                    if pdcp.get('name') != 'pdcp-lte':
                        continue
                    for rrc in pdcp.findall('proto'):
                        if rrc.get('name') == 'lte_rrc':
                            found = rrc
    return found


def makeDataframe(xmlfile):
    """Extract one row of LTE RRC field attributes per packet from an XML dump.

    The file is expected to have ``<packet>`` elements directly under the
    root. Packets without an ``lte_rrc`` layer produce no row. Columns are
    the union of the names reported by ``get_packet_field_names`` over all
    packets; a column a packet does not carry is filled with ``''`` by
    ``align_lists``.

    Args:
        xmlfile: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        Tuple ``(column_names, values)``: the list of column names and a list
        of rows, each row aligned to ``column_names``.
    """
    packets = ET.parse(xmlfile).getroot().findall('packet')

    # Resolve every packet's columns once; the result feeds both the column
    # union and the per-row alignment below.
    per_packet_columns = [get_packet_field_names(packet) for packet in packets]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # column layout is the same on every run (set iteration order of strings
    # changes with hash randomization).
    column_names = list(dict.fromkeys(
        name for columns in per_packet_columns for name in columns
    ))

    values = []
    for packet, columns in zip(packets, per_packet_columns):
        rrc_proto = _find_rrc_proto(packet)
        if rrc_proto is None:
            continue
        # get_nested_fields walks the layer with the same skip rules as the
        # name collection, so values and columns stay index-aligned.
        packet_fields = get_nested_fields(rrc_proto)
        values.append(align_lists(column_names, columns, packet_fields))
    return column_names, values

# Prepare Dataframe from PCAP File

In [None]:
def prepare_dataframe_from_pcap_file(input_file):
    """Convert a capture file to PDML with tshark and load it as a DataFrame.

    The PDML is written next to the capture, using the capture's path with
    its extension replaced by ``.xml``. Only the extension is touched, so a
    "pcap" substring elsewhere in the path is left alone.

    Args:
        input_file: Path of the capture file to convert.

    Returns:
        DataFrame built from the rows and columns returned by
        ``makeDataframe`` for the generated XML file.

    Raises:
        IsADirectoryError: If ``input_file`` is a directory.
        ValueError: If the XML output path would be the input file itself.
    """
    import subprocess  # local import keeps this cell self-contained

    if os.path.isdir(input_file):
        raise IsADirectoryError(input_file)

    stem, _extension = os.path.splitext(input_file)
    xml_output_file = stem + ".xml"
    if os.path.abspath(xml_output_file) == os.path.abspath(input_file):
        # Opening the output would truncate the input before tshark reads it.
        raise ValueError(
            "refusing to overwrite input file with its own XML export: "
            + input_file
        )

    with open(xml_output_file, "wb") as xml_out:
        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed to tshark unchanged.
        # The exit status is not enforced, as before; unusable output
        # surfaces as a parse error in makeDataframe.
        subprocess.run(["tshark", "-r", input_file, "-T", "pdml"], stdout=xml_out)

    column_names, values = makeDataframe(xml_output_file)
    return pd.DataFrame(values, columns=column_names)

In [None]:
frames = []
input_folder = "../example_traces/rrc/"
# Sorted listing keeps the row order reproducible; os.listdir order is arbitrary.
for input_file in sorted(os.listdir(input_folder)):
    input_path = os.path.join(input_folder, input_file)
    # Convert capture files only. This skips sub-directories and the .xml
    # exports that earlier runs leave next to the captures.
    is_capture = input_file.lower().endswith((".pcap", ".pcapng"))
    if not (is_capture and os.path.isfile(input_path)):
        continue
    frames.append(prepare_dataframe_from_pcap_file(input_path))
# A single concat avoids re-copying the accumulated frame for every file.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Notebook cell output: (rows, columns) of the frame combined from all traces.
df.shape

In [None]:
# Create the output folder if it does not exist yet, then persist the
# unfiltered frame as CSV.
os.makedirs("../output", exist_ok=True)
df.to_csv("../output/rrc_data_raw.csv")

## Handle Missing Values

In [None]:
# Blank or whitespace-only cells mark fields a packet did not carry. Turn them
# into NaN first so the filters below count them as missing.
df = df.replace(r'^\s*$', np.nan, regex=True)
# Keep only columns with at least 50% non-missing values. thresh is a count
# of required non-missing cells, hence the integer ceiling.
df = df.dropna(axis=1, thresh=int(np.ceil(len(df) * 0.5)))
# Drop columns with no variation. dropna=False treats "missing" as a value of
# its own, so a field that is present in some rows and absent in others stays.
df = df.loc[:, df.nunique(dropna=False) > 1]
# Encode every remaining missing value as -1.
df = df.fillna(-1)

In [None]:
# Notebook cell output: (rows, columns) left after the missing-value handling.
df.shape

## Save processed dataset

In [None]:
# Persist the cleaned frame as CSV.
df.to_csv("../output/rrc_data_cleaned.csv")