# Import Libraries

In [None]:
import os
import csv
import ast
import requests
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_rows', None)

# Get Packet Fields

In [None]:
def get_packet_field_names(packet):
    """Build the flat list of feature column names for one PDML packet.

    Only ``proto`` children of ``packet`` whose ``name`` attribute equals
    ``'nas-eps'`` are inspected. Every direct ``field`` child of such a
    proto, and every ``field`` nested one level below it, contributes four
    names: ``<name>_show``, ``<name>_value``, ``<name>_size`` and
    ``<name>_unmaskedvalue``, where ``<name>`` is the field's ``name``
    attribute with dots replaced by underscores.

    Top-level fields whose name is empty or absent receive the placeholder
    ``nested_field<N>``; ``N`` counts from 1 within the packet. Nested
    fields keep their name unchanged, so an empty or absent nested name
    yields the bare suffixes (``_show``, ``_value``, ...).

    Args:
        packet: Element exposing ``findall`` and ``get`` (for example an
            ``xml.etree.ElementTree.Element`` for a ``<packet>`` node).

    Returns:
        list[str]: Column names in document order, four per field.
    """
    suffixes = ('_show', '_value', '_size', '_unmaskedvalue')
    packet_fields = []
    unnamed_count = 0
    for proto in packet.findall('proto'):
        if proto.get('name') != 'nas-eps':
            continue
        for field in proto.findall('field'):
            # A missing ``name`` attribute comes back as None; treat it like
            # an empty name instead of failing on ``None.replace``.
            field_name = (field.get('name') or '').replace('.', '_')
            if not field_name:
                unnamed_count += 1
                field_name = 'nested_field' + str(unnamed_count)
            packet_fields.extend(field_name + suffix for suffix in suffixes)
            for nested_field in field.findall('field'):
                nested_name = (nested_field.get('name') or '').replace('.', '_')
                packet_fields.extend(nested_name + suffix for suffix in suffixes)
    return packet_fields

# Align Packet Features

In [None]:
def align_lists(list1, list2, list2_values):
    """Reorder ``list2_values`` so that it follows the order of ``list1``.

    ``list2`` and ``list2_values`` are parallel lists: ``list2_values[i]`` is
    the value belonging to the name ``list2[i]``.

    Args:
        list1: Target ordering of names (one output entry per element).
        list2: Names actually present. Elements must be hashable. When a
            name occurs more than once, only its first occurrence is used.
        list2_values: Values parallel to ``list2``.

    Returns:
        list: One entry per element of ``list1``: the matching value from
        ``list2_values``, or ``''`` when the name is not in ``list2``.

    Raises:
        IndexError: If a matched position lies beyond the end of
            ``list2_values``.
    """
    # Remember only the first position of each name, which is what
    # ``list.index`` would return, but with a single pass over ``list2``.
    first_position = {}
    for position, name in enumerate(list2):
        first_position.setdefault(name, position)
    return [
        list2_values[first_position[name]] if name in first_position else ''
        for name in list1
    ]

# Make Dataframe from Packet Features

In [None]:
def makeDataframe(xmlfile):
    """Extract NAS-EPS field attributes from a PDML file as tabular data.

    The file is parsed with ``ET.parse`` and every ``packet`` child of the
    root becomes one row. For each ``field`` (and each ``field`` nested one
    level below it) inside a ``proto`` named ``'nas-eps'``, the ``show``,
    ``value``, ``size`` and ``unmaskedvalue`` attributes are collected, in
    the same order in which ``get_packet_field_names`` emits column names.

    Args:
        xmlfile: Path or file object accepted by ``ET.parse``.

    Returns:
        tuple[list, list]: ``(column_names, values)``. ``column_names`` is
        the union of the per-packet column names in first-seen order, so the
        layout is the same on every run. ``values`` holds one list per
        packet, aligned to ``column_names`` by ``align_lists``: ``''`` marks
        a column the packet does not have, ``None`` an attribute missing
        from the XML element.
    """
    attribute_names = ('show', 'value', 'size', 'unmaskedvalue')
    packets = ET.parse(xmlfile).getroot().findall('packet')

    # Compute each packet's column names once and reuse them for both the
    # global column list and the per-row alignment below.
    per_packet_columns = [get_packet_field_names(packet) for packet in packets]

    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # give an order that can change between interpreter runs.
    column_names = list(dict.fromkeys(
        name for columns in per_packet_columns for name in columns
    ))

    values = []
    for packet, columns in zip(packets, per_packet_columns):
        packet_fields = []
        for proto in packet.findall('proto'):
            if proto.get('name') != 'nas-eps':
                continue
            for field in proto.findall('field'):
                packet_fields.extend(field.get(attr) for attr in attribute_names)
                for nested_field in field.findall('field'):
                    packet_fields.extend(nested_field.get(attr) for attr in attribute_names)
        values.append(align_lists(column_names, columns, packet_fields))
    return column_names, values

# Prepare Dataframe from PCAP File

In [None]:
def prepare_dataframe_from_pcap_file(input_file):
    """Convert a capture file to PDML with tshark and load it as a DataFrame.

    The PDML is written next to the capture, using the capture's path with
    its extension replaced by ``.xml``. If that would be the capture's own
    path, ``.xml`` is appended instead so the input is never overwritten.

    Args:
        input_file: Path of the capture file passed to ``tshark -r``.

    Returns:
        pandas.DataFrame: One row per packet, built from the column names
        and values returned by ``makeDataframe``.

    Raises:
        IsADirectoryError: If ``input_file`` is a directory.
    """
    # Local import keeps this block self-contained; subprocess is stdlib.
    import subprocess

    # Fail early with the exception type the processing loop already
    # handles, before any output file is created.
    if os.path.isdir(input_file):
        raise IsADirectoryError("Is a directory: " + repr(input_file))

    # Replace only the extension. str.replace("pcap", "xml") would also
    # rewrite "pcap" inside directory names, and would leave the path
    # unchanged (clobbering the capture) when "pcap" is absent.
    base_path, _ = os.path.splitext(input_file)
    xml_output_file = base_path + ".xml"
    if xml_output_file == input_file:
        xml_output_file = input_file + ".xml"

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters reach tshark unchanged. The exit status is not
    # checked (as before); unusable output surfaces when it is parsed.
    with open(xml_output_file, "wb") as xml_out:
        subprocess.run(["tshark", "-r", input_file, "-T", "pdml"], stdout=xml_out)

    column_names, values = makeDataframe(xml_output_file)
    return pd.DataFrame(values, columns=column_names)

# Process NAS Traces

In [None]:
# Build one DataFrame holding the packets of every trace in input_folder.
input_folder = "../example_traces/nas/"
frames = []
# Sorted because os.listdir returns entries in arbitrary order, while labels
# are attached to rows purely by position later in the notebook.
for input_file in sorted(os.listdir(input_folder)):
    # PDML files generated by earlier runs live next to the captures; they
    # are outputs, not traces, so do not feed them back to tshark.
    if input_file.endswith(".xml"):
        continue
    try:
        frames.append(prepare_dataframe_from_pcap_file(os.path.join(input_folder, input_file)))
    except IsADirectoryError:
        # Sub-directories of the trace folder are skipped.
        continue
# Concatenate once at the end instead of re-copying the growing frame on
# every iteration; an empty folder still yields an empty DataFrame.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Write the extracted table to CSV, creating the output directory if it does
# not exist yet.
raw_output_path = "../output/nas_data_raw.csv"
os.makedirs(os.path.dirname(raw_output_path), exist_ok=True)
df.to_csv(raw_output_path)

## Encode Categorical Values

In [None]:
# Turn every object/category column into integer class labels.
encoder = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
# Missing entries are given an explicit "Unknown" class before encoding.
# NOTE(review): blank-string cells are not NaN, so they are encoded as their
# own class here rather than becoming "Unknown" - confirm this is intended.
df[categorical_cols] = df[categorical_cols].fillna("Unknown")
for column in categorical_cols:
    # The encoder is re-fitted for each column, so label ids are per-column.
    df[column] = encoder.fit_transform(df[column])

## Handle Missing Values

In [None]:
# Optional pruning steps, disabled by default; enabling them may give better results:
# df = df.dropna(axis=1, thresh=len(df) * 0.5)  # drop columns with more than 50% missing values
# df = df.loc[:, df.nunique() > 1]  # drop columns holding the same value in every row
blank_cell_pattern = r'^\s*$'  # matches empty or whitespace-only string cells
df = df.replace(blank_cell_pattern, np.nan, regex=True)
df = df.fillna(-1)  # blank cells (now NaN) and other missing values become -1
df = df.fillna(-1)  # second pass kept from the original; expected to be a no-op

## Label Dataset

In [None]:
# Collect the labels from every .txt file in folder_path and attach them to
# the rows of df by position.
folder_path = "../example_trace_labels/nas/"
all_labels = []
# Sorted because os.listdir returns entries in arbitrary order and the
# labels are matched to rows only by their position in all_labels.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".txt"):
        continue
    with open(os.path.join(folder_path, file_name), "r") as f:
        content = f.read().strip()
    # Everything after the first "=" is parsed as a Python literal and its
    # items are appended to the running label list.
    _, separator, literal_text = content.partition("=")
    if not separator:
        raise ValueError("Label file " + repr(file_name) + " contains no '=' assignment")
    labels = ast.literal_eval(literal_text.strip())
    all_labels.extend(labels)
if len(all_labels) < len(df):
    # Too few labels cannot be assigned; say so explicitly instead of
    # failing with a generic length-mismatch error.
    raise ValueError(
        "Found " + str(len(all_labels)) + " labels for " + str(len(df)) + " rows"
    )
df["label"] = all_labels[:len(df)]  # truncate if labels > rows

In [None]:
# Count how many rows carry each distinct label; left as a bare expression so
# the notebook displays the result as the cell output.
df['label'].value_counts()

## Save Processed Dataset

In [None]:
# Write the final DataFrame to CSV.
processed_output_path = "../output/fbs_nas.csv"
df.to_csv(processed_output_path)