# Setup

In [None]:
# Install the third-party packages this notebook imports, using the %pip magic
# so they are installed for the running kernel.
%pip install pyshark
%pip install nest_asyncio
%pip install pandas

import sys
# Add the parent directory to the import path so the `src` package imported
# below can be found. NOTE(review): '..' is resolved against the current
# working directory — assumes the notebook is launched from a folder one level
# below the project root; confirm.
sys.path.append('..')

from src.preprocessors.pcap_preprocessor import PcapPreprocessor
import nest_asyncio
# Patch asyncio so an event loop can be re-entered. Presumably required because
# pyshark drives its own asyncio loop inside Jupyter's already-running loop —
# TODO confirm.
nest_asyncio.apply()


preprocessor = PcapPreprocessor()

# Load everything the preprocessor exposes. Later cells call DataFrame methods
# (.columns, .info(), .isnull(), .head()) on the result, so it is treated as a
# pandas DataFrame throughout the notebook.
base_data = preprocessor.get_all_data()
print(base_data)

# Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from src.visualization.visualization import Visualization
from IPython.display import display, HTML

# Single shared helper instance; every plot_* call in the cells below goes
# through this object.
visualization = Visualization()

# Check available columns in base_data (printed as a plain Python list of the
# column labels).
print("Columns in base_data:", base_data.columns.tolist())

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line.
base_data.info()

In [None]:
print("Column Information:")
# dtypes is a Series indexed by column name; to_frame() turns it into a
# one-column DataFrame so it can be rendered as an HTML table, and
# header=["Data Type"] supplies the label for that single column.
display(HTML(base_data.dtypes.to_frame().to_html(header=["Data Type"], index=True)))

In [None]:
print("\nFirst 5 lines of the dataset as a table:")
# Render the first five rows as an HTML table instead of the plain-text repr.
display(HTML(base_data.head(5).to_html()))
# Last expression in the cell, so the notebook echoes its value: the number of
# distinct non-null values in the 'eth.dst_lg' column.
# NOTE(review): raises KeyError if the loaded data has no 'eth.dst_lg' column.
base_data['eth.dst_lg'].nunique()

In [None]:
# Function to filter columns by missing values and display the desired outputs
def filter_columns_by_missing_values(base_data, threshold=0.85):
    """Report per-column missingness and return the partially-missing columns.

    Columns are split into three buckets by their fraction of null values:

    * complete: exactly 0% missing (printed as a list);
    * partial: more than 0% and less than ``threshold`` missing (shown as
      an HTML table and returned);
    * sparse: ``threshold`` or more missing (printed as a list).

    Args:
        base_data: pandas DataFrame to analyse. It is not modified.
        threshold: Fraction in the range 0.0-1.0 at or above which a column
            is reported as mostly missing.

    Returns:
        A DataFrame holding only the *partial* columns of ``base_data``.
        Note that fully populated columns are excluded as well as the sparse
        ones, so the result is a view of where gaps exist rather than a
        cleaned copy of the whole dataset.
    """
    # Fraction (0.0-1.0) of null entries in each column.
    missing_fraction = base_data.isnull().mean()

    complete = missing_fraction[missing_fraction == 0]
    partial = missing_fraction[
        (missing_fraction > 0) & (missing_fraction < threshold)
    ]
    # ">=" rather than ">" so a column sitting exactly on the threshold is
    # still reported instead of silently falling between the two buckets.
    sparse = missing_fraction[missing_fraction >= threshold]

    # Build the label from the argument so the report stays truthful when the
    # caller passes a threshold other than the default 0.85.
    threshold_label = f"{threshold * 100:g}%"

    partial_df = pd.DataFrame({
        'Column': partial.index,
        'Missing Percentage': partial.values * 100,  # fraction -> percent
    })

    print("Columns with 0% Missing Values:", complete.index.tolist())
    print(
        f"Columns with >={threshold_label} Missing Values:",
        sparse.index.tolist(),
    )

    html_table = partial_df.to_html(index=False, escape=False)
    # "&lt;" renders as "<"; a raw "<" directly in front of the number is not
    # valid HTML markup.
    display(HTML(
        f"<h3>Columns with &lt;{threshold_label} Missing Values "
        f"(excluding 0%)</h3>{html_table}"
    ))

    return base_data[partial.index]

# Run the report with the default 85% cut-off and keep the returned subset
# (columns with some, but less than 85%, missing values). The result is not
# just an example: the missing-values heatmap cell below consumes it.
base_data_cleaned = filter_columns_by_missing_values(base_data, threshold=0.85)


In [None]:
# The heatmap is the only plot fed the filtered frame: base_data_cleaned holds
# just the columns that filter_columns_by_missing_values() returned, i.e. those
# with some but not mostly-missing values.
visualization.plot_missing_values_heatmap(base_data_cleaned)

In [None]:
# Every remaining plot receives the full, unfiltered base_data. What each
# plot_* method draws is defined in src.visualization.visualization and is not
# visible from this notebook.
visualization.plot_tcp_source_ports(base_data)

In [None]:
visualization.plot_udp_source_ports(base_data)

In [None]:
visualization.plot_top_ip_addresses(base_data)

In [None]:
visualization.plot_top_ip_dest_addresses(base_data)

In [None]:
visualization.plot_tcp_flags_distribution(base_data)

In [None]:
visualization.plot_dns_queries(base_data)

In [None]:
visualization.plot_correlation_matrix(base_data)

In [None]:
visualization.plot_tcp_stream_time_series(base_data)

In [None]:
visualization.plot_distribution(base_data)