In [None]:
### REMOVE CELL
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession



# Raw flow columns, grouped by the kind of quantity they measure.
# Every group below is assembled and standardised on its own later on.

# --- traffic volume ---
trasimitted_in_features = [
    "IN_BYTES",
    "IN_PKTS",
]
trasmitted_out_features = [
    "OUT_BYTES",
    "OUT_PKTS",
]
retransimtted_in_features = [
    "RETRANSMITTED_IN_BYTES",
    "RETRANSMITTED_IN_PKTS",
]
retransimtted_out_features = [
    "RETRANSMITTED_OUT_BYTES",
    "RETRANSMITTED_OUT_PKTS",
]

# --- throughput ---
throughput_in_features = [
    "SRC_TO_DST_SECOND_BYTES",
    "SRC_TO_DST_AVG_THROUGHPUT",
]
throughput_out_features = [
    "DST_TO_SRC_AVG_THROUGHPUT",
    "DST_TO_SRC_SECOND_BYTES",
]

# --- protocol level fields ---
ttl_features = [
    "MIN_TTL",
    "MAX_TTL",
]
tcp_flags_features = [
    "CLIENT_TCP_FLAGS",
    "SERVER_TCP_FLAGS",
    "TCP_FLAGS",
]
icmp_features = [
    "ICMP_TYPE",
    "ICMP_IPV4_TYPE",
]

# --- packet sizes and flow duration ---
flow_max_features = [
    "LONGEST_FLOW_PKT",
    "MAX_IP_PKT_LEN",
]
flow_min_features = [
    "SHORTEST_FLOW_PKT",
    "MIN_IP_PKT_LEN",
]
flow_duration_features = ["FLOW_DURATION_MILLISECONDS"]

# --- TCP window ---
tcp_in_features = ["TCP_WIN_MAX_IN"]
tcp_out_features = ["TCP_WIN_MAX_OUT"]

# Pipeline stages (assembler, scaler, assembler, scaler, ...) are collected here.
assemblers_and_scalers = []

def create_assembler_and_scaler(inputCols, outputCol):
    """Build the assemble-then-standardise stage pair for one feature group.

    Args:
        inputCols: Names of the columns to pack into a single vector column.
        outputCol: Name of the assembled vector column. The scaler writes its
            result to ``scaled_<outputCol>``.

    Returns:
        A ``(VectorAssembler, StandardScaler)`` tuple. The scaler reads the
        assembler's output column and is configured with both ``withMean`` and
        ``withStd`` enabled.
    """
    scaled_col = f"scaled_{outputCol}"
    group_assembler = VectorAssembler(inputCols=inputCols, outputCol=outputCol)
    group_scaler = StandardScaler(
        inputCol=outputCol,
        outputCol=scaled_col,
        withMean=True,
        withStd=True,
    )
    return group_assembler, group_scaler

# Group name -> raw input columns.
# The group name is used to build the generated column names
# ("<name>_features" and "scaled_<name>_features"), so the spelling of these
# keys is part of the resulting schema.
grouped_features = {
    "trasmitted_in": trasimitted_in_features,
    "trasmitted_out": trasmitted_out_features,
    "retransimtted_in": retransimtted_in_features,
    "retransimtted_out": retransimtted_out_features,
    "throughput_in": throughput_in_features,
    "throughput_out": throughput_out_features,
    "ttl": ttl_features,
    "tcp_flags": tcp_flags_features,
    "icmp": icmp_features,
    "flow_max": flow_max_features,
    "flow_min": flow_min_features,
    "flow_duration": flow_duration_features,
    "tcp_in": tcp_in_features,
    "tcp_out": tcp_out_features,
}

# For every group (in dict order) queue its assembler immediately followed by
# the scaler that consumes the assembled column.
for group_name, features in grouped_features.items():
    assembler, scaler = create_assembler_and_scaler(
        features, f"{group_name}_features"
    )
    assemblers_and_scalers += [assembler, scaler]

# Names of the per-group scaled columns, in the same order as grouped_features.
all_scaled_features = [f"scaled_{name}_features" for name in grouped_features]

# Concatenate every scaled group into one vector column.
final_assembler = VectorAssembler(
    inputCols=all_scaled_features,
    outputCol="final_features",
)

# PCA on the concatenated vector.
COMPONENTS = 3  # Number of desired principal components
pca = PCA(k=COMPONENTS, inputCol="final_features", outputCol="pca_features")

# Full pipeline: per-group assemble/scale stages, then concatenate, then PCA.
pipeline = Pipeline(stages=[*assemblers_and_scalers, final_assembler, pca])

# Fit on the encoded training data and project that same data.
# NOTE(review): encoded_train_data is not defined in this cell; it has to come
# from an earlier one.
pca_model = pipeline.fit(encoded_train_data)
pca_result = pca_model.transform(encoded_train_data)





In [None]:
## REMOVE CELL

from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
## COPY THIS AND TRY WITH ML DATASET
# Raw input columns for the PCA.
# These must be columns that exist in the frame passed to
# PCA_analysis_variance. The previous value (all_scaled_features) named the
# "scaled_<group>_features" columns, which are only produced by the grouped
# pipeline's transform and are absent from encoded_train_data, so the
# VectorAssembler failed with "scaled_trasmitted_in_features does not exist".
FEATURES = [
    # traffic volume
    "IN_BYTES",
    "IN_PKTS",
    "OUT_BYTES",
    "OUT_PKTS",
    "RETRANSMITTED_IN_BYTES",
    "RETRANSMITTED_IN_PKTS",
    "RETRANSMITTED_OUT_BYTES",
    "RETRANSMITTED_OUT_PKTS",
    # throughput
    "SRC_TO_DST_SECOND_BYTES",
    "SRC_TO_DST_AVG_THROUGHPUT",
    "DST_TO_SRC_AVG_THROUGHPUT",
    "DST_TO_SRC_SECOND_BYTES",
    # protocol level fields
    "MIN_TTL",
    "MAX_TTL",
    "CLIENT_TCP_FLAGS",
    "SERVER_TCP_FLAGS",
    "TCP_FLAGS",
    "ICMP_TYPE",
    "ICMP_IPV4_TYPE",
    # packet sizes and flow duration
    "LONGEST_FLOW_PKT",
    "MAX_IP_PKT_LEN",
    "SHORTEST_FLOW_PKT",
    "MIN_IP_PKT_LEN",
    "FLOW_DURATION_MILLISECONDS",
    # TCP window
    "TCP_WIN_MAX_IN",
    "TCP_WIN_MAX_OUT",
]
# Number of principal components to keep.
COMPONENTS = 3
def PCA_analysis_variance(data, features=FEATURES, components=COMPONENTS):
    """Assemble, standardise and PCA-project the given columns of ``data``.

    Args:
        data: DataFrame the pipeline is fitted on and then applied to.
        features: Column names packed into the ``features`` vector column.
        components: Number of principal components (``k``) to keep.

    Returns:
        ``(pca_result, pca_model)``: the transformed ``data`` (with the added
        ``features``, ``scaled_features`` and ``pca_features`` columns) and the
        fitted pipeline model.
    """
    # Run the PCA analysis and collect the results.
    stages = [
        VectorAssembler(inputCols=features, outputCol="features"),
        StandardScaler(
            inputCol="features",
            outputCol="scaled_features",
            withStd=True,
            withMean=True,
        ),
        PCA(k=components, inputCol="scaled_features", outputCol="pca_features"),
    ]
    fitted_pipeline = Pipeline(stages=stages).fit(data)
    projected = fitted_pipeline.transform(data)

    return projected, fitted_pipeline


# Perform PCA analysis
# Uses the default FEATURES / COMPONENTS of PCA_analysis_variance.
# This rebinds pca_result and pca_model, which the grouped-pipeline cell also
# assigns; later cells see the values produced here.
pca_result, pca_model = PCA_analysis_variance(encoded_train_data)

IllegalArgumentException: scaled_trasmitted_in_features does not exist. Available: L4_SRC_PORT, L4_DST_PORT, PROTOCOL, L7_PROTO, IN_BYTES, IN_PKTS, OUT_BYTES, OUT_PKTS, TCP_FLAGS, CLIENT_TCP_FLAGS, SERVER_TCP_FLAGS, FLOW_DURATION_MILLISECONDS, DURATION_IN, DURATION_OUT, MIN_TTL, MAX_TTL, LONGEST_FLOW_PKT, SHORTEST_FLOW_PKT, MIN_IP_PKT_LEN, MAX_IP_PKT_LEN, SRC_TO_DST_SECOND_BYTES, DST_TO_SRC_SECOND_BYTES, RETRANSMITTED_IN_BYTES, RETRANSMITTED_IN_PKTS, RETRANSMITTED_OUT_BYTES, RETRANSMITTED_OUT_PKTS, SRC_TO_DST_AVG_THROUGHPUT, DST_TO_SRC_AVG_THROUGHPUT, NUM_PKTS_UP_TO_128_BYTES, NUM_PKTS_128_TO_256_BYTES, NUM_PKTS_256_TO_512_BYTES, NUM_PKTS_512_TO_1024_BYTES, NUM_PKTS_1024_TO_1514_BYTES, TCP_WIN_MAX_IN, TCP_WIN_MAX_OUT, ICMP_TYPE, ICMP_IPV4_TYPE, DNS_QUERY_ID, DNS_QUERY_TYPE, DNS_TTL_ANSWER, FTP_COMMAND_RET_CODE, attack_index, dst_subnet_index, src_subnet_index

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook: local mode on all
# cores, with the driver and executor memory settings both set to 16g.
spark = (
    SparkSession.builder
    .appName("Multiclass classification IoT")
    .master("local[*]")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

# Load datasets
def load_test_data(debug=False, debug_limit=300000):
    """Load the training DataFrame from parquet.

    Despite its name, this function reads *training* files: it returns
    ``ml_data_train.parquet`` (truncated) in debug mode and
    ``simplified_train_data.parquet`` otherwise. The name is kept so existing
    callers keep working.

    Args:
        debug: When true, read ``ml_data_train.parquet`` and keep only the
            first ``debug_limit`` rows for faster iteration.
        debug_limit: Row cap applied in debug mode. Defaults to 300000, the
            value that used to be hard-coded.

    Returns:
        The DataFrame read through the module-level ``spark`` session.
    """
    if debug:
        return spark.read.parquet("ml_data_train.parquet").limit(debug_limit)

    return spark.read.parquet("simplified_train_data.parquet")

# debug=False: read simplified_train_data.parquet in full (no row limit).
processed_train_data = load_test_data(debug=False)

In [None]:
from pyspark.ml.feature import PCA, VectorAssembler
from pyspark.ml import Pipeline
import pandas as pd

def PCA_loadings(pca_model, features=None, k=None):
    """Return the PCA loadings of a fitted pipeline as a pandas DataFrame.

    Args:
        pca_model: Fitted pipeline model whose last stage exposes the principal
            components matrix as ``pc`` (read via ``pca_model.stages[-1].pc``).
        features: Row labels, one per row of the loadings matrix. When omitted,
            the module-level ``FEATURES`` is looked up at call time, so a
            re-run of the configuration cell is honoured without redefining
            this function.
        k: Optional expected number of components. When given, it must equal
            the number of columns of the loadings matrix. When omitted, the
            column count is taken from the model itself.

    Returns:
        DataFrame indexed by feature name with columns ``PC1`` .. ``PCn``.

    Raises:
        ValueError: If ``features`` or ``k`` disagree with the shape of the
            loadings matrix stored in the model.
    """
    if features is None:
        features = FEATURES
    features = list(features)

    # Extract PCA loadings: one row per input dimension, one column per component
    pca_stage = pca_model.stages[-1]
    loadings = pca_stage.pc.toArray()
    n_inputs, n_components = loadings.shape

    # Fail early with an actionable message instead of pandas' generic
    # "Shape of passed values ..." error.
    if len(features) != n_inputs:
        raise ValueError(
            f"features has {len(features)} names but the PCA model has "
            f"{n_inputs} input dimensions; pass the feature list the model "
            "was fitted with"
        )
    if k is not None and k != n_components:
        raise ValueError(
            f"k={k} does not match the {n_components} components stored in "
            "the PCA model"
        )

    # Create a DataFrame for loadings
    component_names = [f'PC{i + 1}' for i in range(n_components)]
    return pd.DataFrame(loadings, index=features, columns=component_names)

# Build the loadings table from the already-fitted pca_model.
# Nothing is refitted here: PCA_loadings only reads the last pipeline stage.
loadings_df = PCA_loadings(pca_model)

# Display PCA loadings
print(loadings_df)


ValueError: Shape of passed values is (14, 3), indices imply (8, 3)

In [None]:
# Graph PCA loadings for each feature
def plot_pca_loadings(loadings_df):
    """Show ``loadings_df`` as an annotated heatmap titled 'PCA Loadings'.

    The figure is 24x24 inches, uses the 'coolwarm' colour map and is displayed
    immediately; nothing is returned.

    NOTE(review): relies on ``plt`` and ``sns`` already being in scope; neither
    is imported in this cell.
    """
    _, ax = plt.subplots(figsize=(24, 24))
    sns.heatmap(loadings_df, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title('PCA Loadings')
    plt.show()


plot_pca_loadings(loadings_df)

In [None]:
# Plot PCA loadings for the top 10 features.
# Features are ranked by the sum of their absolute loadings over all components.
top_features = (
    loadings_df.abs()
    .sum(axis=1)
    .sort_values(ascending=False)
    .head(10)
    .index
)
top_loadings_df = loadings_df.loc[top_features]
plot_pca_loadings(top_loadings_df)

In [None]:
from typing import Protocol
from pyspark.ml.feature import VectorAssembler, StandardScaler, FeatureHasher, OneHotEncoder
from pyspark.ml import Pipeline

# One-hot encode the destination subnet index.
# The original note justified this with "higher cardinality" for the
# destination and "lower cardinality" for the source.
# NOTE(review): that rationale looks inverted (hashing is normally the choice
# for the higher-cardinality column) - confirm which column is which.
dst_subnet_encoder = OneHotEncoder(inputCol="dst_subnet_index", outputCol="dst_subnet_vec")

# Source address
# Hash the source subnet into a fixed-size vector of 1024 slots.
# categoricalCols is set explicitly because FeatureHasher treats numeric
# columns as continuous values by default: the column *name* is hashed to pick
# one slot and the numeric value is stored there, so every row would land in
# the same slot. Listing the column as categorical hashes "column=value"
# instead, giving each subnet its own slot (up to collisions).
# NOTE(review): assumes src_subnet_index is a numeric index column, as its
# name suggests - confirm. The setting is harmless for a string column.
src_subnet_hasher = FeatureHasher(
    inputCols=["src_subnet_index"],
    outputCol="src_subnet_hashed",
    numFeatures=1024,
    categoricalCols=["src_subnet_index"],
)

# Column groups for the numerical features, by the kind of quantity measured.

# --- bytes and packets transmitted ---
trasimitted_in_features = [
    "IN_BYTES",
    "IN_PKTS",
]
trasmitted_out_features = [
    "OUT_BYTES",
    "OUT_PKTS",
]
retransimtted_in_features = [
    "RETRANSMITTED_IN_BYTES",
    "RETRANSMITTED_IN_PKTS",
]
retransimtted_out_features = [
    "RETRANSMITTED_OUT_BYTES",
    "RETRANSMITTED_OUT_PKTS",
]

# --- throughput ---
throughput_in_features = [
    "SRC_TO_DST_SECOND_BYTES",
    "SRC_TO_DST_AVG_THROUGHPUT",
]
throughput_out_features = [
    "DST_TO_SRC_AVG_THROUGHPUT",
    "DST_TO_SRC_SECOND_BYTES",
]

# --- protocol level fields ---
ttl_features = [
    "MIN_TTL",
    "MAX_TTL",
]
tcp_flags_features = [
    "CLIENT_TCP_FLAGS",
    "SERVER_TCP_FLAGS",
    "TCP_FLAGS",
]
icmp_features = [
    "ICMP_TYPE",
    "ICMP_IPV4_TYPE",
]

# --- packet sizes ---
flow_max_features = [
    "LONGEST_FLOW_PKT",
    "MAX_IP_PKT_LEN",
]
flow_min_features = [
    "SHORTEST_FLOW_PKT",
    "MIN_IP_PKT_LEN",
]

# --- durations ---
flow_duration_features = ["FLOW_DURATION_MILLISECONDS"]

# --- TCP window ---
tcp_in_features = ["TCP_WIN_MAX_IN"]
tcp_out_features = ["TCP_WIN_MAX_OUT"]

# --- directional durations ---
duration_in_features = ["DURATION_IN"]
duration_out_features = ["DURATION_OUT"]

# --- protocol and ports ---
protocol_features = ["PROTOCOL"]
l4_src_port_features = ["L4_SRC_PORT"]
l4_dst_port_features = ["L4_DST_PORT"]

# --- application layer ---
dns_features = [
    "DNS_QUERY_ID",
    "DNS_QUERY_TYPE",
    "DNS_TTL_ANSWER",
]
ftp_features = ["FTP_COMMAND_RET_CODE"]
l7_proto_features = ["L7_PROTO"]

# --- packet counts bucketed by size ---
num_packets_by_size_features = [
    "NUM_PKTS_UP_TO_128_BYTES",
    "NUM_PKTS_128_TO_256_BYTES",
    "NUM_PKTS_256_TO_512_BYTES",
    "NUM_PKTS_512_TO_1024_BYTES",
    "NUM_PKTS_1024_TO_1514_BYTES",
]



# Vector assemblers feeding the scalers of the scaling pipeline.
# The pipeline references bytes_pkts_assembler, throughput_assembler,
# pkt_len_assembler and tcp_win_assembler. The first three were not defined in
# this cell and the fourth was misspelled as `tcp__assembler`, so building the
# pipeline raised NameError.
# NOTE(review): the column groupings of the three added assemblers are an
# assumption derived from the feature lists defined earlier in this cell -
# confirm them against the intended feature set.

# Assemble numerical features for bytes and packets transmitted
bytes_pkts_assembler = VectorAssembler(
    inputCols=trasimitted_in_features + trasmitted_out_features,
    outputCol="bytes_pkts_features",
)

# Assemble numerical features for flow duration
flow_duration_assembler = VectorAssembler(
    inputCols=["FLOW_DURATION_MILLISECONDS"],
    outputCol="flow_duration_feature"
)

# Assemble numerical features for throughput (both directions)
throughput_assembler = VectorAssembler(
    inputCols=throughput_in_features + throughput_out_features,
    outputCol="throughput_features",
)

# Assemble numerical features for packet lengths (largest and smallest)
pkt_len_assembler = VectorAssembler(
    inputCols=flow_max_features + flow_min_features,
    outputCol="pkt_len_features",
)

# Assemble numerical features for the TCP window sizes
tcp_win_assembler = VectorAssembler(
    inputCols=["TCP_WIN_MAX_IN", "TCP_WIN_MAX_OUT"],
    outputCol="tcp_win_features",
)

# Apply StandardScaler to the assembled vectors
def _zscore_scaler(input_col, output_col):
    """Return a StandardScaler that centres and scales ``input_col`` into ``output_col``."""
    return StandardScaler(
        inputCol=input_col,
        outputCol=output_col,
        withStd=True,
        withMean=True,
    )


# One scaler per assembled vector column; each writes a "scaled_..." column.
bytes_pkts_scaler = _zscore_scaler("bytes_pkts_features", "scaled_bytes_pkts_features")
flow_duration_scaler = _zscore_scaler("flow_duration_feature", "scaled_flow_duration")
throughput_scaler = _zscore_scaler("throughput_features", "scaled_throughput_features")
pkt_len_scaler = _zscore_scaler("pkt_len_features", "scaled_pkt_len_features")
tcp_win_scaler = _zscore_scaler("tcp_win_features", "scaled_tcp_win_features")

# Create a pipeline for scaling.
# Stage order: the two subnet encoders first, then every assembler directly
# followed by the scaler that reads its output column.
scaling_pipeline = Pipeline(
    stages=[
        dst_subnet_encoder, src_subnet_hasher,
        bytes_pkts_assembler, bytes_pkts_scaler,
        flow_duration_assembler, flow_duration_scaler,
        throughput_assembler, throughput_scaler,
        pkt_len_assembler, pkt_len_scaler,
        tcp_win_assembler, tcp_win_scaler,
    ]
)

# Fit the scaling pipeline on the encoded training data only.
scalling_model = scaling_pipeline.fit(encoded_train_data)

# Apply the fitted pipeline to both the encoded training and test data.
# This rebinds processed_train_data, which is also assigned where the parquet
# data is loaded.
processed_train_data = scalling_model.transform(encoded_train_data)
processed_test_data = scalling_model.transform(encoded_test_data)