# Iris Data

In [1]:
from sklearn.datasets import load_iris
import pandas as pd

data = load_iris()
iris_df = pd.DataFrame(data.data, columns=data.feature_names)
iris_df['target'] = data.target


In [2]:
iris_df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
iris = iris_df.iloc[: , :-1]
iris.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
from sklearn.preprocessing import MinMaxScaler
from minisom import MiniSom

# Normalization
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(iris_df.iloc[:, :-1])

# SOM initialization
som = MiniSom(x=10, y=10, input_len=4, sigma=1.0, learning_rate=0.5)
som.random_weights_init(normalized_data)
som.train_random(normalized_data, 100)


Calculating BMU Distance

In [5]:
import numpy as np

def calculate_bmu_distances(som, data):
    return [np.linalg.norm(sample - som.get_weights()[som.winner(sample)])
            for sample in data]

distances = calculate_bmu_distances(som, normalized_data)
iris_df['bmu_distance'] = distances


In [6]:
from scipy.stats import zscore

iris_df['z_score'] = zscore(distances)
anomalies = iris_df[iris_df['z_score'].abs() > 2]


Anomalies

In [7]:
print(len(anomalies))
anomalies


4


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,bmu_distance,z_score
100,6.3,3.3,6.0,2.5,2,0.268433,2.953401
114,5.8,2.8,5.1,2.4,2,0.287139,3.260047
117,7.7,3.8,6.7,2.2,2,0.325413,3.8875
131,7.9,3.8,6.4,2.0,2,0.340708,4.138227


# KDD Cup Dataset

In [8]:
import pandas as pd
import numpy as np

COLUMNS = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]

def load_and_preprocess_kdd(file_path):
        df = pd.read_csv(
            file_path,
            header=None,
            names=COLUMNS,
            dtype={
                'protocol_type': 'category',
                'service': 'category',
                'flag': 'category',
            }
        )

        print("Data loaded successfully. First 5 rows:")
        print(df.head())

        categorical_cols = df.select_dtypes(include=['category']).columns

        print()
        print("Categorical column analysis:")
        for col in categorical_cols:
            unique_vals = df[col].unique()
            print(f"{col}: {len(unique_vals)} unique values")
            print(f"Sample values: {list(unique_vals[:3])}")
            print()

        # Vectorization strategy
        for col in categorical_cols:

            # One-hot encode
            if len(df[col].unique()) < 10:
                df = pd.concat([
                    df.drop(col, axis=1),
                    pd.get_dummies(df[col], prefix=col)
                ], axis=1)
            else:
                # Label encode
                df[col] = df[col].astype('category').cat.codes

        num_cols = df.select_dtypes(include=np.number).columns
        df[num_cols] = df[num_cols].astype('float32')

        return df


In [9]:
features = load_and_preprocess_kdd('./kddcup.testdata.unlabeled_10_percent')


Data loaded successfully. First 5 rows:
   duration protocol_type  service flag  src_bytes  dst_bytes  land  \
0         0           udp  private   SF        105        146     0   
1         0           udp  private   SF        105        146     0   
2         0           udp  private   SF        105        146     0   
3         0           udp  private   SF        105        146     0   
4         0           udp  private   SF        105        146     0   

   wrong_fragment  urgent  hot  ...  dst_host_count  dst_host_srv_count  \
0               0       0    0  ...             255                 254   
1               0       0    0  ...             255                 254   
2               0       0    0  ...             255                 254   
3               0       0    0  ...             255                 254   
4               0       0    0  ...             255                 254   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     1.0    

In [10]:
if features is not None:
    print()
    print("Processed features shape:", features.shape)
    print("Sample preprocessed data:")
    print(features.iloc[:3, :5])



Processed features shape: (311029, 43)
Sample preprocessed data:
   duration  service  flag  src_bytes  dst_bytes
0       0.0     12.0   5.0      105.0      146.0
1       0.0     12.0   5.0      105.0      146.0
2       0.0     12.0   5.0      105.0      146.0


In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
som_input = scaler.fit_transform(features)


In [12]:
# SOM initialization
som = MiniSom(x=50, y=50, input_len=som_input.shape[1], sigma=1.0, learning_rate=0.5)
som.random_weights_init(som_input)
som.train_random(som_input, 1000)


Calculating BMU Distance

In [13]:
import numpy as np

def calculate_bmu_distances(som, data):
    return [np.linalg.norm(sample - som.get_weights()[som.winner(sample)])
            for sample in data]

distances = calculate_bmu_distances(som, som_input)
features['bmu_distance'] = distances


In [14]:
from scipy.stats import zscore

features['z_score'] = zscore(distances)
anomalies = features[features['z_score'].abs() > 2]


Anomalies

In [15]:
print(len(anomalies))
print(anomalies)


9311
        duration  service  flag  src_bytes  dst_bytes  land  wrong_fragment  \
6            0.0      4.0   5.0       29.0        0.0   0.0             0.0   
37           1.0     13.0   5.0     1661.0      330.0   0.0             0.0   
38          20.0      8.0   5.0      232.0      765.0   0.0             0.0   
44           0.0      9.0   5.0      615.0        0.0   0.0             0.0   
71           1.0     13.0   5.0     1018.0      333.0   0.0             0.0   
...          ...      ...   ...        ...        ...   ...             ...   
310139       0.0      4.0   5.0       44.0       44.0   0.0             0.0   
310140       0.0      4.0   5.0       44.0       44.0   0.0             0.0   
310141       0.0      4.0   5.0       45.0       45.0   0.0             0.0   
310186       0.0      4.0   5.0       45.0      127.0   0.0             0.0   
310907       0.0     12.0   5.0      105.0      147.0   0.0             0.0   

        urgent  hot  num_failed_logins  ...  d