In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
# Load the KDD Cup 99 dataset through TensorFlow Datasets.
# `with_info=True` makes `tfds.load` return a (dataset, info) pair; `dataset`
# is indexed by split name ('train' / 'test') in the cells below.
# NOTE: the previous bare `tfds.list_builders()` call was removed: its result
# was discarded (it was not the last expression of the cell, so nothing was
# displayed). Run it in a scratch cell if you need to browse available builders.
dataset_name = 'kddcup99'
dataset, info = tfds.load(dataset_name, with_info=True)
print(info)

Downloading and preparing dataset 18.62 MiB (download: 18.62 MiB, generated: 5.25 GiB, total: 5.27 GiB) to /root/tensorflow_datasets/kddcup99/1.0.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

In [None]:
import pandas as pd
# Materialise both splits as pandas DataFrames.
# NOTE(review): this pulls each split fully into memory; the download log above
# reports ~5 GiB generated, so make sure the kernel has enough RAM.
train_df, test_df = (
    tfds.as_dataframe(dataset[split_name]) for split_name in ('train', 'test')
)

# Quick look at the test split: column names, numeric summary statistics and
# the distribution of the `protocol_type` feature, one after the other.
print(
    test_df.columns,
    test_df.describe(),
    test_df['protocol_type'].value_counts(),
    sep='\n',
)

Index(['count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count',
       'dst_host_diff_srv_rate', 'dst_host_rerror_rate',
       'dst_host_same_src_port_rate', 'dst_host_same_srv_rate',
       'dst_host_serror_rate', 'dst_host_srv_count',
       'dst_host_srv_diff_host_rate', 'dst_host_srv_rerror_rate',
       'dst_host_srv_serror_rate', 'duration', 'flag', 'hot', 'is_guest_login',
       'is_hot_login', 'label', 'land', 'logged_in', 'num_access_files',
       'num_compromised', 'num_failed_logins', 'num_file_creations',
       'num_outbound_cmds', 'num_root', 'num_shells', 'protocol_type',
       'rerror_rate', 'root_shell', 'same_srv_rate', 'serror_rate', 'service',
       'src_bytes', 'srv_count', 'srv_diff_host_rate', 'srv_rerror_rate',
       'srv_serror_rate', 'su_attempted', 'urgent', 'wrong_fragment'],
      dtype='object')
               count  diff_srv_rate     dst_bytes  dst_host_count  \
count  311029.000000  311029.000000  3.110290e+05   311029.000000   
mean      269.24701

In [None]:
# Report, for every column of the training frame, how many distinct values it
# holds and which dtype pandas assigned to it (helps tell categorical-like
# columns from continuous ones).
for column, series in train_df.items():
    print(f"Column: {column}, Unique Values: {series.nunique()}, Data Type: {series.dtype}")

Column: count, Unique Values: 502, Data Type: int32
Column: diff_srv_rate, Unique Values: 100, Data Type: float32
Column: dst_bytes, Unique Values: 9202, Data Type: int32
Column: dst_host_count, Unique Values: 256, Data Type: int32
Column: dst_host_diff_srv_rate, Unique Values: 101, Data Type: float32
Column: dst_host_rerror_rate, Unique Values: 101, Data Type: float32
Column: dst_host_same_src_port_rate, Unique Values: 101, Data Type: float32
Column: dst_host_same_srv_rate, Unique Values: 101, Data Type: float32
Column: dst_host_serror_rate, Unique Values: 100, Data Type: float32
Column: dst_host_srv_count, Unique Values: 256, Data Type: int32
Column: dst_host_srv_diff_host_rate, Unique Values: 58, Data Type: float32
Column: dst_host_srv_rerror_rate, Unique Values: 100, Data Type: float32
Column: dst_host_srv_serror_rate, Unique Values: 101, Data Type: float32
Column: duration, Unique Values: 745, Data Type: int32
Column: flag, Unique Values: 11, Data Type: int64
Column: hot, Unique V

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Train a linear SVM on the training split, then report accuracy on a held-out
# validation fold and on the official test split.
#
# Changes relative to the first version of this cell:
#   * The label encoder is fitted on the label vocabulary of BOTH splits, so
#     `transform(y_test)` no longer raises ValueError when the test split
#     contains a class that never occurs in the training split.
#   * `SVC(kernel='linear')` was replaced by `LinearSVC`: kernel SVC fit time
#     grows at least quadratically with the number of samples, which is
#     impractical for a training split of this size.
#   * Features are standardised before fitting; the raw columns mix byte
#     counts, connection counts and [0, 1] rates, and SVMs are scale-sensitive.

# Set the target column based on your exploration
target_column_name = "label"

# Separate features (X) and labels (y)
X_train = train_df.drop(columns=target_column_name)
y_train = train_df[target_column_name]
X_test = test_df.drop(columns=target_column_name)
y_test = test_df[target_column_name]

# Encode labels. Only the *set of label values* is taken from the test split
# (no test features or label frequencies), so this does not leak information
# into training; it just guarantees every test label has an encoding.
# NOTE(review): the KDD Cup 99 test data is known to include attack types
# absent from the training data - confirm for this TFDS build.
label_encoder = LabelEncoder()
label_encoder.fit(sorted(set(y_train.unique()) | set(y_test.unique())))
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Hold out 20% of the training data as a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Linear SVM (one-vs-rest for multi-class) behind a scaler. Wrapping both in a
# pipeline ensures the scaler statistics come from the training fold only and
# are re-applied unchanged to validation and test data.
# `dual=False` solves the primal problem, the recommended setting when
# n_samples > n_features.
# NOTE(review): integer-coded categorical columns (e.g. `protocol_type`,
# `service`, `flag`) are still treated as ordinal numbers here; consider
# one-hot encoding them - TODO confirm which columns are categorical.
svm_classifier = make_pipeline(
    StandardScaler(),
    LinearSVC(C=1.0, dual=False),
)

# Fit the scaler + SVM on the training fold
svm_classifier.fit(X_train, y_train)

# Make predictions on the validation data
y_val_pred = svm_classifier.predict(X_val)

# Evaluate the model on the validation data
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

# Make predictions on the test data
y_test_pred = svm_classifier.predict(X_test)

# Evaluate the model on the test data. Test rows whose class never appeared
# in training can never be predicted correctly and count as errors.
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy_test:.2f}")