### Imports

In [1]:
import pandas as pd

### Loading Dataset

In [2]:
file_path = "Datasets/kddcup.data_10_percent.gz"
file = pd.read_csv(file_path)
print(file.columns)  # Print all column names to identfy the label column

Index(['0', 'tcp', 'http', 'SF', '181', '5450', '0.1', '0.2', '0.3', '0.4',
       '0.5', '1', '0.6', '0.7', '0.8', '0.9', '0.10', '0.12', '0.13', '0.14',
       '0.15', '0.16', '8', '8.1', '0.00', '0.00.1', '0.00.2', '0.00.3',
       '1.00', '0.00.4', '0.00.5', '9', '9.1', '1.00.1', '0.00.6', '0.11',
       '0.00.7', '0.00.8', '0.00.9', '0.00.10', '0.00.11', 'normal.'],
      dtype='object')


In [3]:
# Checking the frequencis of occurences of each of the classes
print(file['normal.'].value_counts())

smurf.              280790
neptune.            107201
normal.              97277
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: normal., dtype: int64


#### Changing dataset for binary SVM 

In [8]:
# Create a mapping for the labels. Note : Smurf and Neptune are types of DOS indicating spam, whereas normal indicates normal (not spam)!
label_mapping = {
    'smurf.': -1,
    'neptune.': -1,
    'normal.': 1
}

# Making a copy to avoid SettingWithCopyWarning
filtered_file = file[file['normal.'].isin(label_mapping.keys())].copy()  

# Map the labels to -1 for smurf and neptune, and 1 for normal
filtered_file.loc[:, 'normal.'] = filtered_file['normal.'].map(label_mapping)

# Now, the 'normal.' column will contain -1 for smurf and neptune, and 1 for normal
print(filtered_file[['normal.']].value_counts())  # Display the updated labels

normal.
-1         387991
 1          97277
dtype: int64


### Training Binary SVM (Linear Kernel) : 

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [7]:
# Split the data into features (X) and labels (y)
X = filtered_file.drop(columns=['normal.'])  
y = filtered_file['normal.']

# Ensure that features are numeric for SVM
X = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9998866610340635

Classification Report:
               precision    recall  f1-score   support

          -1       1.00      1.00      1.00     77599
           1       1.00      1.00      1.00     19455

    accuracy                           1.00     97054
   macro avg       1.00      1.00      1.00     97054
weighted avg       1.00      1.00      1.00     97054



### Training on Multi Class SVM (Linear and RBF Kernel) : 

In [9]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

In [18]:
# Assign feature and target columns
X = file.iloc[:, :-1]  
y = file.iloc[:, -1]

# Identify non-numeric columns
non_numeric_cols = X.select_dtypes(include=['object']).columns

# Encoding the non-numeric columns
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Save encoders for future use

# Encoding the target column
label_encoder_y = LabelEncoder()
y = label_encoder_y.fit_transform(y)

# Scaling features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Spliting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Training the Linear SVM model
svm_model_linear = SVC(kernel='linear', random_state=42, C=1, gamma='scale')
svm_model_linear.fit(X_train, y_train)
y_pred_linear = svm_model_linear.predict(X_test)

# Training the RBF Kernal SVM model
svm_model_rbf = SVC(kernel='rbf', random_state=42, C=1, gamma='scale')
svm_model_rbf.fit(X_train, y_train)
y_pred_rbf = svm_model_rbf.predict(X_test)

In [19]:
report_linear = classification_report(y_test, y_pred_linear, zero_division=0, output_dict=True)
report_rbf = classification_report(y_test, y_pred_rbf, zero_division=0, output_dict=True)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_linear))
report_linear = classification_report(y_test, y_pred_linear, zero_division = 0)
print("\nClassification Report for Linear Kernel :\n\n", report_linear)

print("Accuracy:", accuracy_score(y_test, y_pred_rbf))
report_rbf = classification_report(y_test, y_pred_rbf, zero_division = 0)
print("\nClassification Report for RBF Kernel :\n\n", report_rbf)

Accuracy: 0.9991903161815311

Classification Report for Linear Kernel :

               precision    recall  f1-score   support

           0       0.99      0.99      0.99       441
           1       0.83      0.83      0.83         6
           2       1.00      0.50      0.67         2
           3       1.00      0.91      0.95        11
           4       0.50      0.50      0.50         2
           5       0.99      0.97      0.98       249
           6       1.00      1.00      1.00         4
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1
           9       1.00      1.00      1.00     21440
          10       0.88      0.91      0.89        46
          11       1.00      1.00      1.00     19455
          12       1.00      1.00      1.00         1
          13       0.50      1.00      0.67         1
          14       1.00      1.00      1.00        53
          15       1.00      1.00      1.00       208
       

In [None]:
# List of kernels to test
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Dictionary to store reports
reports = {}

# Loop through each kernel and train the model
for kernel in kernels:
    if kernel == 'poly':
        model = SVC(kernel=kernel, degree=3, coef0=1)  # Polynomial kernel with degree 3 and coef0=1
    elif kernel == 'rbf':
        model = SVC(kernel=kernel, gamma='scale')  # RBF kernel with default gamma
    elif kernel == 'sigmoid':
        model = SVC(kernel=kernel, coef0=1)  # Sigmoid kernel with coef0=1
    else:
        model = SVC(kernel=kernel)  # Linear kernel
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Generate and store classification report
    report = classification_report(y_test, y_pred, zero_division=0)
    reports[kernel] = report

# Print classification reports for all kernels
for kernel, report in reports.items():
    print(f"\nClassification Report for {kernel.capitalize()} Kernel:\n")
    print(report)