In [1]:
import pandas as pd

In [2]:
import os

# The two data files the rest of the notebook reads from the working directory.
_required_files = ("KDDTrain+.txt", "KDDTest+.txt")
_missing_files = [name for name in _required_files if not os.path.exists(name)]

if _missing_files:
    # Only fall back to the interactive Colab upload widget when a file is
    # actually missing, so the notebook can be re-run (or run outside Colab
    # with the files already in place) without prompting again.
    from google.colab import files
    uploaded = files.upload()
else:
    # Nothing was uploaded this run; keep `uploaded` defined for any later use.
    uploaded = {}

Saving KDDTest+.txt to KDDTest+.txt
Saving KDDTrain+.txt to KDDTrain+.txt


In [3]:
# Load the train and test splits. header=None tells pandas the files carry no
# header row, so the columns are labelled with integers starting at 0.
train_df, test_df = (
    pd.read_csv(data_path, header=None)
    for data_path in ("KDDTrain+.txt", "KDDTest+.txt")
)

# Quick sanity check: dimensions of each split, then a peek at the first rows
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

train_df.head()

Train shape: (125973, 43)
Test shape: (22544, 43)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [4]:
# Dropping the last column (difficulty) from train and test.
#
# The frames were read with header=None and have 43 columns, so the difficulty
# column carries the integer label 42. It is removed by label (not by position)
# and with errors="ignore", which makes this cell idempotent: running it a
# second time is a no-op instead of silently slicing off the class-label
# column as a positional `iloc[:, :-1]` would.
DIFFICULTY_COL = 42

train_df = train_df.drop(columns=[DIFFICULTY_COL], errors="ignore")
test_df = test_df.drop(columns=[DIFFICULTY_COL], errors="ignore")

print("Train shape after dropping difficulty:", train_df.shape)
print("Test shape after dropping difficulty:", test_df.shape)

Train shape after dropping difficulty: (125973, 42)
Test shape after dropping difficulty: (22544, 42)


In [5]:
# Separate features (X) and labels (y)

def _split_features_label(frame):
    """Return (features, label): every column but the last, and the last column."""
    return frame.iloc[:, :-1], frame.iloc[:, -1]


# Same split applied to the training and the testing frame
X_train, y_train = _split_features_label(train_df)
X_test, y_test = _split_features_label(test_df)

# Report the dimensions of all four pieces
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_data.shape)

print("\nSample of y_train values:")
print(y_train.head())

X_train shape: (125973, 41)
y_train shape: (125973,)
X_test shape: (22544, 41)
y_test shape: (22544,)

Sample of y_train values:
0     normal
1     normal
2    neptune
3     normal
4     normal
Name: 41, dtype: object


In [6]:
# Converting labels to binary: 0 = normal, 1 = attack.
#
# A vectorised "not equal" comparison replaces the per-element Python lambda.
# The boolean result is cast to int64 so the values, Series name and dtype are
# the same as what the lambda-based version produced.
y_train_bin = y_train.ne("normal").astype("int64")
y_test_bin = y_test.ne("normal").astype("int64")

print("Binary labels created successfully!")
print("First 10 labels:", y_train_bin.head(10))

Binary labels created successfully!
First 10 labels: 0    0
1    0
2    1
3    0
4    0
5    1
6    1
7    1
8    1
9    1
Name: 41, dtype: int64


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Positions 1, 2 and 3 are the string-valued (categorical) feature columns
categorical_cols = [1, 2, 3]

# Every other feature column is handled as numeric
numeric_cols = list(
    filter(lambda col: col not in categorical_cols, X_train.columns)
)

# Column-wise preprocessing:
#   - numeric columns are standardised
#   - categorical columns are one-hot encoded; handle_unknown="ignore" keeps
#     transform from raising on categories that were not seen during fit
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [8]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

# Building the full pipeline: preprocessing + model.
#
# A Pipeline fits the step objects it is given in place, so passing the shared
# `preprocessor` directly would make every pipeline built from it point at the
# same transformer instance (refitting one would change the others).
# clone() gives this model its own unfitted copy with identical parameters.
log_reg_model = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("clf", LogisticRegression(max_iter=1000)),   # allow up to 1000 solver iterations
])

# Training (fitting) the model on the training features and the binary labels
log_reg_model.fit(X_train, y_train_bin)

print("Logistic Regression model training complete!")

Logistic Regression model training complete!


In [9]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the fitted logistic-regression pipeline on the test split
y_pred_log = log_reg_model.predict(X_test)

# Overall accuracy
log_reg_accuracy = accuracy_score(y_test_bin, y_pred_log)
print("Logistic Regression Accuracy:", log_reg_accuracy)

# Per-class precision, recall and F1-score
log_reg_report = classification_report(y_test_bin, y_pred_log)
print("\nClassification Report:")
print(log_reg_report)

# Confusion matrix of true vs. predicted labels
log_reg_confusion = confusion_matrix(y_test_bin, y_pred_log)
print("\nConfusion Matrix:")
print(log_reg_confusion)

Logistic Regression Accuracy: 0.7536816891412349

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.93      0.76      9711
           1       0.92      0.62      0.74     12833

    accuracy                           0.75     22544
   macro avg       0.78      0.77      0.75     22544
weighted avg       0.80      0.75      0.75     22544


Confusion Matrix:
[[8991  720]
 [4833 8000]]


In [10]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Building pipeline: preprocessing + Random Forest.
#
# The preprocessing step is a clone of the shared `preprocessor`: a Pipeline
# fits its steps in place, so without the clone this model would share (and
# refit) the very same transformer instance used by the other pipelines.
rf_model = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("clf", RandomForestClassifier(
        n_estimators=200,      # number of trees
        random_state=42,       # reproducibility
        n_jobs=-1              # use all CPU cores
    ))
])

# Training the model on the training features and the binary labels
rf_model.fit(X_train, y_train_bin)

print("Random Forest training complete!")

Random Forest training complete!


In [11]:
# Evaluate the fitted random-forest pipeline on the test split
y_pred_rf = rf_model.predict(X_test)

# Overall accuracy
rf_accuracy = accuracy_score(y_test_bin, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

# Heading and per-class report, one per line
print("\nClassification Report:", classification_report(y_test_bin, y_pred_rf), sep="\n")

# Heading and confusion matrix, one per line
print("\nConfusion Matrix:", confusion_matrix(y_test_bin, y_pred_rf), sep="\n")

Random Forest Accuracy: 0.7676543647977289

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.97      0.78      9711
           1       0.97      0.61      0.75     12833

    accuracy                           0.77     22544
   macro avg       0.81      0.79      0.77     22544
weighted avg       0.83      0.77      0.76     22544


Confusion Matrix:
[[9448  263]
 [4975 7858]]


In [12]:
!pip install xgboost



In [13]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
from sklearn.base import clone

# Pipeline: preprocessing + XGBoost classifier.
#
# The preprocessing step is a clone of the shared `preprocessor` (same
# parameters, unfitted). A Pipeline fits its steps in place, so reusing the
# original object would tie this model's transformer to the other pipelines.
xgb_model = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),   # same preprocessing configuration as before
    ("clf", XGBClassifier(
        n_estimators=300,                  # number of trees
        max_depth=6,                       # depth of each tree
        learning_rate=0.1,                 # how fast it learns
        subsample=0.8,                     # row sampling (for regularization)
        colsample_bytree=0.8,              # column sampling per tree
        objective="binary:logistic",       # binary classification
        n_jobs=-1,                         # use all CPU cores
        eval_metric="logloss",             # avoids warnings
        tree_method="hist"                 # fast histogram algorithm
    ))
])

In [15]:
# Fit the preprocessing + XGBoost pipeline on the training features and binary labels
xgb_model.fit(X_train, y=y_train_bin)
print("XGBoost training complete!")

XGBoost training complete!


In [16]:
# Evaluate the fitted XGBoost pipeline on the test split
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test_bin, y_pred_xgb))

# Print each remaining metric under its own heading
for heading, metric_fn in (
    ("\nClassification Report:", classification_report),
    ("\nConfusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric_fn(y_test_bin, y_pred_xgb))


XGBoost Accuracy: 0.792805180979418

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.97      0.80      9711
           1       0.97      0.66      0.78     12833

    accuracy                           0.79     22544
   macro avg       0.83      0.81      0.79     22544
weighted avg       0.85      0.79      0.79     22544


Confusion Matrix:
[[9446  265]
 [4406 8427]]
