In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Column names for the header-less data files, in file order.
# The last two entries ("label", "difficulty_level") are not input features:
# the notebook uses "label" as the target and drops "difficulty_level".
col_names = """
    duration protocol_type service flag src_bytes dst_bytes
    land wrong_fragment urgent hot num_failed_logins
    logged_in num_compromised root_shell su_attempted
    num_root num_file_creations num_shells num_access_files
    num_outbound_cmds is_host_login is_guest_login count
    srv_count serror_rate srv_serror_rate rerror_rate
    srv_rerror_rate same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count dst_host_same_srv_rate
    dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate
    dst_host_srv_rerror_rate label difficulty_level
""".split()

In [3]:
# Read the header-less training file and attach the column names.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that file exists; consider a configurable data directory.
train_path = r"C:\Users\sande\Downloads\ids\nsl_kdd\KDDTrain+.txt"
train_df = pd.read_csv(train_path, header=None).set_axis(col_names, axis="columns")


In [4]:
# Preview the first 100 rows of the training frame as this cell's output.
# NOTE(review): 100 rows is a large preview; the default .head() is usually enough.
train_df.head(100)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,tcp,smtp,SF,3065,331,0,0,0,0,...,0.48,0.02,0.00,0.02,0.01,0.02,0.00,0.00,normal,21
96,0,udp,other,SF,102,102,0,0,0,0,...,0.14,0.29,0.14,0.00,0.00,0.00,0.00,0.00,normal,21
97,0,tcp,http,SF,259,750,0,0,0,0,...,1.00,0.00,0.25,0.03,0.00,0.00,0.00,0.00,normal,21
98,1082,udp,other,SF,147,105,0,0,0,0,...,0.01,0.42,0.86,0.00,0.00,0.00,0.00,0.00,normal,21


In [5]:
# Read the header-less test file and attach the same column names as the
# training frame.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that file exists; consider a configurable data directory.
test_path = r"C:\Users\sande\Downloads\ids\nsl_kdd\KDDTest+.txt"
test_df = pd.read_csv(test_path, header=None).set_axis(col_names, axis="columns")

In [6]:
# Remove the 'difficulty_level' column from both frames before modelling.
# Re-running this cell raises KeyError because the column is already gone.
unused_col = 'difficulty_level'
train_df = train_df.drop(columns=unused_col)
test_df = test_df.drop(columns=unused_col)


In [7]:
def _binarize_labels(labels):
    """Collapse the label column to a binary target.

    Parameters
    ----------
    labels : pandas.Series
        Raw label values; 'normal' marks the negative class.

    Returns
    -------
    pandas.Series
        0 where the label equals 'normal', 1 for every other value (int64).
        If `labels` is already numeric it is returned unchanged, so running
        this cell a second time does not turn every label into 1.
    """
    if pd.api.types.is_numeric_dtype(labels):
        return labels
    # Vectorized equivalent of `0 if x == 'normal' else 1`.
    return labels.ne('normal').astype('int64')


train_df['label'] = _binarize_labels(train_df['label'])
test_df['label'] = _binarize_labels(test_df['label'])


In [8]:
# Encode the categorical columns on train + test together so both frames share
# one string -> integer mapping per column.
# Fix the split point now, before `train_df` is rebound at the bottom of the cell.
n_train = len(train_df)
combined_df = pd.concat([train_df, test_df], axis=0)

categorical_cols = ['protocol_type', 'service', 'flag']
for col in categorical_cols:
    # Already integer-coded means this cell has run before: leave the column
    # alone. Re-encoding would stringify the codes and renumber them in string
    # sort order (e.g. '10' before '2').
    if pd.api.types.is_integer_dtype(combined_df[col]):
        continue
    le = LabelEncoder()
    # Cast to str first so the encoder sees a single, sortable type.
    combined_df[col] = le.fit_transform(combined_df[col].astype(str))

# Split back by position: concat stacked the training rows first.
train_df = combined_df.iloc[:n_train].copy()
test_df = combined_df.iloc[n_train:].copy()


In [9]:
# Separate the feature columns from the 'label' target for each split.
target_col = 'label'
X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]

In [10]:
# Standardize the features. The scaler's statistics are learned from the
# training features only and then applied unchanged to the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Fit a logistic-regression classifier on the scaled training features;
# max_iter=1000 is the iteration cap handed to the solver.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_scaled, y=y_train)


In [12]:
# Predict a class label for every row of the scaled test features.
y_pred = model.predict(X_test_scaled)

In [15]:
# Score the test-set predictions against the true labels, printing each metric
# under its own heading.
conf_mat = confusion_matrix(y_test, y_pred)
print("\n📊 Confusion Matrix:\n", conf_mat)

class_report = classification_report(y_test, y_pred)
print("\n📋 Classification Report:\n", class_report)

test_accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Accuracy Score:", test_accuracy)


📊 Confusion Matrix:
 [[9070  641]
 [4908 7925]]

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.93      0.77      9711
           1       0.93      0.62      0.74     12833

    accuracy                           0.75     22544
   macro avg       0.79      0.78      0.75     22544
weighted avg       0.81      0.75      0.75     22544


✅ Accuracy Score: 0.7538591199432222


In [16]:
# Display (rows, columns) of the test frame.
test_df.shape

(22544, 42)

In [17]:
# Display (rows, columns) of the training frame.
train_df.shape

(125973, 42)