In [None]:
# %pip install pandas
# %pip install numpy matplotlib

In [None]:
# %pip install seaborn

In [None]:
# Column labels, in file order, for the CSV files that are read with header=None.
# The list is assembled from shorter runs so each stretch is easy to scan;
# the final two entries are the label column and the 'difficulty' column.
column_names = (
    [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent',
    ]
    + [
        'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    ]
    + [
        'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
        'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    ]
    # Every name in this run shares the 'dst_host_' prefix.
    + [
        f'dst_host_{suffix}'
        for suffix in (
            'count', 'srv_count', 'same_srv_rate', 'diff_srv_rate',
            'same_src_port_rate', 'srv_diff_host_rate', 'serror_rate',
            'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
        )
    ]
    + ['class', 'difficulty']
)

In [None]:
import os
from pathlib import Path

import pandas as pd

# Folder that holds KDDTrain+.txt and KDDTest+.txt.
# Set the NSL_KDD_DIR environment variable to point somewhere else; when it is
# not set, the original hard-coded location is used so existing runs still work.
DATA_DIR = Path(os.environ.get("NSL_KDD_DIR", r"C:\Users\yasha\Downloads\nsl-kdd"))


def load_nsl_kdd(path):
    """Read one header-less data file and return it without the 'difficulty' column.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the comma-separated file to read.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``column_names`` except ``'difficulty'``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not point to an existing file.
    """
    path = Path(path)
    if not path.is_file():
        raise FileNotFoundError(
            f"Data file not found: {path} (set NSL_KDD_DIR to the folder containing it)"
        )
    frame = pd.read_csv(path, header=None, names=column_names)
    # Return a new frame instead of mutating in place, so the cell is safe to re-run.
    return frame.drop(columns=['difficulty'])


# Load the training and testing data
train_df = load_nsl_kdd(DATA_DIR / "KDDTrain+.txt")
test_df = load_nsl_kdd(DATA_DIR / "KDDTest+.txt")

# Print basic info
print("Training Data")
print(f"Shape: {train_df.shape}")
print(train_df.head())

print("\n--- Testing Data ---")
print(f"Shape: {test_df.shape}")
print(test_df.head())


In [None]:
# Per-column dtype and non-null count summary of the training frame.
train_df.info()

In [None]:
# Number of missing values in each training column.
train_df.isnull().sum()

In [None]:
# Display the raw 'class' label column of the training frame.
train_df['class']

In [None]:
# Display 'protocol_type' next to the 'class' label for each training row.
train_df[['protocol_type','class']]

In [None]:
# Number of training rows for each distinct 'class' value.
train_df['class'].value_counts()

In [None]:
def to_attack_binary(labels):
    """Turn a Series of class labels into a 0/1 target.

    Parameters
    ----------
    labels : pandas.Series
        Class label per row.

    Returns
    -------
    pandas.Series
        int64 Series with 0 where the label equals 'normal' and 1 otherwise.
    """
    # Vectorized comparison instead of a per-row Python lambda; the explicit
    # 'int64' keeps the dtype identical on every platform.
    return labels.ne('normal').astype('int64')


# Create a binary classification target (same rule for both splits)
train_df['attack_binary'] = to_attack_binary(train_df['class'])
test_df['attack_binary'] = to_attack_binary(test_df['class'])


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many training rows fall in each binary target value.
fig, ax = plt.subplots(figsize=(6, 4))

# Passing `palette` without `hue` is deprecated in seaborn 0.13 (removal planned
# for 0.14). Assigning the x variable to `hue` gives the same per-bar colours;
# dodge=False keeps one full-width bar per category.
sns.countplot(
    x='attack_binary',
    hue='attack_binary',
    data=train_df,
    palette='Set1',
    dodge=False,
    ax=ax,
)

# The hue legend only repeats the x tick labels, so drop it when one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title('Binary Attack Distribution')
ax.set_xlabel('0 = Normal, 1 = Attack')
ax.set_ylabel('Count')
plt.show()

# Share of each target value in the training frame.
print(train_df['attack_binary'].value_counts(normalize=True))


In [None]:
# Row counts per protocol, split by the binary target.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=train_df,
    x='protocol_type',
    hue='attack_binary',
    palette='Set2',
    ax=ax,
)
ax.set_title('Protocol Type vs Attack Binary')
plt.show()

# Same information as numbers: totals per protocol, then per (protocol, target) pair.
protocol_totals = train_df['protocol_type'].value_counts()
print(protocol_totals)

protocol_by_target = train_df.groupby(['protocol_type', 'attack_binary']).size()
print('\n', protocol_by_target)


In [None]:
# Row counts per connection flag, split by the binary target.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=train_df,
    x='flag',
    hue='attack_binary',
    palette='pastel',
    ax=ax,
)
ax.set_title('Connection Flag vs Attack Binary')
plt.show()

# Totals per flag value.
flag_totals = train_df['flag'].value_counts()
print(flag_totals)


In [None]:
# The 20 most frequent service values, most frequent first.
top_services = train_df['service'].value_counts().nlargest(20).index

# Keep only rows whose service is one of those 20.
in_top_services = train_df['service'].isin(top_services)
top_service_rows = train_df[in_top_services]

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=top_service_rows,
    y='service',
    order=top_services,
    hue='attack_binary',
    palette='Set3',
    ax=ax,
)
ax.set_title('Top 20 Services vs Attack Binary')
ax.set_xlabel('Count')
ax.set_ylabel('Service')
plt.show()


In [None]:
# Rows flagged as attacks, and their class labels ordered from most to least frequent.
attack_df = train_df[train_df['attack_binary'] == 1]
attack_order = attack_df['class'].value_counts().index

fig, ax = plt.subplots(figsize=(15, 8))

# Passing `palette` without `hue` is deprecated in seaborn 0.13 (removal planned
# for 0.14). The y variable is assigned to `hue` instead; `hue_order` matches
# `order` so the colour gradient still follows the bar order, and dodge=False
# keeps one full-height bar per class.
sns.countplot(
    y='class',
    hue='class',
    data=attack_df,
    order=attack_order,
    hue_order=attack_order,
    palette='viridis',
    dodge=False,
    ax=ax,
)

# The hue legend would only repeat the y tick labels, so drop it when one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Distribution of Different Attack Types", fontsize=16)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Attack Type', fontsize=12)
ax.set_xscale('log')  # log scale for better visibility
ax.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.show()

In [None]:
def _category_countplot(ax, column, palette, order=None):
    """Draw a count plot of ``train_df[column]`` on ``ax`` with one colour per bar.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    column : str
        Name of the categorical column to count.
    palette : str
        Seaborn/matplotlib palette name used for the bars.
    order : sequence, optional
        Category values to show, in display order. When omitted, seaborn
        chooses the order.
    """
    # Passing `palette` without `hue` is deprecated in seaborn 0.13 (removal
    # planned for 0.14). The x variable is assigned to `hue` instead, with the
    # same ordering, and dodge=False keeps one full-width bar per category.
    sns.countplot(
        ax=ax,
        x=column,
        hue=column,
        data=train_df,
        order=order,
        hue_order=order,
        palette=palette,
        dodge=False,
    )
    # The hue legend would only repeat the x tick labels, so drop it if present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()


# Create 3 side-by-side subplots
fig, axes = plt.subplots(1, 3, figsize=(22, 6))

# Plot 1: Protocol Type Distribution
_category_countplot(axes[0], 'protocol_type', 'magma')
axes[0].set_title('Protocol Type Distribution')

# Plot 2: Top 10 Service Types
_category_countplot(
    axes[1],
    'service',
    'plasma',
    order=train_df['service'].value_counts().iloc[:10].index,
)
axes[1].set_title('Top 10 Service Types')
axes[1].tick_params(axis='x', rotation=45)

# Plot 3: Flag Distribution
_category_countplot(
    axes[2],
    'flag',
    'cividis',
    order=train_df['flag'].value_counts().index,
)
axes[2].set_title('Flag Distribution')

# Adjust layout and display
plt.tight_layout()
plt.show()


In [None]:
# Columns that carry label information and must not be used as features.
label_columns = ['class', 'attack_binary']

# Features (X) are everything except the label columns; the target (y) is the
# binary attack indicator.
X_train_raw = train_df.drop(columns=label_columns)
X_test_raw = test_df.drop(columns=label_columns)
y_train = train_df['attack_binary']
y_test = test_df['attack_binary']

# The three string-valued feature columns; every remaining feature is treated as numerical.
categorical_cols = ['protocol_type', 'service', 'flag']
feature_names = X_train_raw.columns
numerical_cols = feature_names.drop(categorical_cols)

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols.tolist())


In [None]:
import warnings


def one_hot_encode_aligned(train_features, test_features, columns):
    """One-hot encode ``columns`` so train and test get identical dummy columns.

    The category levels of each column are taken from the training frame only
    and applied to both frames as a categorical dtype before encoding. This
    guarantees that

    * both outputs have the same columns in the same order,
    * ``drop_first=True`` drops the same baseline level in both frames, and
    * a level that is absent from the test frame becomes an all-False/zero
      column with the same dtype as every other dummy column.

    A test value that never occurs in training has no column; such rows get
    all-zero dummies for that feature and a warning is emitted.

    Parameters
    ----------
    train_features, test_features : pandas.DataFrame
        Feature frames that both contain ``columns``. They are not modified.
    columns : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``.
    """
    train_typed = train_features.copy()
    test_typed = test_features.copy()

    for col in columns:
        # Sorted levels reproduce the column order get_dummies uses for plain strings.
        levels = sorted(train_typed[col].dropna().unique())
        unseen = set(test_typed[col].dropna().unique()) - set(levels)
        if unseen:
            warnings.warn(
                f"Column {col!r}: test values not seen in training are encoded "
                f"as all zeros: {sorted(unseen)}"
            )
        level_dtype = pd.CategoricalDtype(categories=levels)
        train_typed[col] = train_typed[col].astype(level_dtype)
        test_typed[col] = test_typed[col].astype(level_dtype)

    train_encoded = pd.get_dummies(train_typed, columns=columns, drop_first=True)
    test_encoded = pd.get_dummies(test_typed, columns=columns, drop_first=True)
    return train_encoded, test_encoded


# One-Hot Encode categorical features using the training levels for both sets
X_train_encoded, X_test_encoded = one_hot_encode_aligned(
    X_train_raw, X_test_raw, categorical_cols
)

# Both frames now share the same columns in the same order
train_cols = X_train_encoded.columns
test_cols = X_test_encoded.columns
assert train_cols.equals(test_cols), "train/test feature columns are not aligned"

print(f"Shape of training data after encoding: {X_train_encoded.shape}")
print(f"Shape of testing data after encoding: {X_test_encoded.shape}")


In [None]:
# %pip install scikit-learn

In [None]:
from sklearn.preprocessing import StandardScaler

# Only the original numerical columns are scaled; the one-hot columns are left as they are.
numerical_cols_to_scale = numerical_cols

# Work on copies so the encoded frames stay available unchanged.
X_train_scaled = X_train_encoded.copy()
X_test_scaled = X_test_encoded.copy()

# Fit a single scaler on the training data only and reuse it for the test
# data. Keeping the fitted `scaler` avoids a second identical fit and lets
# later cells apply the same transformation to new data.
scaler = StandardScaler()
X_train_scaled[numerical_cols_to_scale] = scaler.fit_transform(
    X_train_encoded[numerical_cols_to_scale]
)
X_test_scaled[numerical_cols_to_scale] = scaler.transform(
    X_test_encoded[numerical_cols_to_scale]
)

print("\n--- Scaled Training Data Head ---")
print(X_train_scaled.head())




--- Scaled Training Data Head ---
   duration  src_bytes  dst_bytes      land  wrong_fragment    urgent  \
0 -0.110249  -0.007679  -0.004919 -0.014089       -0.089486 -0.007736   
1 -0.110249  -0.007737  -0.004919 -0.014089       -0.089486 -0.007736   
2 -0.110249  -0.007762  -0.004919 -0.014089       -0.089486 -0.007736   
3 -0.110249  -0.007723  -0.002891 -0.014089       -0.089486 -0.007736   
4 -0.110249  -0.007728  -0.004814 -0.014089       -0.089486 -0.007736   

        hot  num_failed_logins  logged_in  num_compromised  ...  flag_REJ  \
0 -0.095076          -0.027023  -0.809262        -0.011664  ...     False   
1 -0.095076          -0.027023  -0.809262        -0.011664  ...     False   
2 -0.095076          -0.027023  -0.809262        -0.011664  ...     False   
3 -0.095076          -0.027023   1.235694        -0.011664  ...     False   
4 -0.095076          -0.027023   1.235694        -0.011664  ...     False   

   flag_RSTO  flag_RSTOS0  flag_RSTR  flag_S0  flag_S1  flag_S2