In [None]:
#importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

In [None]:
# Read the training CSV into `df` and preview its first five rows
train_csv_path = "/content/drive/MyDrive/Datasets/network attack/Train_Data.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,duration,protocoltype,service,flag,srcbytes,dstbytes,land,wrongfragment,urgent,hot,...,dsthostsamesrvrate,dsthostdiffsrvrate,dsthostsamesrcportrate,dsthostsrvdiffhostrate,dsthostserrorrate,dsthostsrvserrorrate,dsthostrerrorrate,dsthostsrvrerrorrate,lastflag,attack
0,0,tcp,netbios_dgm,REJ,0,0,0,0,0,0,...,0.06,0.06,0.0,0.0,0.0,0.0,1.0,1.0,21,1
1,0,tcp,smtp,SF,1239,400,0,0,0,0,...,0.45,0.04,0.0,0.0,0.11,0.0,0.02,0.0,18,0
2,0,tcp,http,SF,222,945,0,0,0,0,...,1.0,0.0,0.02,0.03,0.0,0.0,0.0,0.0,21,0
3,0,tcp,http,SF,235,1380,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,0
4,0,tcp,uucp_path,REJ,0,0,0,0,0,0,...,0.01,0.08,0.0,0.0,0.0,0.0,1.0,1.0,19,1


In [None]:
# List every column label of the training frame (features plus the 'attack' target)
df.columns

Index(['duration', 'protocoltype', 'service', 'flag', 'srcbytes', 'dstbytes',
       'land', 'wrongfragment', 'urgent', 'hot', 'numfailedlogins', 'loggedin',
       'numcompromised', 'rootshell', 'suattempted', 'numroot',
       'numfilecreations', 'numshells', 'numaccessfiles', 'numoutboundcmds',
       'ishostlogin', 'isguestlogin', 'count', 'srvcount', 'serrorrate',
       'srvserrorrate', 'rerrorrate', 'srvrerrorrate', 'samesrvrate',
       'diffsrvrate', 'srvdiffhostrate', 'dsthostcount', 'dsthostsrvcount',
       'dsthostsamesrvrate', 'dsthostdiffsrvrate', 'dsthostsamesrcportrate',
       'dsthostsrvdiffhostrate', 'dsthostserrorrate', 'dsthostsrvserrorrate',
       'dsthostrerrorrate', 'dsthostsrvrerrorrate', 'lastflag', 'attack'],
      dtype='object')

In [None]:
# Count missing values per column (isnull is pandas' alias of isna)
df.isnull().sum()

duration                  0
protocoltype              0
service                   0
flag                      0
srcbytes                  0
dstbytes                  0
land                      0
wrongfragment             0
urgent                    0
hot                       0
numfailedlogins           0
loggedin                  0
numcompromised            0
rootshell                 0
suattempted               0
numroot                   0
numfilecreations          0
numshells                 0
numaccessfiles            0
numoutboundcmds           0
ishostlogin               0
isguestlogin              0
count                     0
srvcount                  0
serrorrate                0
srvserrorrate             0
rerrorrate                0
srvrerrorrate             0
samesrvrate               0
diffsrvrate               0
srvdiffhostrate           0
dsthostcount              0
dsthostsrvcount           0
dsthostsamesrvrate        0
dsthostdiffsrvrate        0
dsthostsamesrcportra

In [None]:
# Target is the 'attack' column; every other column is used as a feature.
# NOTE(review): 'lastflag' stays in X as a feature. Given the near-perfect F1
# scores reported further down, confirm against the dataset description that it
# does not leak label information.
y = df.loc[:, 'attack']
X = df.drop('attack', axis=1)

In [None]:
# Columns that get one-hot encoded
categorical_cols = [
    'protocoltype',
    'flag',
    'service',
]
# Every remaining feature column is treated as numeric.
# Index.difference returns the labels in sorted order, which is the order the
# scaled columns appear in after preprocessing.
numerical_cols = X.columns.difference(other=categorical_cols)

In [None]:
# Preprocessing for numerical and categorical data:
#   - 'num': numeric columns are standardised with StandardScaler
#   - 'cat': categorical columns are one-hot encoded
# handle_unknown='ignore' makes the encoder emit all zeros for a category it did
# not see during fit instead of raising, so a value that only shows up in the
# validation split or in the test file cannot crash transform()/predict().
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)])

In [None]:
# Fit the preprocessor on the full feature frame and keep the transformed matrix
X_transformed = preprocessor.fit_transform(X)

# Rebuild the output column labels: the numeric names come first (same order as
# given to the 'num' transformer), followed by the names the fitted 'cat'
# encoder generates for its one-hot columns
fitted_encoder = preprocessor.named_transformers_['cat']
cat_feature_names = fitted_encoder.get_feature_names_out(categorical_cols)
num_feature_names = list(numerical_cols)
all_feature_names = [*num_feature_names, *cat_feature_names.tolist()]

In [None]:
# Convert the transformed data back into a labelled DataFrame.
# ColumnTransformer can return a SciPy sparse matrix when the stacked output is
# sparse enough (see its `sparse_threshold` option), so densify first when
# needed; that way the frame always gets one column per feature name.
X_dense = X_transformed.toarray() if hasattr(X_transformed, "toarray") else X_transformed
X_transformed_df = pd.DataFrame(X_dense, columns=all_feature_names)

# Display the transformed data: as the last expression of the cell it is
# rendered with the notebook's rich table display instead of plain text
X_transformed_df.head()

      count  diffsrvrate  dstbytes  dsthostcount  dsthostdiffsrvrate  \
0  2.013053     0.120888 -0.049289      0.719026            0.094988   
1 -0.801335    -0.370241 -0.042073      0.719026           -0.098581   
2 -0.597394    -0.370241 -0.032241     -1.344626           -0.485721   
3 -0.770744    -0.370241 -0.024393      0.719026           -0.485721   
4  1.798915     0.120888 -0.049289      0.719026            0.288558   

   dsthostrerrorrate  dsthostsamesrcportrate  dsthostsamesrvrate  \
0            3.20975               -0.363793           -1.042709   
1           -0.25601               -0.363793           -0.170910   
2           -0.32674               -0.267818            1.058551   
3           -0.32674               -0.363793            1.058551   
4            3.20975               -0.363793           -1.154478   

   dsthostserrorrate  dsthostsrvcount  ...  service_telnet  service_tftp_u  \
0          -0.699276        -0.961269  ...             0.0             0.0   
1 

In [None]:
# Hold out 20% of the rows for validation (fixed seed; the split is not stratified)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define the model pipeline: preprocessing followed by logistic regression.
# LogisticRegression comes from sklearn.linear_model and has to be imported in
# the imports cell; without that import this cell raises NameError on a fresh
# kernel (Restart & Run All).
model = Pipeline(steps=[
    # scales the numeric columns and one-hot encodes the categorical ones
    ('preprocessor', preprocessor),
    # fixed seed for reproducibility; iteration cap set explicitly to 1000
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
])

In [None]:
# Fit the whole pipeline (preprocessing + classifier) on the training split
model.fit(X=X_train, y=y_train)

In [None]:
# Score both splits with the fitted pipeline: training first, then validation
y_train_pred, y_val_pred = (
    model.predict(split) for split in (X_train, X_val)
)

In [None]:
# Weighted F1 for each split, computed as (true labels, predicted labels) pairs:
# training first, validation second
f1_train, f1_val = (
    f1_score(actual, predicted, average='weighted')
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

# Report both scores
print(f'Training F1 Score: {f1_train}')
print(f'Validation F1 Score: {f1_val}')

Training F1 Score: 0.9999856064875541
Validation F1 Score: 0.9999424270024083


In [None]:
# Flag overfitting by comparing training and validation F1.
# Overfitting shows up as the training score being clearly HIGHER than the
# validation score. The previous condition (f1_val > f1_train) tested the
# opposite direction, so it reported "generalizes well" no matter how far the
# training score exceeded the validation score.
OVERFIT_TOLERANCE = 0.01  # largest train-minus-validation F1 gap still treated as noise
f1_gap = f1_train - f1_val

if f1_gap > OVERFIT_TOLERANCE:
  print("Model might be overfitting")
else:
  print("Model generalizes well")

Model generalizes well


In [None]:
# Read the test CSV
test_data = pd.read_csv('/content/drive/MyDrive/Datasets/network attack/Test_Data.csv')

# Run the fitted pipeline on the test rows to get the predicted 'attack' labels
y_test_pred = model.predict(test_data)

# Wrap the predictions in a single-column frame named after the target
test_predictions = pd.DataFrame(data=y_test_pred, columns=['attack'])

# Write the predictions out without the row index
test_predictions.to_csv(
    '/content/drive/MyDrive/Datasets/network attack/Test_Predictions.csv',
    index=False,
)

print('Predictions saved to Test_Predictions.csv')

Predictions saved to Test_Predictions.csv
