# Data Preprocessing

In [12]:
import pandas as pd
df_basic = pd.read_csv('https://drive.google.com/uc?id=1gy8_hvjKhrvEr4DYUnUCsOKikTtTL958')
df_labels = pd.read_csv('https://drive.google.com/uc?id=1hvb0tzApLLYnEOJKAXZ9KTCXvTc98Yc3')
df_additional = pd.read_csv('https://drive.google.com/uc?id=1bGbCYKW2z9RRe_707guVVfAq9bw1ZguA')
df_time = pd.read_csv('https://drive.google.com/uc?id=1gNOtvhLHChmMDh3C0vSG6o7AcztKfyhB')
df_content = pd.read_csv('https://drive.google.com/uc?id=1frhYPKIe25rEI32AcXE5xuCJLeB1dYHH')
df_flow = pd.read_csv('https://drive.google.com/uc?id=11sc-WCfuQccSSxU_M2DbP0skMCniP4Nq')

df_data = pd.read_csv('https://drive.google.com/uc?id=1VkdsFILm2KiHRVLF8KlrDZzR33LNRurp')

df_combined = pd.merge(df_basic, df_additional, on='id')
df_combined = pd.merge(df_combined, df_labels, on='id')
df_combined = pd.merge(df_combined, df_content, on='id')
df_combined = pd.merge(df_combined, df_time, on='id')

## Feature Scaling

using Z Score with StandardScaler library

In [13]:
%pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [14]:
from imblearn.pipeline import Pipeline  # Import from imbalanced-learn
from sklearn.ensemble import RandomForestClassifier  # Example estimator
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Feature Encoding

In [15]:


# Custom One-Hot Encoder for categorical features
class CustomOneHotEncoder(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        self.categorical_columns = X.select_dtypes(include=['object']).columns
        self.dummies = pd.get_dummies(X[self.categorical_columns])
        return self

    def transform(self, X):
        dummies = pd.get_dummies(X[self.categorical_columns])
        X = X.drop(self.categorical_columns, axis=1)
        return pd.concat([X, dummies], axis=1)

## Imbalanced Datasets

Using SMOTE library

In [16]:
from imblearn.over_sampling import SMOTE

## Dimensionality Reduction

Using PCA library

In [17]:
from sklearn.decomposition import PCA

## Compile Preprocessing Pipeline 

In [27]:

pipeline = Pipeline([
    # ('encode', CustomOneHotEncoder()),
    ('scaler', StandardScaler()),
    ('smote', SMOTE()),  # SMOTE for handling class imbalance
    # ('pca', PCA(n_components=8)),
    ('classifier', RandomForestClassifier())  # Example classifier
])

In [29]:
from sklearn.model_selection import train_test_split
df_combined = df_combined[['state', 'sbytes', 'sttl', 'service', 'swin', 'dwin', 'ct_srv_src', 'ct_srv_dst', 'attack_cat']].sample(frac=1)

from sklearn.preprocessing import LabelEncoder

categorical_columns = df_combined.select_dtypes(include=['object']).columns

label_encoder = LabelEncoder()
for col in categorical_columns:
    df_combined[col] = label_encoder.fit_transform(df_combined[col].astype(str))

X = df_combined.drop(columns=['attack_cat'])
label = df_combined['attack_cat']
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.2, shuffle=False)

pipeline.fit(X_train, y_train)

# X_train_transformed = pipeline.named_steps['encode'].transform(X_train)  # One-Hot Encoding
X_train_scaled = pipeline.named_steps['scaler'].transform(X_train)  # Scaling
X_train_resampled, y_train_resampled = pipeline.named_steps['smote'].fit_resample(X_train_scaled, y_train)  # SMOTE
# X_train_pca = pipeline.named_steps['pca'].transform(X_train_scaled)  

transformed_data = pd.DataFrame(X_train_resampled)
transformed_data['target'] = y_train_resampled

# Step 3: Save to a CSV
transformed_data.to_csv('transformed_data.csv', index=False)
transformed_data

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,-2.716031,-0.047410,-1.443385,-0.702171,-0.915863,-0.906808,-0.215480,-0.474149,6
1,-0.410019,0.004017,-1.443385,4.077029,1.091914,1.102781,-0.776221,-0.753169,6
2,0.742986,-0.050053,0.723094,0.166774,-0.915863,-0.906808,1.466746,1.478989,5
3,-0.410019,-0.032189,-1.443385,-0.702171,1.091914,1.102781,-0.495850,-0.288136,6
4,0.742986,-0.050263,0.723094,-0.702171,-0.915863,-0.906808,-0.682764,-0.195129,4
...,...,...,...,...,...,...,...,...,...
447215,-0.410019,-0.043262,0.723094,1.470192,1.091914,1.102781,-0.682764,-0.660162,9
447216,-0.410019,-0.045948,0.723094,1.470192,1.091914,1.102781,-0.682764,-0.660162,9
447217,-0.410019,-0.043145,0.723094,1.470192,1.091914,1.102781,-0.682764,-0.753169,9
447218,0.742986,-0.049042,0.723094,-0.702171,-0.915863,-0.906808,-0.631665,-0.753169,9
