# Preprocessing the UNSW-NB15 Dataset

This section preprocesses the UNSW-NB15 dataset so it is ready for model training. The steps are as follows:

## Steps

1. **Loading the Dataset**
   - Load the UNSW-NB15 dataset from a CSV file into a pandas DataFrame for further processing.

2. **Encoding Categorical Features**
   - Convert categorical features into numerical values using label encoding. This is necessary for machine learning algorithms that require numerical input.

3. **Scaling Numerical Features**
   - Standardize the numerical features by scaling them to have zero mean and unit variance. This helps in improving the performance of machine learning models.

4. **Splitting the Dataset into Training and Testing Sets**
   - Split the dataset into training and testing sets to evaluate the performance of machine learning models. Typically, 80% of the data is used for training and 20% for testing.


In [1]:
# Core data manipulation and analysis libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations and arrays

import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Enable inline plotting in Jupyter notebooks
# Fixed duplicate import and invalid syntax
%matplotlib inline




Load the cleaned dataset.


In [2]:
    
# Read the cleaned dataset from a single CSV file into a DataFrame.
# header=0 takes the column names from the first row of the file.
# NOTE(review): the absolute Windows path below is machine-specific -- adjust it before running elsewhere.
df = pd.read_csv('C:/Users/raman/OneDrive/Important/1UnisaSTUDY/Courses/Capstone_Project_1/Github/Code Working/Data Cleaning and EDA/Cleaned_full_data.csv', header=0) 

df.head()


Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0.0,3,7,1,3,1,1,1,Normal,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0.0,2,4,2,3,1,1,2,Normal,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0.0,12,8,1,2,2,1,1,Normal,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0.0,6,9,1,1,1,1,1,Normal,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0.0,7,9,1,1,1,1,1,Normal,0


In [3]:
# Check that the full dataset was loaded: (rows, columns)
df.shape

(2540047, 49)

**Encoding Categorical Features**
   - Convert categorical features into numerical values using label encoding (the default) or one-hot encoding. This is necessary for machine learning algorithms that require numerical input.

In [4]:
# Function to encode categorical features
# Label Encoding: Converts categories to numerical values (0,1,2...). Good for ordinal data but can imply ordering
# One-Hot Encoding: Creates binary columns for each category. Better for nominal data with no inherent order
def encode_categorical_features(df, categorical_features, encoding_type='label'):
    """
    Encode the categorical columns of ``df`` as numbers.

    Args:
        df: Input dataframe. With ``encoding_type='label'`` the listed columns
            are overwritten in place on this object; with ``'onehot'`` a new
            dataframe is built and the input is left untouched.
        categorical_features: List of categorical column names
        encoding_type: 'label' for LabelEncoder or 'onehot' for OneHotEncoder

    Returns:
        A ``(df, encoders)`` tuple: the encoded dataframe plus the fitted
        encoder(s) -- a dict mapping column name to its LabelEncoder for
        'label', or the single fitted OneHotEncoder for 'onehot'.

    Raises:
        ValueError: If ``encoding_type`` is neither 'label' nor 'onehot'.
    """
    # Reject unknown modes up front. Falling through would return None, which
    # only surfaces later as a confusing unpacking error in the caller.
    if encoding_type not in ('label', 'onehot'):
        raise ValueError(
            f"encoding_type must be 'label' or 'onehot', got {encoding_type!r}"
        )

    if encoding_type == 'label':
        label_encoders = {}
        for column in categorical_features:
            encoder = LabelEncoder()
            # astype(str) gives every value a string form, so columns holding
            # mixed types are encoded uniformly.
            df[column] = encoder.fit_transform(df[column].astype(str))
            label_encoders[column] = encoder
        return df, label_encoders

    # One-hot encoding. scikit-learn renamed `sparse` to `sparse_output` in 1.2
    # and removed the old keyword in 1.4, so try the new name first and fall
    # back to the old one for older installations.
    try:
        onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoded_array = onehot.fit_transform(df[categorical_features])

    # Create new column names for one-hot encoded features: "<feature>_<category>",
    # in the same order as the columns of encoded_array.
    new_columns = [
        f"{feature}_{cat}"
        for feature, categories in zip(categorical_features, onehot.categories_)
        for cat in categories
    ]

    # Create new dataframe with encoded features; reuse df.index so the
    # concat below aligns row by row.
    encoded_df = pd.DataFrame(encoded_array, columns=new_columns, index=df.index)

    # Drop original categorical columns and concat encoded ones
    df = df.drop(columns=categorical_features)
    df = pd.concat([df, encoded_df], axis=1)
    return df, onehot

**Scaling Numerical Features**
   - Standardize the numerical features by scaling them to have zero mean and unit variance. This helps in improving the performance of machine learning models.

In [5]:
# Function to scale numerical features
def scale_numerical_features(df, numerical_features):
    """
    Standardize the given columns of ``df`` with a freshly fitted StandardScaler.

    Args:
        df: Dataframe holding the columns to scale; those columns are
            overwritten on this object with their scaled values.
        numerical_features: List of column names to scale.

    Returns:
        A ``(df, scaler)`` tuple: the same dataframe and the fitted scaler.
    """
    standardizer = StandardScaler()
    # Fit and transform in one pass, then write the result back over the
    # original columns.
    scaled_values = standardizer.fit_transform(df[numerical_features])
    df[numerical_features] = scaled_values
    return df, standardizer

**Splitting the Dataset into Training and Testing Sets**
   - Use `train_test_split` from the `sklearn.model_selection` module to split the dataset into training and testing sets.

In [6]:
# Function to split the dataset into training and testing sets
def split_dataset(df, target_column, test_size=0.2, random_state=42):
    """
    Separate features from the target and split both into train and test parts.

    Args:
        df: Dataframe containing the feature columns and the target column.
        target_column: Name of the column to predict.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        A ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    features = df.drop(columns=[target_column])
    target = df[target_column]
    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    # train_test_split yields a list; hand back a 4-tuple.
    return tuple(parts)

In [7]:

# Main function to preprocess the dataset
def preprocess_unsw_nb15(df, target_column, categorical_features, numerical_features, encoding_type='label'):
    """
    Run the full preprocessing pipeline: encode, split, then scale.

    The scaler is fitted on the training split only and then applied to the
    test split, so no test-set statistics leak into training. The target
    column is never scaled, even if it is listed in ``numerical_features``.

    Args:
        df: Input dataframe containing features and the target column.
        target_column: Name of the column to predict.
        categorical_features: List of categorical column names to encode.
        numerical_features: List of numerical column names to standardize.
        encoding_type: 'label' or 'onehot', forwarded to
            ``encode_categorical_features``.

    Returns:
        A ``(X_train, X_test, y_train, y_test, encoders, scaler)`` tuple, where
        ``encoders`` is whatever ``encode_categorical_features`` returned and
        ``scaler`` is the scaler fitted on the training features.
    """
    df, encoders = encode_categorical_features(df, categorical_features, encoding_type)

    # Split before scaling so the scaler only ever sees training rows.
    X_train, X_test, y_train, y_test = split_dataset(df, target_column)

    # The target is not a feature: scaling it would turn class labels into
    # arbitrary floats, and it no longer exists in X after the split anyway.
    scale_columns = [col for col in numerical_features if col != target_column]

    # Work on explicit copies so the column assignments below do not write
    # into (or warn about) slices of the original dataframe.
    # NOTE: this temporarily holds an extra copy of the feature matrix.
    X_train = X_train.copy()
    X_test = X_test.copy()

    X_train, scaler = scale_numerical_features(X_train, scale_columns)
    # Reuse the training-set mean/variance for the test rows.
    X_test[scale_columns] = scaler.transform(X_test[scale_columns])

    return X_train, X_test, y_train, y_test, encoders, scaler

if __name__ == "__main__":
    # Assuming df is already loaded and categorical and numerical features are identified
    target_column = 'label'  # Update with the correct target column

    # Identify categorical and numerical columns.
    # The target is excluded from the numerical list: it is numeric in this
    # dataset, and including it would standardize the class labels themselves.
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    numerical_features = [
        column
        for column in df.select_dtypes(include=['number']).columns
        if column != target_column
    ]
    # NOTE(review): 'attack_cat' and the 'Unnamed: 0' index column stay in the
    # feature set; 'attack_cat' looks like it encodes the label directly --
    # confirm whether both should be dropped before training.

    # Preprocessing
    encoding_type = 'label'  # Change to 'onehot' for One-Hot Encoding
    X_train, X_test, y_train, y_test, encoders, scaler = preprocess_unsw_nb15(df, target_column, categorical_features, numerical_features, encoding_type)
    print("Preprocessing complete.")
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")

Preprocessing complete.
Training set size: (2032037, 48)
Test set size: (508010, 48)


In [8]:
# Show the first rows of each split: features first, then the matching target
for _heading, _features, _target in (
    ("\nTraining set head:", X_train, y_train),
    ("\nTest set head:", X_test, y_test),
):
    print(_heading)
    print(_features.head())
    print(_target.head())


Training set head:
         srcip  sport  dstip  dsport  proto  state       dur    sbytes  \
596843      38  49373     27   54769    114      5 -0.046008 -0.031869   
593275      42  30709     25   37620    114      5 -0.045501 -0.012864   
1254949     35  42244     26   35345    114      5 -0.046083 -0.028181   
191973      34  42875     21   51807    120      2 -0.047183 -0.067574   
2185322     35  38677     21   51652    114      5 -0.046969 -0.011694   

           dbytes      sttl  ...  is_ftp_login  ct_ftp_cmd  ct_srv_src  \
596843  -0.080670 -0.425902  ...     -0.132116   -0.170489   -0.203658   
593275   0.058514 -0.425902  ...     -0.132116   -0.170489   -0.203658   
1254949 -0.054673 -0.425902  ...     -0.132116         NaN   -0.665051   
191973  -0.224236 -0.425902  ...     -0.132116   -0.170489    0.534571   
2185322 -0.210878 -0.425902  ...     -0.132116         NaN   -0.019101   

         ct_srv_dst  ct_dst_ltm  ct_src_ltm  ct_src_dport_ltm  \
596843    -0.460981   -0.

In [9]:
# Save to CSV
#X_train.to_csv('X_train.csv', index=False)
#X_test.to_csv('X_test.csv', index=False)
#y_train.to_csv('y_train.csv', index=False)
#y_test.to_csv('y_test.csv', index=False)
#print("CSV files saved.")

In [None]:
# Save to pickle
output_folder = 'C:/Users/raman/OneDrive/Important/1UnisaSTUDY/Courses/Capstone_Project_1/Github/Code Working/Pickle'  # Change this to your desired folder

# Ensure the output folder exists
import os
os.makedirs(output_folder, exist_ok=True)

# Write each split to its own pickle file inside output_folder
for _filename, _split in (
    ('X_train.pkl', X_train),
    ('X_test.pkl', X_test),
    ('y_train.pkl', y_train),
    ('y_test.pkl', y_test),
):
    with open(os.path.join(output_folder, _filename), 'wb') as f:
        pickle.dump(_split, f)
print("Pickle files saved.")


Pickle files saved.
