# Data Preprocessing Script for UEBA Risk Scoring
This script prepares the dataset for model training by handling missing values, encoding categorical columns, scaling numerical features, and saving the fitted preprocessing objects. The label encoders and the scaler are saved as .pkl files so that they can be reused consistently during model training and evaluation.

## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import os



## Define file paths

In [2]:

# Location of the raw training CSV and of the directory that receives the
# processed outputs (encoders, scaler, and CSV files written further below).
file_path = r'C:\Users\USER\UEBA_Project\risk_scoring\data\raw\train_data.csv'
processed_data_path = r'C:\Users\USER\UEBA_Project\risk_scoring\data\processed'
# Create the output directory up front; exist_ok=True avoids an error when it
# already exists, e.g. when this cell is run again.
os.makedirs(processed_data_path, exist_ok=True)


## Load the dataset

In [3]:

# Read the raw CSV into a DataFrame, decoding the file as ISO-8859-1 (Latin-1).
data = pd.read_csv(file_path, encoding='ISO-8859-1')

In [6]:
# Import necessary libraries (this cell repeats the earlier imports and path
# setup, so it does not depend on the cells above having been run)
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Define file paths
file_path = r'C:\Users\USER\UEBA_Project\risk_scoring\data\raw\train_data.csv'
processed_data_path = r'C:\Users\USER\UEBA_Project\risk_scoring\data\processed'
os.makedirs(processed_data_path, exist_ok=True)

# Load the dataset
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Convert the 'time' column to datetime format. errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
data['time'] = pd.to_datetime(data['time'], errors='coerce')

# Separate 'ret' as the target variable
target = data['ret']

# Fill missing values: placeholder strings for the categorical columns and the
# most frequent value for the numeric 'port' and 'vlan' columns.
data.fillna({
    'account': 'Unknown',
    'group': 'Unknown',
    'IP': '0.0.0.0',
    'url': 'unknown',
    'port': data['port'].mode()[0],
    'vlan': data['vlan'].mode()[0],
    'switchIP': '0.0.0.0'
}, inplace=True)

# Extract useful time-based features
time_features = ['hour', 'day_of_week', 'month']
data['hour'] = data['time'].dt.hour
data['day_of_week'] = data['time'].dt.dayofweek
data['month'] = data['time'].dt.month

# Rows whose timestamp was missing or coerced to NaT have NaN in the derived
# features. StandardScaler keeps NaN in its output, so without imputation the
# "preprocessed" files would still contain missing values. Use the most
# frequent value, the same strategy applied to 'port' and 'vlan' above.
for feature in time_features:
    most_common = data[feature].mode()
    # mode() is empty only when no timestamp could be parsed at all; in that
    # case there is nothing to impute from and the column is left as is.
    if not most_common.empty:
        data[feature] = data[feature].fillna(most_common.iloc[0])

# Drop the original 'time' and 'ret' columns
data.drop(columns=['time', 'ret'], inplace=True)

# NOTE(review): columns not listed in either group below (the recorded output
# shows an 'id' column) are written out unchanged -- confirm that a row
# identifier is really meant to be part of the feature set.

# Initialize LabelEncoders for categorical columns
label_encoders = {}
categorical_columns = ['account', 'group', 'IP', 'url', 'switchIP']

for col in categorical_columns:
    # A column that held numbers and then received a string placeholder from
    # fillna is a mixed str/number object column, which LabelEncoder rejects.
    # Casting object columns to str makes the values uniform; columns that are
    # already pure strings are unaffected.
    if data[col].dtype == object:
        data[col] = data[col].astype(str)
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
    # Save each encoder for future use
    joblib.dump(le, os.path.join(processed_data_path, f'{col}_encoder.pkl'))

# Standardize numerical columns ('ret' was dropped above and is not scaled)
numerical_columns = ['port', 'vlan', 'hour', 'day_of_week', 'month']
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Save the scaler for future use
joblib.dump(scaler, os.path.join(processed_data_path, 'scaler.pkl'))

# Combine processed features and target for model training
processed_data = data.copy()
processed_data['ret'] = target  # Add 'ret' as the target column

# Save three files: features with target, features only, and target only
processed_data.to_csv(os.path.join(processed_data_path, 'preprocessed_data_with_target.csv'), index=False)
data.to_csv(os.path.join(processed_data_path, 'preprocessed_features.csv'), index=False)
target.to_csv(os.path.join(processed_data_path, 'target.csv'), index=False, header=['ret'])

# Display the first few rows of the processed dataset
print("First 5 rows of the preprocessed dataset with target:")
print(processed_data.head())


First 5 rows of the preprocessed dataset with target:
   id  account  group   IP   url      port      vlan  switchIP      hour  \
0   1      113      1   18   216 -1.106597 -1.030881        44  0.156337   
1   2      113      1  101   157 -1.472159 -1.030881        91  0.156337   
2   3      113      1   81   373  0.036209 -1.030881        44 -0.078513   
3   4      113      1   39  1135 -0.815599 -1.030881       102  0.391186   
4   5      113      1   77    57  1.092160 -1.030881        92 -0.078513   

   day_of_week     month     ret  
0    -0.497079  0.997434  0.1149  
1    -1.498123  0.997434  0.1801  
2    -0.997601  0.997434  0.3690  
3    -0.497079 -0.999262  0.1532  
4     1.505010  0.997434  0.1449  
