# RL for imbalanced 10-class classification

## Imports and data fetch

In [None]:
import os
import pandas as pd
import numpy as np
import random
import time
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import gym

from scipy.stats import zscore

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.callbacks import BaseCallback

from imblearn.over_sampling import SMOTE

# Ask TensorFlow for the name of a visible GPU device. As the last expression
# of the cell, its return value is what the notebook shows as cell output.
tf.test.gpu_device_name()

In [None]:
# Download the two published partitions of the dataset directly from GitHub.
train = pd.read_csv(
    'https://raw.githubusercontent.com/Nir-J/ML-Projects/master/UNSW-Network_Packet_Classification/UNSW_NB15_training-set.csv'
)
test = pd.read_csv(
    'https://raw.githubusercontent.com/Nir-J/ML-Projects/master/UNSW-Network_Packet_Classification/UNSW_NB15_testing-set.csv'
)

# Stack both partitions into a single frame (it is re-split further down) and
# discard the `id` and `label` columns; `attack_cat` remains as the target.
df = pd.concat([train, test]).drop(columns=['id', 'label'])

## Data preprocessing

In [None]:
NOMINAL_COLS = ['proto', 'service', 'state']
BINARY_COLS = ['is_sm_ips_ports', 'is_ftp_login']

# Columns that are left out of standardisation: the target, the nominal
# features (one-hot encoded below) and the binary flags.
_not_standardised = {'attack_cat', *NOMINAL_COLS, *BINARY_COLS}

# Z-score every remaining column in place.
for name in df.columns:
    if name not in _not_standardised:
        df[name] = zscore(df[name])

# Replace each nominal column by its one-hot expansion, appended at the end
# of the frame with the original column name as prefix.
for name in NOMINAL_COLS:
    one_hot = pd.get_dummies(df[name], prefix=name)
    df = pd.concat([df.drop(columns=name), one_hot], axis=1)

## Reward bias calculation for RL

In [None]:
# Relative frequency of every attack category, ordered by category label.
props_pc = (
    df['attack_cat']
    .value_counts(normalize=True)
    .to_frame()
    .sort_index()
)
display(props_pc)

In [None]:
# Mean class share. The single column of `props_pc` is selected by position
# because its label depends on the pandas version ('attack_cat' before 2.0,
# 'proportion' from 2.0 on), so a lookup by name is not portable.
mean_proportion = props_pc.iloc[:, 0].mean()
# Shown explicitly: a bare expression that is not the last line of a cell
# produces no output.
display(mean_proportion)

# Class indices (in sorted category order) treated as minority classes.
# NOTE(review): presumably the classes whose share is below the mean above -
# confirm before changing the dataset.
MINORITY_CLASSES = [0, 1, 2, 4, 7, 8, 9]

In [None]:
# Number of samples per attack category; groupby returns the categories sorted.
att_nums = df.groupby('attack_cat')['attack_cat'].count()

# Despite the name, this array holds the raw per-class counts, not fractions.
proportions = att_nums.to_numpy()
display(proportions)

In [None]:
# Reward magnitude per class: how many times smaller the class is than the
# largest one, truncated to an integer (the largest class therefore gets 1).
largest_class_size = proportions.max()
rw_factor = (largest_class_size / proportions).astype(int)
display(rw_factor)

## Data normalization, split, optional SMOTE, shuffle

In [None]:
# Feature matrix: every column except the target.
x = df.drop(columns=['attack_cat']).values

# Rescale every feature to [0, 1], the range declared by the RL environment's
# observation space.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so statistics of the test rows influence the scaling - consider fitting on
# the training split only.
min_max_scaler = preprocessing.MinMaxScaler()
x = min_max_scaler.fit_transform(x)

# One-hot encoded target; the column order defines the class indices.
dummies = pd.get_dummies(df['attack_cat'])
attack_cat_list = dummies.columns.tolist()
y = dummies.values

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# sm = SMOTE(random_state=42)
# x_train, y_train = sm.fit_resample(x_train, y_train)

In [None]:
# Shuffle the training features and labels together (same permutation for
# both) with a fixed seed, so the environment does not serve rows in split order.
x_train, y_train = shuffle(x_train, y_train, random_state=42)

## Benchmark functions

In [None]:
# Result table shared by all runs: one row per model/run name.
models_benchmark = pd.DataFrame(columns=['Accuracy', 'Recall', 'Precision', 'F1-score'])


def getMetrics(model_name, y_truth, y_pred, average):
    """Score a set of predictions and record them in ``models_benchmark``.

    Args:
        model_name: Row label under which the scores are stored; an existing
            row with the same label is overwritten.
        y_truth: Ground-truth class labels.
        y_pred: Predicted class labels.
        average: Averaging mode forwarded to recall, precision and F1.

    Returns:
        None. The scores are written to ``models_benchmark`` and displayed.
    """
    print(f'Metrics {average}-averaged')

    # Undefined ratios (no predicted or no true samples) count as 0.
    averaging = dict(average=average, zero_division=0)
    scores = [
        metrics.accuracy_score(y_truth, y_pred),
        metrics.recall_score(y_truth, y_pred, **averaging),
        metrics.precision_score(y_truth, y_pred, **averaging),
        metrics.f1_score(y_truth, y_pred, **averaging),
    ]

    models_benchmark.loc[model_name] = scores
    display(models_benchmark.loc[model_name])

def getCM(model_name, y_test, y_pred):
    """Plot a row-normalised confusion matrix as a heatmap.

    Args:
        model_name: Text used as the figure title.
        y_test: True class indices (positions in ``attack_cat_list``).
        y_pred: Predicted class indices (positions in ``attack_cat_list``).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Pin the label set to every known class so the matrix always has one
    # row/column per entry of attack_cat_list; otherwise a class missing from
    # both inputs would shrink the matrix and misalign the tick labels.
    class_indices = np.arange(len(attack_cat_list))
    cm = metrics.confusion_matrix(y_test, y_pred, labels=class_indices)

    # Normalise each row by its number of true samples. Rows without any true
    # sample are left at 0 instead of producing NaN from a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(10,10))
    plt.title(f'{model_name}', fontsize=20)

    sns.heatmap(cm_normalized, annot=True, fmt='.2f', xticklabels=attack_cat_list, yticklabels=attack_cat_list)

    plt.xticks(rotation=45)
    plt.ylabel('Real outcome', fontsize=16)
    plt.xlabel('Predicted outcome', fontsize=16)
    plt.ioff()
    plt.show()

## Environment for RL

In [None]:
class CustomEnv(gym.Env):
    """Supervised classification exposed as a gym environment.

    Every step serves one dataset row as the observation and interprets the
    agent's action as the predicted class index. A correct prediction earns
    ``+rw_factor[true_class]``, a wrong one ``-rw_factor[true_class]``. Rows
    are served sequentially and wrap around at the end of the dataset; an
    episode ends after ``max_steps`` steps.

    Args:
        dataset: ``(x, y)`` pair; ``x`` holds one feature row per sample and
            ``y`` the matching one-hot labels.
        rw_factor: Per-class reward magnitude, indexed by class index.
        minority_classes: Class indices regarded as minorities. Stored on the
            instance but not used by the environment logic itself.
    """

    def __init__(self, dataset=(x_train, y_train), rw_factor=rw_factor, minority_classes=MINORITY_CLASSES):
        super().__init__()

        self.x, self.y = dataset
        self.rw_factor = rw_factor
        self.minority_classes = minority_classes

        # One action per class, one observation component per feature.
        self.action_space = gym.spaces.Discrete(self.y.shape[1])
        self.observation_space = gym.spaces.Box(low=0, high=1,
                                                shape=(self.x.shape[1], ),
                                                dtype=np.float32)

        # Dataset cursor; it is deliberately not rewound by reset().
        self.idx = 0
        # Label of the row at the cursor, so step() is well defined even if
        # it is called before the first reset().
        self.expected_action = int(np.argmax(self.y[self.idx]))

        self.step_counter = 0
        # Episode length, kept as an int because it is compared to a counter.
        self.max_steps = 20_000

    def step(self, action):
        """Score ``action`` against the current row and advance to the next.

        Returns:
            ``(obs, reward, done, info)`` where ``obs`` is the next row,
            ``reward`` is a float and ``info`` is an empty dict.
        """
        # Reward management: magnitude depends on the true class only, the
        # sign on whether the prediction was right.
        magnitude = float(self.rw_factor[self.expected_action])
        reward = magnitude if int(action) == self.expected_action else -magnitude

        # Observation management: move the cursor, wrapping at the end.
        self.idx += 1
        if self.idx >= self.x.shape[0]:
            self.idx = 0

        obs = self.seq_observation()

        # Done management
        self.step_counter += 1
        done = self.step_counter >= self.max_steps

        return obs, reward, done, {}

    def reset(self):
        """Start a new episode and return the observation at the cursor.

        Only the step counter is reset; the dataset cursor keeps its position
        so consecutive episodes continue through the data.
        """
        self.step_counter = 0
        return self.seq_observation()

    def seq_observation(self):
        """Return the row at the cursor and remember its true class index."""
        self.expected_action = int(np.argmax(self.y[self.idx]))
        # Cast to the dtype declared by the observation space so the returned
        # observation is a valid member of it.
        return np.asarray(self.x[self.idx], dtype=self.observation_space.dtype)

## Callback for best model auto-save

In [None]:
class SaveBest(BaseCallback):
    """Callback that saves the model whenever the mean episode reward improves.

    Every ``check_freq`` calls, the monitor results found in ``log_dir`` are
    loaded and the mean reward of the last 100 entries is compared with the
    best value seen so far; on improvement the model is saved.

    Args:
        check_freq: Number of callback calls between two checks.
        log_dir: Directory the monitor results are read from; the model is
            saved under ``<log_dir>/best_model``.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq: int, log_dir: str, verbose: int = 0):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        """Create the save location if a save path is configured."""
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Check for a new best mean reward; always returns True."""
        # Only inspect the logs every `check_freq` calls.
        if self.n_calls % self.check_freq != 0:
            return True

        timesteps, episode_rewards = ts2xy(load_results(self.log_dir), 'timesteps')
        if not len(timesteps) > 0:
            return True

        mean_reward = np.mean(episode_rewards[-100:])

        # Written as `not a > b` (rather than `a <= b`) so a NaN mean is
        # treated as "no improvement".
        if not mean_reward > self.best_mean_reward:
            return True

        self.best_mean_reward = mean_reward

        if self.verbose >= 0:
            print(f"Num timesteps: {self.num_timesteps}")
            print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")
            print(f"Saving new best model to {self.save_path}")

        self.model.save(self.save_path)
        return True

In [None]:
# Directory shared by the Monitor wrapper and the SaveBest callback.
log_dir = './tmp_log/'
os.makedirs(log_dir, exist_ok=True)

# Training environment (built with its default arguments) wrapped in a
# Monitor that is given log_dir as its log location.
train_env = Monitor(CustomEnv(), log_dir)
train_env.reset()

print(x_train.shape)

## Model creation / load

In [None]:
# Choose between PPO or A2C, since SB3 only provides vanilla DQN

ALGO = 'PPO'

# Resolve the algorithm class from ALGO so the model that is trained always
# matches the name used for the TensorBoard directory and the log names.
ALGO_CLASSES = {'PPO': PPO, 'A2C': A2C}

# SaveBest declares check_freq as an int, so pass 10_000 rather than the
# float literal 10e3.
callback = SaveBest(check_freq=10_000, log_dir=log_dir)
model = ALGO_CLASSES[ALGO](policy='MlpPolicy', env=train_env, tensorboard_log=f'./{ALGO}')

In [None]:
# model = PPO.load(path=f'./models/{ALGO}.zip', env=train_env)

In [None]:
# model.save(f'./models/{ALGO}')

## Model training / save

In [None]:
# Min 5M timesteps for relevant results

EPOCHS = 20
# One "epoch" corresponds to as many environment steps as there are training rows.
TIMESTEPS = x_train.shape[0]

# The ground-truth class indices do not change between runs; decode the
# one-hot labels once instead of on every iteration.
y_test_true = np.argmax(y_test, axis=1)
y_train_true = np.argmax(y_train, axis=1)

for i in range(1, EPOCHS+1):
    # reset_num_timesteps=False keeps a single running timestep counter
    # across the successive learn() calls.
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name=f'{ALGO}_TB_LOG', callback=callback)

    # Evaluate with the greedy action: by default predict() samples from the
    # policy distribution, which makes the reported metrics noisy and
    # non-reproducible.
    y_pred = model.predict(x_test, deterministic=True)[0]
    y_train_pred = model.predict(x_train, deterministic=True)[0]

    getMetrics(f'RUN{i} - Test', y_test_true, y_pred, 'macro')
    getMetrics(f'RUN{i} - Train', y_train_true, y_train_pred, 'macro')
    print('--------------------')

## Compute metrics and CM for the current model

In [None]:
# Final report on the held-out test split, using the predictions of the last
# training run. Each call gets its own row label: storing these test scores
# under 'RUN{i} - Train' would overwrite the training-set row written by the
# training loop, and the second call would overwrite the first.
y_test_true = np.argmax(y_test, axis=1)

getMetrics(f'RUN{i} - Test (macro)', y_test_true, y_pred, 'macro')
print('----- ----- -----')
getMetrics(f'RUN{i} - Test (weighted)', y_test_true, y_pred, 'weighted')
getCM(f'{ALGO}_T', y_test_true, y_pred)