In [None]:
!pip install transformers datasets torch optuna

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (f

In [None]:
!pip install transformers[torch] accelerate -U
!pip install optuna

Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting transformers[torch]
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.44.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, accelerate
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.32.1
    Unin

In [None]:
!pip install transformers datasets torch



In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DistilBertForSequenceClassification
from datasets import Dataset, DatasetDict
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F

In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime -- confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset
# The CSV is read without a header row, so the column names are supplied here:
# 41 feature names in file order, followed by the class label.
# NOTE(review): if the file carries more fields per row than the 42 names given
# here, pandas will use the extra leading column(s) as the index -- confirm the
# CSV has exactly 42 columns.
col_names = """
    duration protocol_type service flag src_bytes
    dst_bytes land wrong_fragment urgent hot num_failed_logins
    logged_in num_compromised root_shell su_attempted num_root
    num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login count srv_count serror_rate
    srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate
    diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate
    dst_host_rerror_rate dst_host_srv_rerror_rate label
""".split()

# Location of the training CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/NSL_KDD_Train(2).csv'

# names= assigns the columns above; header=None makes the first row count as data.
df = pd.read_csv(file_path, names=col_names, header=None)

In [None]:
# Define label encoding
# Maps every raw label name to one of five integer class ids.
# 0 is the 'normal' class; ids 1-4 group the remaining names
# (presumably DoS, Probe, R2L and U2R attack families -- TODO confirm).
label_class_dict = {
    'normal': 0, 'neptune': 1, 'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,
    'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
    'ipsweep': 2, 'nmap': 2, 'portsweep': 2, 'satan': 2, 'mscan': 2, 'saint': 2,
    'ftp_write': 3, 'guess_passwd': 3, 'imap': 3, 'multihop': 3, 'phf': 3, 'spy': 3,
    'warezclient': 3, 'warezmaster': 3, 'sendmail': 3, 'named': 3, 'snmpgetattack': 3,
    'snmpguess': 3, 'xlock': 3, 'xsnoop': 3, 'httptunnel': 3, 'buffer_overflow': 4,
    'loadmodule': 4, 'perl': 4, 'rootkit': 4, 'ps': 4, 'sqlattack': 4, 'xterm': 4
}

# Fail loudly on labels the mapping does not know. A plain replace() would leave
# such names in place as strings and produce a mixed str/int label column.
# Already-encoded ids are accepted so that re-running this cell is harmless.
known_label_values = set(label_class_dict) | set(label_class_dict.values())
unmapped_labels = sorted(set(df['label'].unique()) - known_label_values, key=str)
if unmapped_labels:
    raise ValueError(f"Labels missing from label_class_dict: {unmapped_labels}")

# dict.get(raw, raw) encodes names and passes existing integer ids through unchanged.
df['label'] = df['label'].map(lambda raw: label_class_dict.get(raw, raw)).astype('int64')

In [None]:
# Convert features to strings
# The three symbolic feature columns; each one is cast to str in turn.
categorical_cols = ['protocol_type', 'service', 'flag']
for col in categorical_cols:
    df[col] = df[col].astype(str)

In [None]:
# Combine features into a single string
# Every feature value of a row is stringified and joined with single spaces.
# 'text' is dropped together with 'label' (errors='ignore' covers the first run,
# when 'text' does not exist yet) so that re-running this cell does not fold the
# previously generated text back into the new one.
feature_frame = df.drop(columns=['label', 'text'], errors='ignore')
df['text'] = feature_frame.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [None]:
# Create a Hugging Face Dataset
# Only the combined text and the encoded label are handed to Hugging Face;
# the whole frame goes into a single 'train' split for now.
text_label_frame = df[['text', 'label']]
hf_dataset = Dataset.from_pandas(text_label_frame)
dataset_dict = DatasetDict({'train': hf_dataset})

In [None]:
# Load the tokenizer
# Only the tokenizer is created in this cell; the teacher and student models
# are loaded in later cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry holding the strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings when
        called with max-length padding and truncation enabled.
    """
    texts = examples['text']
    encoding_options = {'padding': 'max_length', 'truncation': True}
    return tokenizer(texts, **encoding_options)

In [None]:
# Tokenize in batches, then rename the target column from "label" to "labels".
tokenized_datasets = (
    dataset_dict
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)
# Return torch tensors and expose only these three columns to the training code.
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/125973 [00:00<?, ? examples/s]

In [None]:
# Split the dataset
# A fixed seed keeps the 80/20 partition identical across kernel restarts.
SPLIT_SEED = 42

# Stored as `split_datasets` so the result does not overwrite the
# `train_test_split` function imported from sklearn in the imports cell.
split_datasets = tokenized_datasets['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

In [None]:
# Load the teacher model
# BERT-base with a 5-way classification head (one output per class id in
# label_class_dict).
# NOTE(review): the load warning below reports the classifier head as newly
# initialised, and no cell visible in this notebook fine-tunes the teacher
# before it is used for distillation -- confirm it is trained elsewhere,
# otherwise its soft targets carry no task knowledge.
teacher_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the student model
# DistilBERT with a 5-way classification head (same number of classes as the
# teacher). This is the object the training cells pass to the trainer as `model=`.
student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class DistillationTrainer(Trainer):
    """Trainer whose loss combines soft-target distillation with cross-entropy.

    For every batch the loss is::

        distill_weight * KL(softmax(teacher / T) || softmax(student / T))
        + cross_entropy(student_logits, labels)

    where ``T`` is ``temperature``. With the defaults (``T=2.0``,
    ``distill_weight=1.0``) this is exactly the sum the original
    implementation computed.

    Args:
        teacher_model: Model that provides the soft targets. It is moved to the
            training device, put in eval mode and only ever run under
            ``torch.no_grad()``.
        student_model: Kept as ``self.student_model`` for backward
            compatibility. The model that is actually optimised is the one
            passed to ``Trainer`` through ``model=``.
        temperature: Softmax temperature applied to both sets of logits.
            Must be positive.
        distill_weight: Multiplier on the KL term. Pass ``temperature ** 2``
            to apply the T^2 scaling commonly used with temperature-softened
            targets.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """

    def __init__(self, teacher_model, student_model, *args,
                 temperature=2.0, distill_weight=1.0, **kwargs):
        super().__init__(*args, **kwargs)
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        self.temperature = float(temperature)
        self.distill_weight = float(distill_weight)
        self.teacher_model = teacher_model.to(self.args.device)  # Move teacher model to specified device
        # The teacher only supplies targets, so keep dropout etc. disabled.
        self.teacher_model.eval()
        self.student_model = student_model.to(self.args.device)  # Move student model to specified device

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined distillation + classification loss for one batch.

        Args:
            model: The student being trained (supplied by ``Trainer``).
            inputs: Batch mapping; must contain ``"labels"`` plus the keyword
                arguments both models accept.
            return_outputs: When true, return ``(loss, student_outputs)``.
            num_items_in_batch: Accepted and ignored. Newer Transformers
                releases pass this keyword to ``compute_loss``; without it the
                override would raise ``TypeError`` there.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` if
            ``return_outputs`` is true.
        """
        device = self.args.device
        # Ensure tensors are on the right device; leave non-tensor entries alone.
        inputs = {
            key: (value.to(device) if isinstance(value, torch.Tensor) else value)
            for key, value in inputs.items()
        }
        labels = inputs["labels"]

        # The teacher only needs the features; withholding the labels avoids
        # computing a teacher loss that is never used.
        teacher_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        with torch.no_grad():
            teacher_logits = self.teacher_model(**teacher_inputs).logits

        # Forward pass with the student (labels included, as before, so the
        # returned outputs keep the same fields).
        student_outputs = model(**inputs)
        student_logits = student_outputs.logits

        # Soft-target term: KL divergence between temperature-softened distributions.
        distillation_loss = F.kl_div(
            F.log_softmax(student_logits / self.temperature, dim=-1),
            F.softmax(teacher_logits / self.temperature, dim=-1),
            reduction="batchmean",
        )

        # Hard-target term: ordinary cross-entropy against the true labels.
        classification_loss = F.cross_entropy(student_logits, labels)

        combined_loss = self.distill_weight * distillation_loss + classification_loss
        return (combined_loss, student_outputs) if return_outputs else combined_loss

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
from transformers import TrainingArguments, TrainerCallback
import numpy as np

# Custom callback to store metrics
class MetricsCallback(TrainerCallback):
    """Collects training losses and evaluation metrics reported by the Trainer.

    Attributes are plain dicts of lists:
      * ``train_history['train_loss']`` -- one entry per log call that
        contains a ``'loss'`` value.
      * ``eval_history[<metric name>]`` -- one entry per evaluation for every
        metric passed to ``on_evaluate``.
    """
    def __init__(self):
        self.train_history = {}
        self.eval_history = {}

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` defaults to None, so guard before the membership test.
        if logs and 'loss' in logs:
            self.train_history.setdefault('train_loss', []).append(logs['loss'])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # `metrics` defaults to None; treat that as "nothing to record".
        for key, value in (metrics or {}).items():
            self.eval_history.setdefault(key, []).append(value)

# Define the objective function for hyperparameter tuning
def objective(trial):
    """Run one distillation training and return the final validation loss.

    The function trains with a fixed configuration, evaluates, prints the
    accuracy on ``eval_dataset``, and shows a loss-curve figure and a
    confusion-matrix figure.

    NOTE(review): ``trial`` is accepted so the function has the shape of an
    Optuna objective, but nothing is sampled from it -- every call uses the
    same hyperparameters. Each call also keeps training the shared
    module-level ``student_model`` rather than a fresh copy. Confirm whether
    real hyperparameter search is intended.

    Args:
        trial: Unused (see note above).

    Returns:
        The ``'eval_loss'`` value from ``trainer.evaluate()``.
    """
    # Fixed training configuration (not a search space).
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=32,
        num_train_epochs=2,
        eval_strategy='epoch',
        save_strategy='epoch',
        logging_dir='./logs',
        logging_steps=10,
    )

    # Initialize callback and trainer
    metrics_callback = MetricsCallback()

    trainer = DistillationTrainer(
        teacher_model=teacher_model,
        student_model=student_model,
        model=student_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[metrics_callback]
    )

    # Training the model
    trainer.train()

    # Evaluate the model
    eval_result = trainer.evaluate()

    # Predict and calculate accuracy
    predictions, labels, _ = trainer.predict(eval_dataset)
    preds = np.argmax(predictions, axis=-1)
    accuracy = accuracy_score(labels, preds)
    print(f'Accuracy: {accuracy}')

    # Training loss is logged every `logging_steps` steps while validation loss
    # is logged once per evaluation, so the two series have different lengths.
    # Each one is therefore plotted against its own epoch values taken from the
    # trainer's log history instead of a shared 1..N range.
    log_history = trainer.state.log_history
    train_epochs, train_losses = _loss_series(log_history, 'loss')
    eval_epochs, eval_losses = _loss_series(log_history, 'eval_loss')

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(train_epochs, train_losses, 'b-', label='Training Loss')
    ax.plot(eval_epochs, eval_losses, 'ro-', label='Validation Loss')
    ax.set_title('Training and Validation Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    plt.show()

    # Plotting the confusion matrix
    cm = confusion_matrix(labels, preds)
    fig_cm, ax_cm = plt.subplots()
    sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', ax=ax_cm)
    ax_cm.set_title('Confusion Matrix')
    ax_cm.set_xlabel('Predicted labels')
    ax_cm.set_ylabel('True labels')
    plt.show()

    return eval_result['eval_loss']


def _loss_series(log_history, key):
    """Return ``(epochs, values)`` for every log entry that contains ``key``."""
    epochs, values = [], []
    for entry in log_history:
        if key in entry and 'epoch' in entry:
            epochs.append(entry['epoch'])
            values.append(entry[key])
    return epochs, values

In [None]:
import optuna
from transformers import TrainingArguments, TrainerCallback, Trainer
import numpy as np

class MetricsCallback(TrainerCallback):
    """Callback that records the losses reported through the Trainer's logs.

    This definition replaces the earlier ``MetricsCallback`` once the cell has
    run, so it keeps the ``train_history`` attribute that ``objective`` reads
    in addition to ``eval_history``:

      * ``train_history['train_loss']`` -- every logged ``'loss'`` value.
      * ``eval_history['eval_loss']`` -- every logged ``'eval_loss'`` value.
    """
    def __init__(self):
        super().__init__()
        self.train_history = {'train_loss': []}
        self.eval_history = {'eval_loss': []}

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` defaults to None; nothing to record in that case.
        if not logs:
            return
        if 'loss' in logs:
            self.train_history['train_loss'].append(logs['loss'])
        if 'eval_loss' in logs:
            self.eval_history['eval_loss'].append(logs['eval_loss'])

def train_model():
    """Train the student with distillation and return the metrics callback.

    Builds the training arguments, attaches a fresh ``MetricsCallback`` to a
    ``DistillationTrainer`` working on the module-level models and datasets,
    then runs ``train()`` followed by ``evaluate()``.

    Returns:
        The ``MetricsCallback`` instance that was attached to the trainer.
    """
    config = {
        'output_dir': './results',
        'per_device_train_batch_size': 32,
        'num_train_epochs': 2,
        'eval_strategy': 'epoch',
        'logging_dir': './logs',
        'logging_strategy': 'epoch',  # log once per epoch
    }
    training_args = TrainingArguments(**config)

    recorder = MetricsCallback()

    trainer_kwargs = dict(
        teacher_model=teacher_model,
        student_model=student_model,
        model=student_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[recorder],
    )
    distiller = DistillationTrainer(**trainer_kwargs)

    distiller.train()
    distiller.evaluate()
    return recorder

# Train the model and save the metrics
# Calling train_model() runs the whole distillation training when this cell
# executes; the returned callback object holds the eval_loss values it logged.
metrics_callback = train_model()

Epoch,Training Loss,Validation Loss
1,0.4752,0.457362
2,0.4577,0.454982
