In [2]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn 
from sklearn.model_selection import train_test_split 

import json 
import copy 
import gc 
import os 
import re 
from collections import defaultdict
from pathlib import Path 

from transformers import AutoTokenizer 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from spacy.lang.en import English 
from transformers.tokenization_utils import PreTrainedTokenizerBase 
from transformers.models.deberta_v2 import (
    DebertaV2ForTokenClassification,
    DebertaV2TokenizerFast,
)
from transformers.trainer import Trainer 
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction 
from transformers.data.data_collator import DataCollatorForTokenClassification
from datasets import (
    Dataset, 
    DatasetDict, 
    concatenate_datasets,
    features
)
from transformers import AutoConfig

In [4]:
import argparse 
from itertools import chain 
from functools import partial 

from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding

import random 

In [5]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
SEED = 42
seed_everything(SEED)

In [6]:
TRAINING_MODEL_PATH ='microsoft/deberta-v3-large'

In [7]:
data_1 = pd.read_csv('SQL_questions_fin.csv',header=None)
data_0 = pd.read_csv('TDR_questions_fin.csv',header=None)

data_1['labels'] = 1
data_0['labels'] = 0

data = pd.concat([data_0,data_1],axis=0)
data.columns = ['text_data','label']

print(data_1.shape)
print(data_0.shape)
print(data.shape)

(200, 2)
(200, 2)
(400, 2)


In [8]:
data

Unnamed: 0,text_data,label
0,What security events are associated with the u...,0
1,Analyze all security events on all hosts and i...,0
2,Analyze these events and generate visual repre...,0
3,Analyze security events from each of the hosts...,0
4,List the hosts with their host names that have...,0
...,...,...
195,What recovery group has the most assets with v...,1
196,What recovery group has the most configuration...,1
197,What asset type is most likely to have a vulne...,1
198,What proportion of assets fall into int mainte...,1


In [9]:
class Tokenize(object):
    def __init__(self,train,valid,tokenizer):
        self.tokenizer = tokenizer
        self.train = train
        self.valid =valid 
    
    def get_dataset(self,df):
        ds = Dataset.from_dict({
        'text_data': [ft for ft in df['text_data']],
        'label': [s for s in df['label']]
        })
        
        return ds
    
    def tokenize_function(self, example): 
        tokenized_inputs = self.tokenizer(example['text_data'], truncation=True, max_length= 512)
        
        return tokenized_inputs
    
    def __call__(self):
        
        print(len(self.train))
        print(len(self.valid))
        
        train_ds = self.get_dataset(self.train)
        valid_ds = self.get_dataset(self.valid)
        
        tokenized_train = train_ds.map(self.tokenize_function, batched = True)
        tokenized_valid = valid_ds.map(self.tokenize_function, batched = True)
        
        return tokenized_train, tokenized_valid, self.tokenizer

In [10]:
train,valid = train_test_split(data)

In [11]:
train

Unnamed: 0,text_data,label
47,How many assets does Michael Brown own?,1
110,Describe in detail the activities on host15.,0
16,Create a detailed playbook for the security th...,0
66,List the hosts with their host names that have...,0
153,Compare the attacks on host5 and wstp-spara013.,0
...,...,...
71,What was the source of the attack on host22?,0
106,Analyze all security events on host3 and ident...,0
70,What are the configuration items associated wi...,1
148,How many vulnerabilities are in each Medium?,1


In [12]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
config = AutoConfig.from_pretrained(TRAINING_MODEL_PATH)



In [13]:
tokenize = Tokenize(train,valid,tokenizer)
tokenized_train,tokenized_valid, _ = tokenize()

300
100


Map: 100%|██████████| 300/300 [00:00<00:00, 18308.54 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 12088.72 examples/s]


In [14]:
model = AutoModelForSequenceClassification.from_pretrained(TRAINING_MODEL_PATH)

collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=16)

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
OUTPUT_DIR = os.getcwd()

In [26]:
OUTPUT_DIR 

'/root'

In [16]:
args = TrainingArguments(
    output_dir = OUTPUT_DIR,
    gradient_checkpointing = True, 
    learning_rate = 1e-4,
    num_train_epochs = 30,
    per_device_train_batch_size=4,
    report_to="none",
    evaluation_strategy="steps",
    save_total_limit = 2,
    eval_steps = 2,
    do_eval=False,
    logging_steps=1,
    lr_scheduler_type='cosine',
    metric_for_best_model='accuracy',
    load_best_model_at_end=True, 
    save_strategy='steps',
    greater_is_better=True,
    warmup_ratio=0.1, 
    weight_decay=0.01,
    save_steps=2,
)



In [17]:
from sklearn.metrics import accuracy_score 

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred 
    predictions = np.argmax(logits, axis = -1)
    accuracy = accuracy_score(y_true = labels, y_pred = predictions)
    
    return {"accuracy": accuracy}


In [19]:
trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=tokenized_train, 
    eval_dataset=tokenized_valid,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
2,0.6589,0.692174,0.52
4,0.6864,0.691414,0.52
6,0.6506,0.690106,0.52
8,0.7482,0.688442,0.52
10,0.7164,0.686247,0.53
12,0.6719,0.683685,0.62
14,0.6908,0.679831,0.9
16,0.6052,0.671641,0.87
18,0.6481,0.635192,0.8
20,0.7297,0.622964,0.8


KeyboardInterrupt: 

In [31]:
modelpath = '/root/checkpoint-52'

In [32]:
model = AutoModelForSequenceClassification.from_pretrained(modelpath)
tokenizer = tokenizer = AutoTokenizer.from_pretrained(modelpath)

In [33]:
test_questions_tdr = pd.read_csv('TDR_questions_test.csv',header=None)
test_questions_sql = pd.read_csv('SQL_questions_test.csv',header=None)
test_questions_sql = test_questions_sql.head(100)
test_questions_tdr['label'] = 0
test_questions_sql['label'] = 1

test_questions = pd.concat([test_questions_tdr,test_questions_sql],axis=0)

In [34]:
test = list(test_questions[0])

In [35]:
from transformers import TextClassificationPipeline
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [36]:
res = []
for i in test:
    prediction = pipe(i)
    res.append(list(prediction[0].values()))

In [37]:
res_df = pd.DataFrame(res, columns = ['pred_label', 'score']) 
res_df['pred_label'].replace(['LABEL_0'], 0, inplace=True)
res_df['pred_label'].replace(['LABEL_1'], 1, inplace=True)
res_df

Unnamed: 0,pred_label,score
0,0,0.853580
1,0,0.845075
2,0,0.839732
3,0,0.829383
4,0,0.855598
...,...,...
196,1,0.917490
197,1,0.986738
198,1,0.990676
199,1,0.989869


In [38]:
test_questions.columns = ['Question','label']

In [39]:
test_questions

Unnamed: 0,Question,label
0,Analyze historical security events to identify...,0
1,Review logs to identify any usage of deprecat...,0
2,Investigate any unauthorized attempts to acce...,0
3,Assess the effectiveness of Data Loss Prevent...,0
4,Review logs for any signs of unauthorized cha...,0
...,...,...
95,What is the total revenue from sales of 'Orga...,1
96,How many orders were placed with expedited sh...,1
97,What is the total number of customer accounts...,1
98,How many products were restocked?,1


In [40]:
test_questions = test_questions.reset_index()
res_df = res_df.reset_index()

In [41]:
result = pd.concat([test_questions, res_df], axis=1)

In [42]:
result = result.drop(columns=['index'])
result

Unnamed: 0,Question,label,pred_label,score
0,Analyze historical security events to identify...,0,0,0.853580
1,Review logs to identify any usage of deprecat...,0,0,0.845075
2,Investigate any unauthorized attempts to acce...,0,0,0.839732
3,Assess the effectiveness of Data Loss Prevent...,0,0,0.829383
4,Review logs for any signs of unauthorized cha...,0,0,0.855598
...,...,...,...,...
196,What is the total revenue from sales of 'Orga...,1,1,0.917490
197,How many orders were placed with expedited sh...,1,1,0.986738
198,What is the total number of customer accounts...,1,1,0.990676
199,How many products were restocked?,1,1,0.989869
