In [2]:
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn 
from sklearn.model_selection import train_test_split 

import json 
import copy 
import gc 
import os 
import re 
from collections import defaultdict
from pathlib import Path 

from transformers import AutoTokenizer 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from spacy.lang.en import English 
from transformers.tokenization_utils import PreTrainedTokenizerBase 
from transformers.models.deberta_v2 import (
    DebertaV2ForTokenClassification,
    DebertaV2TokenizerFast,
)
from transformers.trainer import Trainer 
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction 
from transformers.data.data_collator import DataCollatorForTokenClassification
from datasets import (
    Dataset, 
    DatasetDict, 
    concatenate_datasets,
    features
)
from transformers import AutoConfig

ModuleNotFoundError: No module named 'datasets'

In [None]:
import argparse 
from itertools import chain 
from functools import partial 

from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding

import random 

In [None]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
SEED = 42
seed_everything(SEED)

In [None]:
TRAINING_MODEL_PATH ='microsoft/deberta-v3-large'

In [None]:
data_1 = pd.read_csv('SQL_questions.csv',header=None)
data_0 = pd.read_csv('TDR_questions_new.csv',header=None)

data_1['labels'] = 1
data_0['labels'] = 0

data = pd.concat([data_0,data_1],axis=0)
data.columns = ['text_data','label']

print(data_1.shape)
print(data_0.shape)
print(data.shape)

In [7]:
data

Unnamed: 0,text_data,label
0,What security events are associated with the ...,0
1,Analyze all security events on all hosts and ...,0
2,Analyze these events and generate visual repr...,0
3,Analyze security events from each of the host...,0
4,List the hosts with their host names that hav...,0
...,...,...
95,What is the average number of items per order...,1
96,How many orders were placed during promotiona...,1
97,Which products have been part of the most bun...,1
98,How many customers have placed orders on both...,1


In [8]:
class Tokenize(object):
    def __init__(self,train,valid,tokenizer):
        self.tokenizer = tokenizer
        self.train = train
        self.valid =valid 
    
    def get_dataset(self,df):
        ds = Dataset.from_dict({
        'text_data': [ft for ft in df['text_data']],
        'label': [s for s in df['label']]
        })
        
        return ds
    
    def tokenize_function(self, example): 
        tokenized_inputs = self.tokenizer(example['text_data'], truncation=True, max_length= 512)
        
        return tokenized_inputs
    
    def __call__(self):
        
        print(len(self.train))
        print(len(self.valid))
        
        train_ds = self.get_dataset(self.train)
        valid_ds = self.get_dataset(self.valid)
        
        tokenized_train = train_ds.map(self.tokenize_function, batched = True)
        tokenized_valid = valid_ds.map(self.tokenize_function, batched = True)
        
        return tokenized_train, tokenized_valid, self.tokenizer

In [9]:
train,valid = train_test_split(data)

In [10]:
train

Unnamed: 0,text_data,label
14,What is the total amount spent on office supp...,1
73,How many products have been restocked in 2023?,1
5,Analyze the activity of John Doe. Was there a...,0
26,How many orders were placed in the first quar...,1
17,How many different products were sold in Marc...,1
...,...,...
6,What are the names of customers who have made...,1
14,Investigate any alerts triggered by Intrusion...,0
92,Analyze the security of file sharing solution...,0
79,How many orders were placed by customers who ...,1


In [11]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
config = AutoConfig.from_pretrained(TRAINING_MODEL_PATH)



In [12]:
tokenize = Tokenize(train,valid,tokenizer)
tokenized_train,tokenized_valid, _ = tokenize()

150
50


Map: 100%|██████████| 150/150 [00:00<00:00, 10407.36 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 6382.28 examples/s]


In [13]:
model = AutoModelForSequenceClassification.from_pretrained(TRAINING_MODEL_PATH)

collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=16)

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
OUTPUT_DIR = os.getcwd()

In [15]:
args = TrainingArguments(
    output_dir = OUTPUT_DIR,
    gradient_checkpointing = True, 
    learning_rate = 1e-4,
    num_train_epochs = 30,
    per_device_train_batch_size=4,
    report_to="none",
    evaluation_strategy="steps",
    save_total_limit = 2,
    eval_steps = 2,
    do_eval=False,
    logging_steps=1,
    lr_scheduler_type='cosine',
    metric_for_best_model='accuracy',
    load_best_model_at_end=True, 
    save_strategy='steps',
    greater_is_better=True,
    warmup_ratio=0.1, 
    weight_decay=0.01,
    save_steps=2,
)



In [17]:
from sklearn.metrics import accuracy_score 

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred 
    predictions = np.argmax(logits, axis = -1)
    accuracy = accuracy_score(y_true = labels, y_pred = predictions)
    
    return {"accuracy": accuracy}


In [19]:
trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=tokenized_train, 
    eval_dataset=tokenized_valid,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
2,0.6492,0.689763,0.54
4,0.7075,0.688651,0.54
6,0.64,0.687118,0.54
8,0.6778,0.684539,0.54
10,0.6974,0.680485,0.54
12,0.5923,0.673837,0.54
14,0.7468,0.660566,0.54
16,0.5873,0.581308,0.92
18,0.6052,0.503583,1.0
20,0.4474,0.421583,1.0


In [16]:
modelpath = '/root/checkpoint-50'

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(modelpath)
tokenizer = tokenizer = AutoTokenizer.from_pretrained(modelpath)

In [15]:
test = ['Are there any systems with major threats','Review Logs for any signs of DNS spoofing']

In [16]:
test_questions_tdr = pd.read_csv('TDR_questions_test.csv',header=None)
test_questions_sql = pd.read_csv('SQL_questions_test.csv',header=None)
test_questions_sql = test_questions_sql.head(100)
test_questions_tdr['label'] = 0
test_questions_sql['label'] = 1

test_questions = pd.concat([test_questions_tdr,test_questions_sql],axis=0)

In [17]:
test = list(test_questions[0])

In [18]:
from transformers import TextClassificationPipeline
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [19]:
prediction = pipe('What assets have annual maintenance frequinacy')
prediction[0]

{'label': 'LABEL_1', 'score': 0.8599000573158264}

In [20]:
res = []
for i in test:
    prediction = pipe(i)
    res.append(list(prediction[0].values()))

In [21]:
res_df = pd.DataFrame(res, columns = ['pred_label', 'score']) 
res_df['pred_label'].replace(['LABEL_0'], 0, inplace=True)
res_df['pred_label'].replace(['LABEL_1'], 1, inplace=True)
res_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  res_df['pred_label'].replace(['LABEL_0'], 0, inplace=True)
  res_df['pred_label'].replace(['LABEL_1'], 1, inplace=True)


Unnamed: 0,pred_label,score
0,0,0.999818
1,0,0.999866
2,0,0.999780
3,0,0.999887
4,0,0.999802
...,...,...
196,1,0.999745
197,1,0.999713
198,1,0.999708
199,1,0.999679


In [22]:
test_questions.columns = ['Question','label']

In [23]:
test_questions

Unnamed: 0,Question,label
0,Analyze historical security events to identify...,0
1,Review logs to identify any usage of deprecat...,0
2,Investigate any unauthorized attempts to acce...,0
3,Assess the effectiveness of Data Loss Prevent...,0
4,Review logs for any signs of unauthorized cha...,0
...,...,...
95,What is the total revenue from sales of 'Orga...,1
96,How many orders were placed with expedited sh...,1
97,What is the total number of customer accounts...,1
98,How many products were restocked?,1


In [24]:
res_df

Unnamed: 0,pred_label,score
0,0,0.999818
1,0,0.999866
2,0,0.999780
3,0,0.999887
4,0,0.999802
...,...,...
196,1,0.999745
197,1,0.999713
198,1,0.999708
199,1,0.999679


In [25]:
test_questions = test_questions.reset_index()
res_df = res_df.reset_index()

In [26]:
result = pd.concat([test_questions, res_df], axis=1)

In [27]:
result = result.drop(columns=['index'])
result

Unnamed: 0,Question,label,pred_label,score
0,Analyze historical security events to identify...,0,0,0.999818
1,Review logs to identify any usage of deprecat...,0,0,0.999866
2,Investigate any unauthorized attempts to acce...,0,0,0.999780
3,Assess the effectiveness of Data Loss Prevent...,0,0,0.999887
4,Review logs for any signs of unauthorized cha...,0,0,0.999802
...,...,...,...,...
196,What is the total revenue from sales of 'Orga...,1,1,0.999745
197,How many orders were placed with expedited sh...,1,1,0.999713
198,What is the total number of customer accounts...,1,1,0.999708
199,How many products were restocked?,1,1,0.999679


In [None]:
import

In [28]:
accuracy_score(y_true = result['label'], y_pred = result['pred_label'])

NameError: name 'accuracy_score' is not defined

Add sql agent 

In [4]:
from langchain_community.utilities import SQLDatabase
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [5]:
sqlite_uri = 'sqlite:///./snyth.db' 
db = SQLDatabase.from_uri(sqlite_uri)

In [6]:
from langchain_core.prompts import ChatPromptTemplate

template = """Based on the table schema below, write a SQL query that would answer the user's question:
{schema}

Question: {question}
SQL Query:"""
prompt = ChatPromptTemplate.from_template(template)

In [7]:
def get_schema(_):
    schema = db.get_table_info()
    return schema

In [8]:
key = 'sk-proj-WVAYZM6xMzLkvMYc3p2pT3BlbkFJnlTUh1OzdtEVL1TT7Aq8'

In [9]:
llm = ChatOpenAI(openai_api_key=key)

sql_chain = (
    RunnablePassthrough.assign(schema=get_schema)
    | prompt
    | llm.bind(stop=["\nSQLResult:"])
    | StrOutputParser()
)


In [10]:
template = """Based on the table schema below, question, sql query, and sql response, write a natural language response:
{schema}

Question: {question}
SQL Query: {query}
SQL Response: {response}"""
prompt_response = ChatPromptTemplate.from_template(template)

In [11]:
def run_query(query):
    return db.run(query)

In [12]:
full_chain = (
    RunnablePassthrough.assign(query=sql_chain).assign(
        schema=get_schema,
        response=lambda vars: run_query(vars["query"]),
    )
    | prompt_response
    | llm
)

In [13]:
def sql_answer(user_question):
    return full_chain.invoke({"question": user_question}).content

In [14]:
user_question = 'what is the average customer lifetime value?'
sql_chain.invoke({"question": user_question})

'SELECT AVG(total_vis_count) AS average_customer_lifetime_value\nFROM vulnerability;'

In [40]:
asset id -> asset_id 

SyntaxError: invalid syntax (650395697.py, line 1)

In [41]:
db.run('SELECT AVG(total_vis_count) AS avg_customer_lifetime_value\nFROM vulnerability;')

'[(0.0,)]'

# Routing Agent with Result

In [53]:
def routing_agent(user_question):
    prediction = pipe(user_question)
    print(prediction)
    if prediction[0]['label'] == 'LABEL_1':
        sql_result = sql_answer(user_question)
        return sql_result
    if prediction[0]['label'] == 'LABEL_0':
        tdr_result = "This is a tdr question"
        return tdr_result

In [63]:
question = 'What registered asset has the most vulnerabilities?'
# question = 'What security events occured during the last week?'

In [64]:
routing_agent(question)

[{'label': 'LABEL_1', 'score': 0.9295670986175537}]


"The registered asset with the most vulnerabilities is 'asset0430', which has a total of 10 vulnerabilities associated with it."