In [1]:
import pandas as pd
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from dotenv import dotenv_values
import sys
sys.path.insert(0,'/workspaces/RAG_secure_code_generation/src')
from utils.utils import load_yaml, init_argument_parser, sanitize_output, fill_default_parameters
from langchain.prompts import (
    ChatPromptTemplate, PromptTemplate
)
from utils.openai_utils import is_openai_model, build_chat_model
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
import random
import numpy as np
from functools import partial
from typing import List
from langchain.embeddings import OpenAIEmbeddings


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders import WebBaseLoader
from utils.custom_grobid_parser import CustomGrobidParser
from langchain.docstore.document import Document
from langchain_core.embeddings import Embeddings
import bs4
from langchain_core.runnables import RunnablePassthrough
from utils.rag_utils import build_scientific_papers_loader, build_documents_retriever, format_docs, build_web_page_loader


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [6]:
seed = 156
np.random.seed(seed)
random.seed(seed)

In [5]:
template_file = "../data/templates/complete_function_readable.yaml"
rag_template_file = "../data/rag_templates/basic_rag_suffix.txt"
task_file = "../data/tasks/detect_xss_simple_prompt.txt"
parameters_file = "../data/prompt_parameters/empty.yaml"
papers_folder = "../data/papers/"
model_name = "gpt-3.5-turbo-0613"
db_persist_path = "../data/db/chroma"
db_persist_path_web_test = "../data/db/chroma_web_test"

In [6]:
env = dotenv_values()

In [7]:
template = load_yaml(template_file)
    # load parameters
prompt_parameters = load_yaml(parameters_file)

#read txt containing the task
with open(task_file) as f:
    prompt_parameters["input"] = f.read()
prompt_parameters = fill_default_parameters(prompt_parameters, template["default_parameters"])
with open(rag_template_file) as f:
    template['input'] += "\n" + f.read()

use_openai_api = is_openai_model(model_name)
openai_key = env['OPENAI_API_KEY']
model = ChatOpenAI(temperature=0, openai_api_key=openai_key, model=model_name)

In [9]:
docs =  build_scientific_papers_loader(papers_folder)
retriever = build_documents_retriever(docs, db_persist_path=db_persist_path_web_test)


  if self._task_type is "RETRIEVAL_DOCUMENT":


In [10]:
docs[-2]

Document(page_content='This study seeks to explore the effectiveness of XSS detection models.For this purpose, we provide an XSS adversarial attack model based on the Soft Actor-Critic algorithm.In order to improve the escape rate, we design corresponding mutation strategies for different modules of the XSS attack vectors, and then train a reinforcement learning agent to select strategies, so as to make the detector output wrong results.The experiment results showed that our method can achieve an escape rate of over 92%.This also shows that the detection models based on deep learning can effectively detect XSS attacks, but it cannot resist adversarial attacks.Therefore, the key to ensuring the security of machine learning and deep learning in various applications is to further study the adversarial attack technology and put forward more useful defense methods.', metadata={'text': 'This study seeks to explore the effectiveness of XSS detection models.For this purpose, we provide an XSS 

In [14]:
#url_filter_cheat_sheet = "https://cheatsheetseries.owasp.org/cheatsheets/XSS_Filter_Evasion_Cheat_Sheet.html"
url_testing = "https://owasp.org/www-project-web-security-testing-guide/latest/4-Web_Application_Security_Testing/07-Input_Validation_Testing/01-Testing_for_Reflected_Cross_Site_Scripting"

In [2]:
#docs_online = build_web_page_loader(url_testing)
##retriever_web = build_documents_retriever(docs_online, db_persist_path=db_persist_path_web_test)


NameError: name 'url_testing' is not defined

In [11]:
prompt = ChatPromptTemplate.from_messages([("system", template['input']), ("human", "{input}")])


In [17]:
prompt_parameters["input"]

'def detect_xss(http_get_request: str)->bool: \n""" Check if in the given http_get_request there is an XSS exploit, considering also the possible evasions that an attacker can perform.""" '

In [12]:
comment = "what are the actions that a malicious attack can perform to avoid XSS detection."
ex = "Add ”&#x09” to ”javascript"
rets = retriever.get_relevant_documents(template["input"], k=22)

In [13]:
rets

[Document(page_content='The existing research has the problems of low escape rate and coding misuse.To overcome these shortcomings, we propose mutation rules for different modules of the XSS attack vector.Then, an intelligent agent is trained to generate adversarial examples according to these mutation rules to improve the escape rate.', metadata={'bboxes': "[[{'page': '3', 'x': '318.55', 'y': '442.56', 'h': '239.37', 'w': '7.66'}, {'page': '3', 'x': '306.60', 'y': '453.02', 'h': '55.75', 'w': '7.66'}], [{'page': '3', 'x': '365.75', 'y': '453.02', 'h': '192.10', 'w': '7.66'}, {'page': '3', 'x': '306.60', 'y': '463.48', 'h': '227.35', 'w': '7.66'}], [{'page': '3', 'x': '537.74', 'y': '463.48', 'h': '20.17', 'w': '7.66'}, {'page': '3', 'x': '306.60', 'y': '473.95', 'h': '251.28', 'w': '7.66'}, {'page': '3', 'x': '306.60', 'y': '484.41', 'h': '224.86', 'w': '7.66'}]]", 'file_path': '../data/papers/04 - xss.pdf', 'pages': "('3', '3')", 'paper_title': 'XSS adversarial example attacks based 

In [25]:
chain =  prompt | model | StrOutputParser() | sanitize_output


NameError: name 'prompt' is not defined

In [39]:
prompt_parameters["context"] = retriever | format_docs

In [40]:
prompt_parameters

{'input': 'def detect_xss(http_get_request: str)->bool: \n""" Check if in the given http_get_request there is an XSS exploit, considering also the possible evasions that an attacker can perform.""" ',
 'context': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f4ffa2bbdf0>)
 | RunnableLambda(...)}

In [41]:
print(prompt.format(**prompt_parameters))


System: The user will provide the initial part of the function (function name, parameters with types, return type and a comment describing the purpose of the function, with some optional example. 
You are a coding assistant that writes some python code to complete the user's function. Be sure that the code is syntactically correct and that it returns the correct type. 
The lenght of the code should short, readable, and without redundant checks on the paylaod. 
Return only python code in Markdown format, e.g.:
```python
....
```
Use the following pieces of retrieved context to write a more complete function:
Context: first=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f4ffa2bbdf0>) last=RunnableLambda(...)

Human: def detect_xss(http_get_request: str)->bool: 
""" Check if in the given http_get_request there is an XSS exploit, considering also the possible evasions that an attacker can perform.""" 


In [42]:
response = chain.invoke(prompt_parameters)


In [43]:
response

'\nimport re\n\ndef detect_xss(http_get_request: str) -> bool:\n    # Regular expression pattern to match potential XSS payloads\n    xss_pattern = re.compile(r"<script.*?>.*?</script>|<.*?on\\w+=.*?>")\n\n    # Check if the HTTP GET request contains any potential XSS payload\n    if re.search(xss_pattern, http_get_request):\n        return True\n    else:\n        return False\n'