In [26]:
import pandas as pd
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from dotenv import dotenv_values
import sys
sys.path.insert(0,'/workspaces/RAG_secure_code_generation/src')
from utils.utils import load_yaml, init_argument_parser, sanitize_output, fill_default_parameters
from langchain.prompts import (
    ChatPromptTemplate, PromptTemplate
)
from utils.openai_utils import is_openai_model, build_chat_model
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
import random
import numpy as np
from functools import partial
from typing import List
from langchain.embeddings import OpenAIEmbeddings


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders import WebBaseLoader
from utils.custom_grobid_parser import CustomGrobidParser
from langchain.docstore.document import Document
from langchain_core.embeddings import Embeddings
import bs4
from langchain_core.runnables import RunnablePassthrough
from utils.rag_utils import build_scientific_papers_loader, build_documents_retriever, format_docs, build_web_page_loader
from utils.openai_utils import is_openai_model, build_chat_model


In [27]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [28]:
seed = 156
np.random.seed(seed)
random.seed(seed)

In [29]:
template_file = "../data/templates/complete_function_readable.yaml"
rag_template_file = "../data/rag_templates/basic_rag_suffix.txt"
task_file = "../data/tasks/detect_xss_simple_prompt.txt"
parameters_file = "../data/prompt_parameters/empty.yaml"
papers_folder = "../data/papers/"
model_name = "gpt-3.5-turbo-0613"
db_persist_path = "../data/db/chroma"
db_persist_path_web_test = "../data/db/chroma_web_test"
db_persist_path_blog = "../data/db/chroma_web_blog"
db_persist_path_book = "../data/db/chroma_web_book"

In [30]:
env = dotenv_values()

In [31]:
template = load_yaml(template_file)
    # load parameters
prompt_parameters = load_yaml(parameters_file)

#read txt containing the task
with open(task_file) as f:
    prompt_parameters["input"] = f.read()
prompt_parameters = fill_default_parameters(prompt_parameters, template["default_parameters"])
with open(rag_template_file) as f:
    template['input'] += "\n" + f.read()

use_openai_api = is_openai_model(model_name)
openai_key = env['OPENAI_API_KEY']
model = ChatOpenAI(temperature=0, openai_api_key=openai_key, model=model_name)
embeddings = OpenAIEmbeddings(api_key = openai_key)


In [32]:
#docs =  build_scientific_papers_loader(papers_folder)
#retriever = build_documents_retriever(docs, db_persist_path=db_persist_path_web_test)


In [33]:
url = "https://n3t-hunt3r.gitbook.io/pentest-book/"
docs =  build_web_page_loader(url)

retriever = build_documents_retriever(docs, db_persist_path=db_persist_path_book, embeddings=embeddings, chunk_size=5000, chunk_overlap = 500)


In [42]:
len(docs)

1

In [43]:
#url_filter_cheat_sheet = "https://cheatsheetseries.owasp.org/cheatsheets/XSS_Filter_Evasion_Cheat_Sheet.html"
url_testing = "https://owasp.org/www-project-web-security-testing-guide/latest/4-Web_Application_Security_Testing/07-Input_Validation_Testing/01-Testing_for_Reflected_Cross_Site_Scripting"

In [44]:
#docs_online = build_web_page_loader(url_testing)
##retriever_web = build_documents_retriever(docs_online, db_persist_path=db_persist_path_web_test)


In [45]:
prompt = ChatPromptTemplate.from_messages([("system", template['input']), ("human", "{input}")])


In [46]:
prompt_parameters["input"]

'def detect_xss(http_get_request: str)->bool: \n""" Check if in the given http_get_request there is an XSS exploit, considering also the possible evasions that an attacker can perform.""" '

In [47]:
prompt_with_examples = '''
def detect_xss(http_get_request: str)->bool: 
""" Check if in the given http_get_request there is an XSS exploit, considering also the possible evasions that an attacker can perform.""" 

'''

In [57]:
comment = "what are the escapes that a malicious attack can perform to bypass XSS detection."
ex = "unescape(/%78%u0073%73/.source)"
example = "http://www.concours.agriculture.gouv.fr/calendrier/sommaire.cfm?cal_cdn=%22%3e'%3e%3cscript%20src=ht<br>tp://vuln.xssed.net/thirdparty/scripts/python5.js%3e%3c/script%3e"
rets = retriever.get_relevant_documents(comment, k=22)

In [58]:
rets

[Document(page_content='Header InjectionUnicode Normalization vulnerabilityRegistration VulnerabilitiesRace ConditionPowered By GitBookXSS Filter Evasion and WAF Bypassing TacticsWe will analyze various levels of evasion and bypassing tactics for XSS payloads.IntroductionCross-Site Scripting (XSS) attacks are a type of injection in which malicious scripts are injected into otherwise trustworthy websites. The flaws that allow these attacks to succeed are common and can be found whenever a web application accepts user input in its output without verifying or encoding it.Many security researchers have created guides and cheat sheets to aid security professionals in the testing of Cross-Site Scripting problems over the years. The most well-known is "XSS Filter Evasion Cheat Sheet," which was produced by RSnake and then donated to OWASP. Cure53\'s HTML5 Security Cheatsheet is another intriguing initiative.In this book, we will not analyze the vectors reported in the cheat sheet one by one, 

In [20]:
chain =  prompt | model | StrOutputParser() | sanitize_output


In [21]:
prompt_parameters["context"] = retriever | format_docs

In [22]:
prompt_parameters

{'input': 'def detect_xss(http_get_request: str)->bool: \n""" Check if in the given http_get_request there is an XSS exploit, considering also the possible evasions that an attacker can perform.""" ',
 'context': VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f9a909cae90>)
 | RunnableLambda(...)}

In [23]:
print(prompt.format(**prompt_parameters))


System: The user will provide the initial part of a python function (function name, parameters with types, return type and a comment describing the purpose of the function) with some optional example. 
You are a coding assistant that writes some python code to complete the user's function. Be sure that the code is syntactically correct and that it returns the correct type. 
The length of the code should be short, it should be readable, and without redundant checks on the payload. 
Return only python code in Markdown format, e.g.:
```python
....
```
Use the following pieces of retrieved context to write a more complete function:
Context: first=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f9a909cae90>) last=RunnableLambda(...)

Human: def detect_xss(http_get_request: str)->bool: 
""" Check if in the given http_get_request there is an XSS exploit, considering also the possible evasions that an attacker can perform

In [24]:
response = chain.invoke(prompt_parameters)


In [25]:
response

'\nimport re\n\ndef detect_xss(http_get_request: str) -> bool:\n    # Check for common XSS patterns\n    patterns = [\n        r"<script.*?>.*?</script>",\n        r"javascript:",\n        r"on\\w+",\n        r"alert\\(",\n        r"eval\\(",\n        r"document\\."\n    ]\n    \n    for pattern in patterns:\n        if re.search(pattern, http_get_request, re.IGNORECASE):\n            return True\n    \n    return False\n'