In [1]:
import pandas as pd
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from dotenv import dotenv_values
import sys
sys.path.insert(0,'/workspaces/RAG_secure_code_generation/src')
from utils.utils import load_yaml, init_argument_parser, sanitize_output, fill_default_parameters
from langchain.prompts import (
    ChatPromptTemplate, PromptTemplate
)
from utils.openai_utils import is_openai_model, build_chat_model
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
import random
import numpy as np
from functools import partial
from typing import List
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers import CommaSeparatedListOutputParser


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders import WebBaseLoader
from utils.custom_grobid_parser import CustomGrobidParser
from langchain.docstore.document import Document
from langchain_core.embeddings import Embeddings
import bs4
from langchain_core.runnables import RunnablePassthrough
from utils.rag_utils import build_scientific_papers_loader, build_documents_retriever, format_docs, build_web_page_loader
from utils.openai_utils import is_openai_model, build_chat_model
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import (
    SystemMessagePromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from pydantic.v1 import Field ,BaseModel, validator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One fixed seed for both global random number generators (the standard
# library's and NumPy's), so any random draw made later is repeatable.
seed = 156
random.seed(seed)
np.random.seed(seed)

In [3]:
# Name of the chat model; passed to ChatOpenAI and to is_openai_model() below.
model_name = "gpt-4-0125-preview"


In [93]:
# Plain-text task description that becomes the prompt's "input" variable.
# The disabled line below points at the alternative "is even" task.
task_file = "../data/tasks/detect_xss_simple_prompt.txt"
# task_file = "../data/tasks/is_even.txt"

# YAML prompt template and the YAML file holding the prompt parameters.
template_file = "../data/templates/create_synthetic_dataset.yaml"
parameters_file = "../data/prompt_parameters/syn_xss_100.yaml"

In [94]:
# Load settings through python-dotenv; `env` is indexed by key in the cells
# below to obtain the OpenAI API key.
env = dotenv_values()

In [95]:
# Look the API key up once and hand it to both OpenAI clients: the chat
# model (temperature 0) and the embedding model.
openai_key = env["OPENAI_API_KEY"]
model = ChatOpenAI(
    temperature=0,
    openai_api_key=openai_key,
    model=model_name,
)
embeddings = OpenAIEmbeddings(api_key=openai_key)

In [96]:
# Load the YAML prompt template and the YAML prompt parameters.
template = load_yaml(template_file)
prompt_parameters = load_yaml(parameters_file)

# The task description is stored as plain text and becomes the "input" prompt
# variable. Decode it explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.
with open(task_file, encoding="utf-8") as f:
    prompt_parameters["input"] = f.read()

# Presumably fills parameters missing from the parameters file with the
# template's defaults — confirm against utils.utils.fill_default_parameters.
prompt_parameters = fill_default_parameters(prompt_parameters, template["default_parameters"])
use_openai_api = is_openai_model(model_name)

# openai_key, model and embeddings are already created in the previous cell
# with exactly these arguments, so they are not rebuilt here.

In [97]:
class XSS_row(BaseModel):
    # One labelled sample of the XSS dataset: a request string plus its class.
    # NOTE(review): documented with `#` comments on purpose — a class docstring
    # would likely be emitted as "description" in the generated JSON schema and
    # so change the format instructions sent to the model; confirm if changed.
    Payloads: str = Field(
        description="a string representing an http get request with payload",
    )
    Class: str = Field(
        description="a string representing the class of the http get request, it is Malicious if the http get request contains an xss attack, otherwise it is Benign",
    )

In [115]:
class XSS_dataset(BaseModel):
    # Top-level schema handed to PydanticOutputParser below: a single
    # "dataset" field holding a list of XSS_row records.
    dataset: List[XSS_row]

In [98]:
class Number(BaseModel):
    # Row schema for an even/odd dataset: an integer and its parity label.
    # Not passed to the output parser in the cells shown here.
    Number: int = Field(
        description="an int to be tested if it is even or odd",
    )
    Is_Even: bool = Field(
        description="a boolean that it is True if the number is even, otherwise it is False",
    )

In [99]:
class Joke(BaseModel):
    # Example schema showing a custom field validator: a joke is a question
    # ("setup") plus its answer ("punchline").
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    # You can add custom validation logic easily with Pydantic.
    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        """Reject a setup that does not end with a question mark.

        Args:
            field: the candidate value of ``setup``.

        Returns:
            The value unchanged when it ends with "?".

        Raises:
            ValueError: if the value does not end with "?" (including when it
                is empty).
        """
        # endswith() rather than field[-1]: indexing an empty string raises
        # IndexError, which would escape as a raw exception instead of the
        # ValueError that signals a validation failure.
        if not field.endswith("?"):
            raise ValueError("Badly formed question!")
        return field

In [116]:
# Parser that validates model output against the XSS_dataset schema.
output_parser = PydanticOutputParser(pydantic_object=XSS_dataset)
# Instruction text describing the expected JSON format; the bare name on the
# last line displays it as the cell output.
format_instructions = output_parser.get_format_instructions()
format_instructions

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"dataset": {"title": "Dataset", "type": "array", "items": {"$ref": "#/definitions/XSS_row"}}}, "required": ["dataset"], "definitions": {"XSS_row": {"title": "XSS_row", "type": "object", "properties": {"Payloads": {"title": "Payloads", "description": "a string representing an http get request with payload", "type": "string"}, "Class": {"title": "Class", "description": "a string representing the class of the http get request, it is Malicious if the http get request contains an xss attack, otherwise it is Benign", "type": 

In [117]:
# Display the instruction text loaded from the YAML template; per the output
# below it contains a {rows} placeholder.
template['input']

'The user will provide the initial part of the function (function name, parameters with types, return type and a comment describing the purpose of the function, with some optional example. \nYou are a testing assistant that generates a dataset to test the function provided by the user.\nThe dataset should contain {rows} Malicious examples and {rows} Benign examples. '

In [118]:
# Hand-written single-string prompt for the even/odd variant of the task, with
# {format_instructions} and {input} placeholders. In the visible cells it is
# only referenced by the disabled PromptTemplate snippet further down.
even_input = """The user will provide the initial part of the function (function name, parameters with types, return type and a comment describing the purpose of the function, with some optional example. 
You are a testing assistant that generates a dataset to test the function provided by the user.
The dataset should contain 5 even examples and 5 odd examples. 
\n{format_instructions}\n
Function:
{input}
"""
even_input

'The user will provide the initial part of the function (function name, parameters with types, return type and a comment describing the purpose of the function, with some optional example. \nYou are a testing assistant that generates a dataset to test the function provided by the user.\nThe dataset should contain 5 even examples and 5 odd examples. \n\n{format_instructions}\n\nFunction:\n{input}\n'

In [119]:

# Chat prompt: the system message is the template's instruction text followed
# by the parser's format instructions; the human message is the task text.
prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(template['input'] + "\n{format_instructions}"),
        HumanMessagePromptTemplate.from_template("{input}"),
    ],
    # template['input'] contains a {rows} placeholder, so "rows" is declared
    # next to "input"; {format_instructions} is pre-filled as a partial below.
    input_variables=["input", "rows"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)
# Disabled single-string alternative built on even_input. It is kept as a
# string literal: being the cell's last expression, its text is what the cell
# displays.
"""
prompt = PromptTemplate(
    template=even_input,
    input_variables=["input", "rows"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)
"""

'\nprompt = PromptTemplate(\n    template=even_input,\n    input_variables=["input", "rows"],\n    partial_variables={"format_instructions": output_parser.get_format_instructions()},\n)\n'

In [120]:
# NOTE(review): this name is not read by any other cell shown here — it looks
# like a leftover from building the prompt; confirm before removing.
input_variables=["input"]

In [121]:
# Pipe the prompt into the chat model so one invoke() call formats the prompt
# and sends it to the model.
chain = prompt | model 



In [122]:
# Preview the fully rendered prompt for a small run (rows=5) before anything
# is sent to the model.
print(
    prompt.format(
        input=prompt_parameters["input"],
        rows=5,
    )
)


System: The user will provide the initial part of the function (function name, parameters with types, return type and a comment describing the purpose of the function, with some optional example. 
You are a testing assistant that generates a dataset to test the function provided by the user.
The dataset should contain 5 Malicious examples and 5 Benign examples. 
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"dataset": {"title": "Dataset", "type": "array", "items": {"$ref": "#/definitions/XSS_row"}}}, "required": ["dataset"], "definitions": {"XSS_row": {"title": "XSS_row", 

In [123]:
# Run the chain with the full parameter dict, which supplies the prompt's
# variables.
response = chain.invoke(prompt_parameters)
# Disabled call with a joke query, unrelated to the XSS task:
#response = chain.invoke({"query": "Tell me a joke."})

In [124]:
# Inspect the raw reply: an AIMessage whose content is a fenced JSON block.
response

AIMessage(content='```json\n{\n  "dataset": [\n    {\n      "Payloads": "/search?q=<script>alert(\'XSS\')</script>",\n      "Class": "Malicious"\n    },\n    {\n      "Payloads": "/index.html?user=<img src=x onerror=alert(\'XSS\')>",\n      "Class": "Malicious"\n    },\n    {\n      "Payloads": "/contact?name=<body onload=alert(\'XSS\')>",\n      "Class": "Malicious"\n    },\n    {\n      "Payloads": "/login?username=admin&password=<svg/onload=alert(\'XSS\')>",\n      "Class": "Malicious"\n    },\n    {\n      "Payloads": "/about?comment=<iframe src=javascript:alert(\'XSS\')>",\n      "Class": "Malicious"\n    },\n    {\n      "Payloads": "/search?q=summer%20vacations",\n      "Class": "Benign"\n    },\n    {\n      "Payloads": "/index.html?user=JohnDoe",\n      "Class": "Benign"\n    },\n    {\n      "Payloads": "/contact?name=Jane+Doe&email=jane.doe@example.com",\n      "Class": "Benign"\n    },\n    {\n      "Payloads": "/login?username=user&password=pass123",\n      "Class": "Benig

In [125]:
# Parse the reply into an XSS_dataset instance (a list of XSS_row records).
output_parser.invoke(response)

XSS_dataset(dataset=[XSS_row(Payloads="/search?q=<script>alert('XSS')</script>", Class='Malicious'), XSS_row(Payloads="/index.html?user=<img src=x onerror=alert('XSS')>", Class='Malicious'), XSS_row(Payloads="/contact?name=<body onload=alert('XSS')>", Class='Malicious'), XSS_row(Payloads="/login?username=admin&password=<svg/onload=alert('XSS')>", Class='Malicious'), XSS_row(Payloads="/about?comment=<iframe src=javascript:alert('XSS')>", Class='Malicious'), XSS_row(Payloads='/search?q=summer%20vacations', Class='Benign'), XSS_row(Payloads='/index.html?user=JohnDoe', Class='Benign'), XSS_row(Payloads='/contact?name=Jane+Doe&email=jane.doe@example.com', Class='Benign'), XSS_row(Payloads='/login?username=user&password=pass123', Class='Benign'), XSS_row(Payloads='/about?comment=Great+site%21+Loved+the+content.', Class='Benign')])