In [1]:
import pandas as pd
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from dotenv import dotenv_values
import sys
sys.path.insert(0,'/workspaces/RAG_secure_code_generation/src')
from utils.utils import load_yaml, init_argument_parser, sanitize_output, fill_default_parameters
from langchain.prompts import (
    ChatPromptTemplate, PromptTemplate
)
from utils.openai_utils import is_openai_model, build_chat_model
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
import random
import numpy as np
from functools import partial
from typing import List
from langchain.embeddings import OpenAIEmbeddings


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import GrobidParser
from langchain.docstore.document import Document
from langchain_core.embeddings import Embeddings


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One fixed seed drives both Python's `random` module and NumPy's global
# generator so that any sampling done with them is repeatable across runs.
seed = 156
random.seed(seed)
np.random.seed(seed)

In [3]:
# Notebook configuration. Every data path is relative to the notebook's
# working directory so the notebook is not tied to one workspace location.
template_file = "../data/templates/complete_function_readable.yaml"
rag_template_file = "../data/rag_templates/basic_rag_suffix.txt"
task_file = "../data/tasks/detect_xss_simple_prompt.txt"
parameters_file = "../data/prompt_parameters/empty.yaml"
# Previously an absolute "/workspaces/..." path; now follows the same
# "../data" convention as the files above.
# NOTE(review): assumes "../data" is the repository's data folder - confirm.
papers_folder = "../data/papers"
# Chat model identifier passed to ChatOpenAI.
model_name = "gpt-3.5-turbo-0613"

In [4]:
# Collect the dotenv settings into `env`; the OpenAI API key is read from
# this mapping (env['OPENAI_API_KEY']) when the chat model is created.
env = dotenv_values()

In [5]:
# The prompt template and the parameter overrides both come from YAML files.
template = load_yaml(template_file)
prompt_parameters = load_yaml(parameters_file)

# The task description is a plain-text file; its content becomes the "input"
# prompt parameter.
with open(task_file) as task_handle:
    prompt_parameters["input"] = task_handle.read()

# Merge in the template's "default_parameters" section
# (presumably only for keys not already set - see utils.fill_default_parameters).
prompt_parameters = fill_default_parameters(
    prompt_parameters,
    template["default_parameters"],
)

use_openai_api = is_openai_model(model_name)

# The chat model gets its API key explicitly from the dotenv values.
openai_key = env['OPENAI_API_KEY']
model = ChatOpenAI(
    model=model_name,
    temperature=1,
    openai_api_key=openai_key,
)

In [6]:
def build_scientific_papers_loader(papers_folder:str)->List[Document]:
    """Load the PDF papers stored in ``papers_folder``.

    A ``GenericLoader`` is created over the folder with ``glob="*"`` and
    restricted to the ``.pdf`` suffix, then loaded immediately.

    Args:
        papers_folder: Path of the directory containing the papers.

    Returns:
        The documents produced by the loader's ``load()`` call.
    """
    # No explicit parser is passed, so the loader's default is used.
    # GrobidParser(segment_sentences=False) was an alternative that is
    # currently disabled.
    return GenericLoader.from_filesystem(
        papers_folder,
        glob="*",
        suffixes=[".pdf"],
    ).load()

In [7]:
def build_documents_retriever(docs:List[Document],
                              embeddings:Embeddings,
                              chunk_size:int=1000,
                              chunk_overlap:int=200):
    """Split documents into chunks, index them in Chroma and return a retriever.

    Args:
        docs: Documents to index.
        embeddings: Embedding model handed to the Chroma vector store.
        chunk_size: ``chunk_size`` given to the recursive character splitter.
        chunk_overlap: ``chunk_overlap`` given to the recursive character splitter.

    Returns:
        The retriever obtained from the Chroma store via ``as_retriever()``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(docs)
    store = Chroma.from_documents(documents=chunks, embedding=embeddings)
    return store.as_retriever()

In [8]:
# Load the papers and index them for retrieval.
docs = build_scientific_papers_loader(papers_folder)
# The API key was read with dotenv_values(), which only returns the parsed
# values, so it is passed explicitly here - the same way ChatOpenAI receives
# it - instead of relying on an OPENAI_API_KEY environment variable.
retriever = build_documents_retriever(
    docs,
    embeddings=OpenAIEmbeddings(openai_api_key=openai_key),
)

RuntimeError: Directory 'static/' does not exist