In [1]:
!pip install nltk
!pip install langchain
!pip install scikit-learn
!pip install openai
!pip install chromadb
!pip install tiktoken
!pip install gitpython
!pip install -U langchain
!pip install bs4
!pip install sumy
!pip install sentence_transformers

Collecting langchain
  Downloading langchain-0.0.234-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 0.0.232
    Uninstalling langchain-0.0.232:
      Successfully uninstalled langchain-0.0.232
Successfully installed langchain-0.0.234


In [2]:
!pip install einops
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [102]:
import os
import subprocess
import random
import time
import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import spacy
import requests
import numpy as np
import pandas as pd

from langchain.document_loaders import WebBaseLoader, GitLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.tools import BaseTool
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.agents import AgentOutputParser
from langchain import PromptTemplate
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from langchain.schema import AgentAction, AgentFinish, OutputParserException

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

from typing import List, Union, Optional, Type
from pydantic import root_validator, BaseModel, Field, ValidationError, validator
import openai

In [2]:
OPENAI_API_KEY = ["LIST_OF_YOUR_OPENAI_API_KEYS"]
GOOGLE_API_KEY = "YOUR_GOOGLE_API_KEY"
GOOGLE_CX = "YOUR_SEARCH_ENGINE_CX"
os.environ["SERPER_API_KEY"] = "YOUR_SERPER_API_KEY"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "YOUR_HUGGINGFACE_KEY"

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download("punkt")
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

from langchain import HuggingFaceHub
# repo_id = "tiiuae/falcon-7b"
# llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature": 0.5, "max_length": 64})
llm = OpenAI(openai_api_key=OPENAI_API_KEY[random.randint(0, len(OPENAI_API_KEY)-1)])
nlp = spacy.load("en_core_web_md")

[nltk_data] Downloading package stopwords to /home/vijay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vijay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/vijay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Creating Code Store

Use the GithubLoader to Load the Repository to get the documents.
They store it into ChromaDB.

In [146]:
class CodeStore:

    def __init__(
            self,
            openai_api_key: str,
            github_url: str,
            local_path: str,
            branch: str
    ):
        self.github_url = github_url
        self.local_path = local_path
        self.branch = branch
        self.OPENAI_API_KEY = openai_api_key
        self.documents = None
        last_slash_index = github_url.rfind("/")
        last_suffix_index = github_url.rfind(".git")
        if last_suffix_index < 0:
            last_suffix_index = len(github_url)
        self.persist_directory = github_url[last_slash_index+1 : last_suffix_index]
        self.embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.code_store = None
        self.question_answer = None
        self.chat_history = []

    def generate_documents(self):
        self.clear_files()
        loader = GitLoader(
            clone_url = self.github_url,
            repo_path = self.local_path,
            branch = self.branch
        )

        docs = loader.load()

        text_splitter = CharacterTextSplitter(
            chunk_size = 1500,
            chunk_overlap = 0
        )

        self.documents = text_splitter.split_documents(docs)
        # documents are generated

    def create_vector_store(self):
        self.code_store = Chroma.from_documents(
            documents=self.documents,
            embedding=self.embedding_function,
            persist_directory=self.persist_directory,
        )

        prompt = PromptTemplate.from_template(
            """
                Question: {question}
                Instructions: \n
                    Respond to the human as helpfully and accurately as possible. You can look into the github repository, YOU HAVE BEEN GIVEN ACCESS TO ALL FILES.

                    Output Format must be:
                        Explanation : <Give Explanation here> \n
                        Code: <give the relevant code here>\n

                Example:
                    OUTPUT:
                        EXPLANATION: This folder contains ...
                        Code:
                            ```
                            def function_Get():
                                pass
                            ```

                    OUTPUT:
                        EXPLANATION: The function present in the code ...
                        Code:
                            ```
                            def myfunction():
                                pass
                            ```

                    OUTPUT:
                        EXPLANATION: This Algorithm is used to ...
                        Code:
                            ```
                            def algorith():
                                pass
                            ```
            """
        )

        self.code_store.persist()

        self.question_answer = ConversationalRetrievalChain.from_llm(
            ChatOpenAI(
                openai_api_key = self.OPENAI_API_KEY,
                temperature = 0
            ),
            self.code_store.as_retriever(),
            return_source_documents = True,
            condense_question_prompt=prompt
            # get_chat_history = self.get_chat_history
        )

    # def get_chat_history(inputs) -> str:
    #     res = []
    #     for human, ai in inputs:
    #         res.append(f"Human:{human}\nAI:{ai}")
    #     return "\n".join(res)

    def search_code(self, query: str):
        self.chat_history = []
        result = self.question_answer({
            "question": query,
            "chat_history": self.chat_history
        })
        self.chat_history.append((query,result["answer"]))

        return result

    def clear_files(self):
        try:
            subprocess.run(['rm', '-r', self.persist_directory], check=True)
            subprocess.run(['rm', '-r', self.local_path], check=True)
            return True
        except subprocess.CalledProcessError as e:
            print(f"Failed to remove Code Store or Cloned Repository: {e}")
            return False

    def clear_history(self):
        self.chat_history = []

    def reinitialize(self, openai_api_key: str, github_url: str, local_path: str, branch: str):
        self.github_url = github_url
        self.local_path = local_path
        self.branch = branch
        self.OPENAI_API_KEY = openai_api_key
        self.documents = None
        last_slash_index = github_url.rfind("/")
        last_suffix_index = github_url.rfind(".git")
        if last_suffix_index < 0:
            last_suffix_index = len(github_url)
        self.persist_directory = github_url[last_slash_index+1 : last_suffix_index]
        self.embedding_function = OpenAIEmbeddings(openai_api_key = openai_api_key)
        self.code_store = None
        self.question_answer = None
        self.chat_history = []


In [147]:
cr = CodeStore(
    openai_api_key=OPENAI_API_KEY[random.randint(0, len(OPENAI_API_KEY)-1)],
    github_url="https://github.com/iSmart-Research-Scholar/Author-Profiling-API",
    local_path="./repo",
    branch="master"
)

In [148]:
cr.generate_documents()

In [149]:
cr.create_vector_store()

In [139]:
# cr.clear_files()

In [9]:
res = cr.search_code(query="The code provided is a Django Application. Look into the file author_profiling/views.py and tell me in brief description about the method used to check topic relevance. Also tell me the code present in that function.")
print(res)



In [10]:
print("Question :", res["question"])
print("Answer : ", res["answer"])
print("Source : ", res["source_documents"][0].page_content, res["source_documents"][1].page_content)

Question : The code provided is a Django Application. Look into the file author_profiling/views.py and tell me in brief description about the method used to check topic relevance. Also tell me the code present in that function.
Answer :  The method used to check topic relevance is called `check_topic_relevance`. It takes two parameters: `keywords` and `publication_topics`. 

Here is the code present in the `check_topic_relevance` function:

```python
def check_topic_relevance(keywords, publication_topics):
    text_tokens = list(keywords)
    publication_topics = list(publication_topics)
    all_stopwords = sp.Defaults.stop_words
    tokens_without_stopwords = [word for word in text_tokens if not word in all_stopwords]
    relevant_topics = [word for word in tokens_without_stopwords for topic in publication_topics if topic == word]
    if len(relevant_topics) == len(tokens_without_stopwords):
        return 0.4
    else:
        return 0.4 * len(relevant_topics) / len(tokens_without_st

In [11]:
a = cr.search_code(query="Look into the file author_profiling/views.py and give approximate weights to the functions citations_per_paper_score, publications_per_year_score, experience_score and check_topic_relevance_score such that the total is 100 percent. You suggest what must be the weights.")
print(a["answer"])

Based on the given code, we can assign approximate weights to the functions as follows:

- citations_per_paper_score: 20%
- publications_per_year_score: 30%
- experience_score: 25%
- check_topic_relevance: 25%

Please note that these weights are just suggestions and can be adjusted based on the specific requirements and priorities of the application.


# VulnerabilityStore


In [150]:
class VulnerabilityStore:

    def __init__(self, openai_api_key:list):

        self.OPENAI_API_KEY = openai_api_key
        self.documents = []
        self.summarizer_llm = HuggingFaceHub(
            repo_id="sshleifer/distilbart-cnn-12-6", model_kwargs={"temperature": 0, "max_length": 10000}
        )
        self.similarity_model = SentenceTransformer('paraphrase-distilroberta-base-v2')
        self.embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.start_urls = [
            "https://owasp.org/Top10/A01_2021-Broken_Access_Control",
            "https://owasp.org/Top10/A02_2021-Cryptographic_Failures",
            "https://owasp.org/Top10/A03_2021-Injection",
            "https://owasp.org/Top10/A04_2021-Insecure_Design",
            "https://owasp.org/Top10/A05_2021-Security_Misconfiguration",
            "https://owasp.org/Top10/A06_2021-Vulnerable_and_Outdated_Components",
            "https://owasp.org/Top10/A07_2021-Identification_and_Authentication_Failures",
            "https://owasp.org/Top10/A08_2021-Software_and_Data_Integrity_Failures",
            "https://owasp.org/Top10/A09_2021-Security_Logging_and_Monitoring_Failures",
            "https://owasp.org/Top10/A10_2021-Server-Side_Request_Forgery_%28SSRF%29"
        ]
        self.labels = []
        self.persist_directories = []
        self.metadata = []
        self.stores: list[Chroma] = []
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.4; Win64; x64) Gecko/20130401 Firefox/60.7",
            "Mozilla/5.0 (Linux; Linux i676 ; en-US) AppleWebKit/602.37 (KHTML, like Gecko) Chrome/53.0.3188.182 Safari/533",
            "Mozilla/5.0 (Android; Android 6.0.1; SM-G928S Build/MDB08I) AppleWebKit/536.43 (KHTML, like Gecko)  Chrome/53.0.1961.231 Mobile Safari/602.4",
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_1_7) Gecko/20100101 Firefox/48.4",
            "Mozilla/5.0 (Windows NT 10.1; Win64; x64) AppleWebKit/602.6 (KHTML, like Gecko) Chrome/49.0.2764.174 Safari/603",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 9_8_7) Gecko/20130401 Firefox/59.7",
            "Mozilla/5.0 (Linux; U; Android 5.0; LG-D332 Build/LRX22G) AppleWebKit/602.2 (KHTML, like Gecko)  Chrome/54.0.2014.306 Mobile Safari/600.5",
            "Mozilla/5.0 (Android; Android 5.0.2; HTC 80:number1-2w Build/LRX22G) AppleWebKit/600.31 (KHTML, like Gecko)  Chrome/54.0.3164.157 Mobile Safari/603.5",
            "Mozilla/5.0 (Linux; Android 7.1; Nexus 8P Build/NPD90G) AppleWebKit/601.15 (KHTML, like Gecko)  Chrome/47.0.3854.218 Mobile Safari/600.6",
            "Mozilla/5.0 (Android; Android 5.1; SM-G928L Build/LRX22G) AppleWebKit/602.47 (KHTML, like Gecko)  Chrome/48.0.1094.253 Mobile Safari/537.6",
        ]
        self.question_answer = []

    def get_links_from_url(self, url: str):
        req = urllib.request.Request(url, headers={
            "User-Agent": self.user_agents[random.randint(0, 9)]
        })
        html_page = urllib.request.urlopen(req)
        soup = BeautifulSoup(html_page, 'html.parser')
        links = []
        for link in soup.findAll('a'):
            links.append(urljoin(url, link.get('href')))
        return links

    def expand_start_urls(self):
        to = len(self.start_urls)
        for l in range(to):
            try:
                if len(self.start_urls) > 0:
                    break
                newlinks = self.get_links_from_url(url = self.start_urls[l])
                checked_links = []
                for url in newlinks:
                    if len(url) < 10:
                        continue
                    if url[-1] == "/":
                        url = url[:-1]
                    checked_links.append(url)
                checked_links = list(set(checked_links))
                # for i in range(len(checked_links)):
                    # self.labels.append("vulnerability_related")
                self.start_urls.extend(checked_links)
                # if len(self.labels) != len(self.start_urls):
                #     print("Gadbad :)")
            except Exception:
                # print("URL : ", self.start_urls[l], "Error: ", e)
                continue

    def get_documents(self):
        for url in self.start_urls:
            try:
                loader = WebBaseLoader(web_path=url)
                loader.default_parser = "html.parser"
                doc = loader.load()

                parser = HtmlParser.from_url(url, Tokenizer("english"))
                summarizer = LuhnSummarizer()
                summary = str(summarizer(parser.document, sentences_count=4))
                summary = summary.replace("<", "").replace(">", "").replace("Sentence:", "")
                for d in doc:
                    if d.metadata is None:
                        d.metadata = {}
                    d.metadata["summary"] = summary.replace("<", "").replace(">", "").replace("Sentence:", "")
                self.documents.append(doc)
                self.metadata.append(summary)

            except Exception as e:
                print("URL : ", url, "Error : ", e)
                continue

    @staticmethod
    def clear_vector_stores():
        try:
            subprocess.run(["rm", "-r", "./vectorstores"])
            print("Removed vectorstores")
        except subprocess.CalledProcessError:
            print("Failure")


    def create_vector_stores(self):
        # self.clear_vector_stores()
        l = len(self.documents)
        p = 0
        for i in range(l):
            try:
                self.stores.append(
                    Chroma.from_documents(
                        documents=self.documents[i],
                        embedding=self.embedding_function,
                        persist_directory=self.persist_directories[i]
                    )
                )
            except Exception as e:
                print(e, self.documents[i])
                p = p+1
                continue

            self.stores[i-p].persist()
            prompt = PromptTemplate.from_template(
            "Question: {question}"
            "Instructions: \n"
            "Output Format must be:"
            "Explanation : <Give Explanation here> \n"
            "Code: <give the relevant code here>\n"
            "Summary: <give summary here>"
            )
            # make qa
            self.question_answer.append(ConversationalRetrievalChain.from_llm(
                ChatOpenAI(
                    openai_api_key = self.OPENAI_API_KEY[random.randint(0,len(self.OPENAI_API_KEY)-1)],
                    temperature = 0
                ),
                retriever=(self.stores[i-p]).as_retriever(search_kwargs={"k": 1}), condense_question_prompt = prompt
            ))

    def add_url(self, url: str, label: str):
        loader = WebBaseLoader(web_path=url)
        loader.default_parser = "html.parser"
        doc = loader.load()

        if len(doc) == 0:
            return

        parser = HtmlParser.from_url(url, Tokenizer("english"))

        ## LUHN method
        summarizer = LuhnSummarizer()
        summary = str(summarizer(parser.document, sentences_count=4))
        summary = summary.replace("<", "").replace(">", "").replace("Sentence:", "")

        for d in doc:
            try:
                if d.metadata is None or d.metadata == {}:
                    d.metadata = {"error": "metadata was null"}
                d.metadata["summary"] = summary.replace("<", "").replace(">", "").replace("Sentence:", "")
                d.metadata["url"] = url
            except Exception:
                pass
                # print(e)
                # print(d)

        topic_exists = False
        for i in range(len(self.metadata)):
            qe = self.similarity_model.encode(doc[0].metadata["summary"])
            me = self.similarity_model.encode(self.metadata[i])
            score = qe.dot(me) / (np.linalg.norm(qe) * np.linalg.norm(me))
            if score > 0.6:
                topic_exists = True
                print("merged")
                try:
                    for d in doc:
                        self.stores[i].add_documents(d)
                    self.metadata[i] += " " + summary
                except Exception:
                    pass
                    # print(score, i, self.persist_directories[i])
                break

        if not topic_exists:
            per = ""
            if url[-1] == "/":
                url = url[:-1]
            slash = url.rfind("/")
            if slash < 0:
                ext = url.rfind(".")
                per = "./vectorstores/" + url[ext+1:]
            if slash > 0:
                per = "./vectorstores/" + url[slash+1:]
            self.stores.append(
                Chroma.from_documents(
                    documents=doc,
                    embedding=self.embedding_function,
                    persist_directory=per
                )
            )
            self.labels.append(label)
            self.stores[-1].persist()
            self.documents.append(doc)
            self.metadata.append(summary)
            self.persist_directories.append(per)

            prompt = PromptTemplate.from_template(
            "Question: {question}"
            "Instructions: \n"
            "Output Format must be:\n"
            "Explanation : <Give Explanation here> \n"
            "Code: <give the relevant code here>\n"
            "Summary: <give summary here>\n"
            ""
            )
            prompt = PromptTemplate.from_template(
                """
                Question: {question}

                Instruction:
                """
            )

                        # make qa
            self.question_answer.append(ConversationalRetrievalChain.from_llm(
                llm = ChatOpenAI(temperature = 0, openai_api_key = self.OPENAI_API_KEY[random.randint(0,len(self.OPENAI_API_KEY)-1)]),
                retriever=self.stores[-1].as_retriever(search_kwargs={"k": 1}),
                max_tokens_limit=4000, condense_question_prompt = prompt
            ))


    def run_vul(self):
        for url in self.start_urls:
            try:
                self.add_url(url = url, label="vulnerability_related")
            except Exception:
                # print("Error in URL : ", url)
                # print("Skip due to error : ", e)
                continue

            # print(len(self.metadata), len(self.documents), len(self.stores), len(self.persist_directories), len(self.question_answer))
            print("URL : ", url)
            print("Store : ", self.persist_directories[-1])
            # print("Metadata : ", self.metadata[-1])


    def search(self, query:str, label:str):
        similarity_scores = []
        l = len(self.metadata)
        for i in range(l):
            # if store or metadata is not labelled related to the label mentioned then skip
            if self.labels[i] != label:
                similarity_scores.append(0)
                continue

            qe = self.similarity_model.encode(self.metadata[i])
            me = self.similarity_model.encode(query)

            score = qe.dot(me) / (np.linalg.norm(qe) * np.linalg.norm(me))
            similarity_scores.append(score)

        solutions = []
        for i in range(l):
            if similarity_scores[i] > 0.5:
                solutions.append(i)

        outputs = []
        chat_history = []
        if len(solutions) == 0:
            url = "https://www.googleapis.com/customsearch/v1"
            params = {
                "key": "YOUR_GOOGLE_API_KEY",
                "cx": "YOUR_SEARCH_ENGINE_CX_KEY",
                "q": query,
                "num": 10,
                "fields": "items(link)"
            }
            response = requests.get(url, params=params)

            if response.status_code == 200:
                results = response.json()["items"]
                urls = [result["link"] for result in results]
                text_urls = []
                for url in urls:
                    try:
                        response = requests.get(url)
                        if response.status_code == 200 and 'text' in response.headers['Content-Type']:
                            text_urls.append(url)
                        elif response.status_code == 200 and 'text' in response.content.decode('utf-8'):
                            text_urls.append(url)
                    except:
                        pass
                stprev = len(self.stores)
                print(text_urls)
                for i in text_urls:
                    try:
                        self.add_url(i, label)
                        self.start_urls.append(i)
                    except Exception as e:
                        print("URL", i, e)
                stnext = len(self.stores)

                #  label new stores generated with the label provided
                # for i in range(stnext - stprev):
                #     self.labels.append(label)

                try:
                    l = len(self.question_answer)
                    for i in range(l- (stnext - stprev), l):
                        qa = self.question_answer[i]
                        result = qa({"question": query, "chat_history": []})
                        outputs.append(result)
                except Exception:
                    pass
                    # print("Gadbad", e, "\n", "LEngth : ", len(self.start_urls), "QA len : ", len(self.question_answer), " sol len ", len(solutions))

            else:
                outputs.append("No search results found.")

        else:
            try:
                for i in solutions:
                    qa = self.question_answer[i]
                    result = qa({"question": query, "chat_history": []})
                    outputs.append(result)
                    chat_history.append(result)
            except Exception:
                pass
                # print(e)

        return outputs


In [151]:
vul = VulnerabilityStore(openai_api_key=OPENAI_API_KEY)

In [152]:
vul.clear_vector_stores()

Removed vectorstores


In [153]:
vul.expand_start_urls()

In [154]:
print(len(vul.start_urls))

10


In [155]:
vul.run_vul()

URL :  https://owasp.org/Top10/A01_2021-Broken_Access_Control
Store :  ./vectorstores/A01_2021-Broken_Access_Control
URL :  https://owasp.org/Top10/A02_2021-Cryptographic_Failures
Store :  ./vectorstores/A02_2021-Cryptographic_Failures
URL :  https://owasp.org/Top10/A03_2021-Injection
Store :  ./vectorstores/A03_2021-Injection
URL :  https://owasp.org/Top10/A04_2021-Insecure_Design
Store :  ./vectorstores/A04_2021-Insecure_Design
URL :  https://owasp.org/Top10/A05_2021-Security_Misconfiguration
Store :  ./vectorstores/A05_2021-Security_Misconfiguration
URL :  https://owasp.org/Top10/A06_2021-Vulnerable_and_Outdated_Components
Store :  ./vectorstores/A06_2021-Vulnerable_and_Outdated_Components
URL :  https://owasp.org/Top10/A07_2021-Identification_and_Authentication_Failures
Store :  ./vectorstores/A07_2021-Identification_and_Authentication_Failures
URL :  https://owasp.org/Top10/A08_2021-Software_and_Data_Integrity_Failures
Store :  ./vectorstores/A08_2021-Software_and_Data_Integrity_F

In [156]:
print(len(vul.metadata), len(vul.stores), len(vul.persist_directories), len(vul.start_urls), len(vul.labels))

10 10 10 10 10


In [157]:
print(vul.persist_directories)

['./vectorstores/A01_2021-Broken_Access_Control', './vectorstores/A02_2021-Cryptographic_Failures', './vectorstores/A03_2021-Injection', './vectorstores/A04_2021-Insecure_Design', './vectorstores/A05_2021-Security_Misconfiguration', './vectorstores/A06_2021-Vulnerable_and_Outdated_Components', './vectorstores/A07_2021-Identification_and_Authentication_Failures', './vectorstores/A08_2021-Software_and_Data_Integrity_Failures', './vectorstores/A09_2021-Security_Logging_and_Monitoring_Failures', './vectorstores/A10_2021-Server-Side_Request_Forgery_%28SSRF%29']


In [20]:
vul.search("bellman ford algorithm", "algo_related")

['https://www.geeksforgeeks.org/bellman-ford-algorithm-dp-23/', 'https://en.wikipedia.org/wiki/Bellman%E2%80%93Ford_algorithm', 'https://www.programiz.com/dsa/bellman-ford-algorithm', 'https://www.javatpoint.com/bellman-ford-algorithm', 'https://arxiv.org/abs/1111.5414', 'https://practice.geeksforgeeks.org/problems/distance-from-the-source-bellman-ford-algorithm/1', 'https://community.cisco.com/t5/routing/bellman-ford-algorithm-and-dijkstras/td-p/1931574', 'https://stackoverflow.com/questions/14785413/can-we-apply-the-bellman-ford-algorithm-to-an-undirected-graph']
merged
merged
merged
URL https://arxiv.org/abs/1111.5414 Expected metadata value to be a str, int, or float, got None
merged
merged


[{'question': 'bellman ford algorithm',
  'chat_history': [],
  'answer': "The Bellman-Ford algorithm is a graph algorithm used to find the shortest path between a source vertex and all other vertices in a weighted directed graph. It can handle negative edge weights, unlike Dijkstra's algorithm. The algorithm works by iteratively relaxing the edges of the graph until it finds the shortest path for each vertex. If there is a negative cycle in the graph, the algorithm can detect it."},
 {'question': 'bellman ford algorithm',
  'chat_history': [],
  'answer': "The Bellman-Ford algorithm is a graph traversal algorithm used to find the shortest path between a source vertex and all other vertices in a weighted graph. It can handle negative edge weights, unlike Dijkstra's algorithm. The algorithm works by iteratively relaxing the edges of the graph until it finds the shortest path. If there is a negative cycle in the graph, the algorithm can detect it."}]

In [21]:
ans = vul.search("Give an explanation for the coin change problem along with a real life example. Also try to mention some code.", label="algo_related")

['https://www.freecodecamp.org/news/coding-interviews-for-dummies-5e048933b82b/', 'https://www.geeksforgeeks.org/coin-change-dp-7/', 'https://www.bloomberg.com/features/2022-the-crypto-story/', 'https://www.geeksforgeeks.org/understanding-the-coin-change-problem-with-dynamic-programming/', 'https://realpython.com/linear-programming-python/', 'https://jobs.netflix.com/culture', 'https://dfpi.ca.gov/crypto-scams/', 'https://www.simplilearn.com/tutorials/data-structure-tutorial/what-is-an-algorithm']
merged


In [22]:
print(ans[0]["answer"])

The coin change problem is a classic algorithmic problem that involves finding the minimum number of coins needed to make a certain amount of change. The problem can be stated as follows: given a set of coin denominations and a target amount, find the minimum number of coins needed to make the target amount.

For example, let's say we have coins of denominations [1, 5, 10, 25] and we want to make a change for 36 cents. The minimum number of coins needed would be 3 (25 + 10 + 1).

Here's a possible approach to solve the coin change problem using dynamic programming in Python:

```python
def coinChange(coins, amount):
    # Create a table to store the minimum number of coins needed for each amount
    dp = [float('inf')] * (amount + 1)
    dp[0] = 0

    # Iterate through all amounts from 1 to the target amount
    for i in range(1, amount + 1):
        # Check each coin denomination
        for coin in coins:
            if coin <= i:
                # Update the minimum number of coins

In [23]:
vul.search("Coin Change Problem", label="algo_related")

[{'question': 'Coin Change Problem',
  'chat_history': [],
  'answer': "The Coin Change Problem is a classic algorithmic problem in computer science. It involves finding the minimum number of coins needed to make a certain amount of change. For example, given a set of coin denominations and a target amount, the goal is to find the minimum number of coins needed to make that amount.\n\nThere are various approaches to solving the Coin Change Problem, including dynamic programming and recursive algorithms. The dynamic programming approach involves building a table to store the minimum number of coins needed for each subproblem, while the recursive approach involves exploring all possible combinations of coins.\n\nIf you have a specific question about the Coin Change Problem, please let me know and I'll do my best to help you."}]

# General Code Store

In [158]:
class GeneralStore:

    def __init__(self, openai_api_key:list):

        self.OPENAI_API_KEY = openai_api_key
        self.documents = []
        self.summarizer_llm = HuggingFaceHub(
            repo_id="sshleifer/distilbart-cnn-12-6", model_kwargs={"temperature": 0, "max_length": 10000}
        )
        self.similarity_model = SentenceTransformer('paraphrase-distilroberta-base-v2')
        self.embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.start_urls = [
            # "https://owasp.org/Top10/A01_2021-Broken_Access_Control",
            # "https://owasp.org/Top10/A02_2021-Cryptographic_Failures",
            # "https://owasp.org/Top10/A03_2021-Injection",
            # "https://owasp.org/Top10/A04_2021-Insecure_Design",
            # "https://owasp.org/Top10/A05_2021-Security_Misconfiguration",
            # "https://owasp.org/Top10/A06_2021-Vulnerable_and_Outdated_Components",
            # "https://owasp.org/Top10/A07_2021-Identification_and_Authentication_Failures",
            # "https://owasp.org/Top10/A08_2021-Software_and_Data_Integrity_Failures",
            # "https://owasp.org/Top10/A09_2021-Security_Logging_and_Monitoring_Failures",
            # "https://owasp.org/Top10/A10_2021-Server-Side_Request_Forgery_%28SSRF%29"
        ]
        self.labels = []
        self.persist_directories = []
        self.metadata = []
        self.stores: list[Chroma] = []
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.4; Win64; x64) Gecko/20130401 Firefox/60.7",
            "Mozilla/5.0 (Linux; Linux i676 ; en-US) AppleWebKit/602.37 (KHTML, like Gecko) Chrome/53.0.3188.182 Safari/533",
            "Mozilla/5.0 (Android; Android 6.0.1; SM-G928S Build/MDB08I) AppleWebKit/536.43 (KHTML, like Gecko)  Chrome/53.0.1961.231 Mobile Safari/602.4",
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_1_7) Gecko/20100101 Firefox/48.4",
            "Mozilla/5.0 (Windows NT 10.1; Win64; x64) AppleWebKit/602.6 (KHTML, like Gecko) Chrome/49.0.2764.174 Safari/603",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 9_8_7) Gecko/20130401 Firefox/59.7",
            "Mozilla/5.0 (Linux; U; Android 5.0; LG-D332 Build/LRX22G) AppleWebKit/602.2 (KHTML, like Gecko)  Chrome/54.0.2014.306 Mobile Safari/600.5",
            "Mozilla/5.0 (Android; Android 5.0.2; HTC 80:number1-2w Build/LRX22G) AppleWebKit/600.31 (KHTML, like Gecko)  Chrome/54.0.3164.157 Mobile Safari/603.5",
            "Mozilla/5.0 (Linux; Android 7.1; Nexus 8P Build/NPD90G) AppleWebKit/601.15 (KHTML, like Gecko)  Chrome/47.0.3854.218 Mobile Safari/600.6",
            "Mozilla/5.0 (Android; Android 5.1; SM-G928L Build/LRX22G) AppleWebKit/602.47 (KHTML, like Gecko)  Chrome/48.0.1094.253 Mobile Safari/537.6",
        ]
        self.question_answer = []

    def get_links_from_url(self, url: str):
        req = urllib.request.Request(url, headers={
            "User-Agent": self.user_agents[random.randint(0, 9)]
        })
        html_page = urllib.request.urlopen(req)
        soup = BeautifulSoup(html_page, 'html.parser')
        links = []
        for link in soup.findAll('a'):
            links.append(urljoin(url, link.get('href')))
        return links

    def expand_start_urls(self):
        to = len(self.start_urls)
        for l in range(to):
            try:
                if len(self.start_urls) > 0:
                    break
                newlinks = self.get_links_from_url(url = self.start_urls[l])
                checked_links = []
                for url in newlinks:
                    if len(url) < 10:
                        continue
                    if url[-1] == "/":
                        url = url[:-1]
                    checked_links.append(url)
                checked_links = list(set(checked_links))
                # for i in range(len(checked_links)):
                    # self.labels.append("vulnerability_related")
                self.start_urls.extend(checked_links)
                # if len(self.labels) != len(self.start_urls):
                #     print("Gadbad :)")
            except Exception:
                # print("URL : ", self.start_urls[l], "Error: ", e)
                continue

    def get_documents(self):
        for url in self.start_urls:
            try:
                loader = WebBaseLoader(web_path=url)
                loader.default_parser = "html.parser"
                doc = loader.load()

                parser = HtmlParser.from_url(url, Tokenizer("english"))
                summarizer = LuhnSummarizer()
                summary = str(summarizer(parser.document, sentences_count=4))
                summary = summary.replace("<", "").replace(">", "").replace("Sentence:", "")
                for d in doc:
                    if d.metadata is None:
                        d.metadata = {}
                    d.metadata["summary"] = summary.replace("<", "").replace(">", "").replace("Sentence:", "")
                self.documents.append(doc)
                self.metadata.append(summary)

            except Exception as e:
                print("URL : ", url, "Error : ", e)
                continue

    @staticmethod
    def clear_vector_stores():
        try:
            subprocess.run(["rm", "-r", "./generalstore"])
            print("Removed generalstore")
        except subprocess.CalledProcessError:
            print("Failure")


    def create_vector_stores(self):
        # self.clear_vector_stores()
        l = len(self.documents)
        p = 0
        for i in range(l):
            try:
                self.stores.append(
                    Chroma.from_documents(
                        documents=self.documents[i],
                        embedding=self.embedding_function,
                        persist_directory=self.persist_directories[i]
                    )
                )
            except Exception as e:
                print(e, self.documents[i])
                p = p+1
                continue

            self.stores[i-p].persist()
            prompt = PromptTemplate.from_template(
            "Question: {question}"
            "Instructions: \n"
            "Output Format must be:"
            "Explanation : <Give Explanation here> \n"
            "Code: <give the relevant code here>\n"
            "Summary: <give summary here>"
            )
            # make qa
            self.question_answer.append(ConversationalRetrievalChain.from_llm(
                ChatOpenAI(
                    openai_api_key = self.OPENAI_API_KEY[random.randint(0,len(self.OPENAI_API_KEY)-1)],
                    temperature = 0
                ),
                retriever=(self.stores[i-p]).as_retriever(search_kwargs={"k": 1}), condense_question_prompt = prompt
            ))

    def add_url(self, url: str, label: str):
        loader = WebBaseLoader(web_path=url)
        loader.default_parser = "html.parser"
        doc = loader.load()

        if len(doc) == 0:
            return

        parser = HtmlParser.from_url(url, Tokenizer("english"))

        ## LUHN method
        summarizer = LuhnSummarizer()
        summary = str(summarizer(parser.document, sentences_count=4))
        summary = summary.replace("<", "").replace(">", "").replace("Sentence:", "")

        for d in doc:
            try:
                if d.metadata is None or d.metadata == {}:
                    d.metadata = {"error": "metadata was null"}
                d.metadata["summary"] = summary.replace("<", "").replace(">", "").replace("Sentence:", "")
                d.metadata["url"] = url
            except Exception:
                pass
                # print(e)
                # print(d)

        topic_exists = False
        for i in range(len(self.metadata)):
            qe = self.similarity_model.encode(doc[0].metadata["summary"])
            me = self.similarity_model.encode(self.metadata[i])
            score = qe.dot(me) / (np.linalg.norm(qe) * np.linalg.norm(me))
            if score > 0.6:
                topic_exists = True
                print("merged")
                try:
                    for d in doc:
                        self.stores[i].add_documents(d)
                    self.metadata[i] += " " + summary
                except Exception:
                    pass
                    # print(score, i, self.persist_directories[i])
                break

        if not topic_exists:
            per = ""
            if url[-1] == "/":
                url = url[:-1]
            slash = url.rfind("/")
            if slash < 0:
                ext = url.rfind(".")
                per = "./generalstore/" + url[ext+1:]
            if slash > 0:
                per = "./generalstore/" + url[slash+1:]
            self.stores.append(
                Chroma.from_documents(
                    documents=doc,
                    embedding=self.embedding_function,
                    persist_directory=per
                )
            )
            self.labels.append(label)
            self.stores[-1].persist()
            self.documents.append(doc)
            self.metadata.append(summary)
            self.persist_directories.append(per)

            prompt = PromptTemplate.from_template(
            "Question: {question}"
            "Instructions: \n"
            "Output Format must be:\n"
            "Explanation : <Give Explanation here> \n"
            "Code: <give the relevant code here>\n"
            "Summary: <give summary here>\n"
            ""
            )

                        # make qa
            self.question_answer.append(ConversationalRetrievalChain.from_llm(
                llm = ChatOpenAI(temperature = 0, openai_api_key = self.OPENAI_API_KEY[random.randint(0,len(self.OPENAI_API_KEY)-1)]),
                retriever=self.stores[-1].as_retriever(search_kwargs={"k": 1}),
                max_tokens_limit=4000, condense_question_prompt = prompt
            ))


    def run_vul(self):
        for url in self.start_urls:
            try:
                self.add_url(url = url, label="vulnerability_related")
            except Exception:
                # print("Error in URL : ", url)
                # print("Skip due to error : ", e)
                continue

            # print(len(self.metadata), len(self.documents), len(self.stores), len(self.persist_directories), len(self.question_answer))
            print("URL : ", url)
            print("Store : ", self.persist_directories[-1])
            # print("Metadata : ", self.metadata[-1])


    def search(self, query:str, label:str):
        similarity_scores = []
        l = len(self.metadata)
        for i in range(l):
            # if store or metadata is not labelled related to the label mentioned then skip
            if self.labels[i] != label:
                similarity_scores.append(0)
                continue

            qe = self.similarity_model.encode(self.metadata[i])
            me = self.similarity_model.encode(query)

            score = qe.dot(me) / (np.linalg.norm(qe) * np.linalg.norm(me))
            similarity_scores.append(score)

        solutions = []
        for i in range(l):
            if similarity_scores[i] > 0.5:
                solutions.append(i)

        outputs = []
        chat_history = []
        if len(solutions) == 0:
            url = "https://www.googleapis.com/customsearch/v1"
            params = {
                "key": "YOUR_GOOGLE_API_KEY",
                "cx": "YOUR_SEARCH_ENGINE_CX_KEY",
                "q": query,
                "num": 10,
                "fields": "items(link)"
            }
            response = requests.get(url, params=params)

            if response.status_code == 200:
                results = response.json()["items"]
                urls = [result["link"] for result in results]
                text_urls = []
                for url in urls:
                    try:
                        response = requests.get(url)
                        if response.status_code == 200 and 'text' in response.headers['Content-Type']:
                            text_urls.append(url)
                        elif response.status_code == 200 and 'text' in response.content.decode('utf-8'):
                            text_urls.append(url)
                    except:
                        pass
                stprev = len(self.stores)
                print(text_urls)
                for i in text_urls:
                    try:
                        self.add_url(i, label)
                        self.start_urls.append(i)
                    except Exception as e:
                        print("URL", i, e)
                stnext = len(self.stores)

                #  label new stores generated with the label provided
                # for i in range(stnext - stprev):
                #     self.labels.append(label)

                try:
                    l = len(self.question_answer)
                    for i in range(l- (stnext - stprev), l):
                        qa = self.question_answer[i]
                        result = qa({"question": query, "chat_history": []})
                        outputs.append(result)
                except Exception:
                    pass

            else:
                outputs.append("No search results found.")

        else:
            try:
                for i in solutions:
                    qa = self.question_answer[i]
                    result = qa({"question": query, "chat_history": []})
                    outputs.append(result)
                    chat_history.append(result)
            except Exception:
                pass

        return outputs

In [159]:
gs = GeneralStore(openai_api_key=OPENAI_API_KEY)

In [53]:
gs.search("How to get Started with Jira", "general_chat")

[{'question': 'How to get Started with Jira',
  'chat_history': [],
  'answer': "To get started with Jira, you can follow these steps:\n\n1. Sign up for a Jira account: Visit the Atlassian website and sign up for a Jira account. You can choose between Jira Software, Jira Service Management, or Jira Work Management, depending on your needs.\n\n2. Set up your project: Once you have your Jira account, you can create a new project. Define the project name, key, and project template. You can choose from various templates based on your project type.\n\n3. Create issues: In Jira, work is tracked through issues. Create issues to represent tasks, bugs, or any other work items that need to be completed. Assign the issues to team members and set priorities.\n\n4. Customize your workflow: Jira allows you to customize your workflow to match your team's processes. Add statuses, transitions, and conditions to reflect how work moves through different stages.\n\n5. Collaborate and track progress: Use J

# Classifier

In [160]:
dataset = pd.read_csv("code_chk_data.csv", names=["Query", "Type"])
print(dataset)

                                                 Query                   Type
0    How do you implement a depth-first search algo...      code_algo_related
1     Forced Browsing Protection mechanisms can imp...  vulnerability_related
2     Insecure Deserialization Protection mechanism...  vulnerability_related
3    Can you explain the concept of inheritance in ...      code_algo_related
4    Have you ever taken a mindfulness or meditatio...           general_chat
..                                                 ...                    ...
595   Remote File Inclusion (RFI) Protection mechan...  vulnerability_related
596   Integer Overflow Protection mechanisms can en...  vulnerability_related
597  What are the benefits of using a red-black tre...      code_algo_related
598  How do you traverse a binary tree in preorder,...      code_algo_related
599   API Security vulnerabilities can expose sensi...  vulnerability_related

[600 rows x 2 columns]


In [161]:
class ClassifierModel:
    def __init__(self, data, criteria, model_name="all-MiniLM-L6-v2"):
        self.scaler = None
        self.embeddings = None
        self.classifier = None
        self.Y_test = None
        self.Y_train = None
        self.X_test = None
        self.X_train = None
        self.Y = None
        self.X = None
        self.punctuations = None
        self.nlp = spacy.load("en_core_web_sm")
        self.stop_words = self.nlp.Defaults.stop_words
        # filtered_dataset = data.loc[(data["Type"] == criteria[0]) | (data["Type"] == criteria[1])]
        # self.dataset = filtered_dataset
        self.dataset = data
        self.criteria = criteria
        self.model = SentenceTransformer(model_name)

    def split(self):
        if len(self.X) != len(self.Y):
            print("Length of X not equal to Y", len(self.X), len(self.Y), len(dataset), len(self.embeddings))
            return
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(self.X, self.Y, test_size=0.20)

    def tokenizer(self, sentence):
        words = self.nlp(sentence)
        self.punctuations = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~"
        tokens = [word.lemma_.lower().strip() for word in words]
        tokens = [word for word in tokens if word not in self.stop_words and word not in self.punctuations]
        sentence = " ".join(tokens)
        return sentence

    def tokenize(self):
        self.dataset["Tokenized"] = self.dataset["Query"].apply(self.tokenizer)

    def encoder(self):
        self.embeddings = self.dataset["Tokenized"].apply(self.model.encode)
        self.X = self.embeddings.to_list()
        self.scaler = LabelEncoder().fit(list(dataset["Type"]))
        self.Y = self.scaler.transform(list(dataset["Type"]))

    def train_classifier(self):
        self.classifier = SVC(kernel="rbf", random_state=0)
        self.classifier.fit(self.X_train, self.Y_train)

    def check_test(self):
        Y_predict = self.classifier.predict(self.X_test)
        cm = confusion_matrix(self.Y_test, Y_predict)
        print("Confusion matrix", cm)
        print("Accuracy Score : ", accuracy_score(self.Y_test, Y_predict))
        print("Precision Score : ", precision_score(self.Y_test, Y_predict, average='macro'))
        print("Recall Score : ", recall_score(self.Y_test, Y_predict, average='macro', zero_division=0))

    def predict_classification(self, query: str) -> str:
        tokenized_input = self.tokenizer(sentence=query)
        embedding_input = self.model.encode(sentences=tokenized_input)
        prediction = self.classifier.predict([embedding_input])
        # print(self.scaler.inverse_transform(prediction))
        return self.scaler.inverse_transform(prediction)[0]

In [162]:
classifier = ClassifierModel(data=dataset, criteria=["code_algo_related", "vulnerability_related", "general_chat"])

In [163]:
classifier.tokenize()
classifier.encoder()
classifier.split()
classifier.train_classifier()
classifier.check_test()

Confusion matrix [[38  0  0]
 [ 0 40  0]
 [ 0  0 42]]
Accuracy Score :  1.0
Precision Score :  1.0
Recall Score :  1.0


In [28]:
print(classifier.predict_classification(query="SQL Injection."))

vulnerability_related


In [29]:
classifier.predict_classification(query="Jack and Jill")

'general_chat'

In [30]:
classifier.predict_classification(query="Bellman ford algorithm")

'code_algo_related'

In [31]:
classifier.predict_classification(query="KMP of strings")

'code_algo_related'

In [32]:
classifier.predict_classification(query="#include<bits/stdc++.h>")

'code_algo_related'

In [33]:
classifier.predict_classification(query="dp[i][j]=1+max(dp[i-1][j-1],dp[i-1][j])")

'code_algo_related'

In [34]:
classifier.predict_classification(query="JWT Attack")

'vulnerability_related'

# Tools

In [58]:
# class VulInputSchema(BaseModel):
#     query: str = Field(...)
#     label: str = Field(...)
#
#     @root_validator
#     def label_in(cls, values):
#         l = values["label"]
#         # q = values["query"]
#         if l not in ['code_related', 'algo_related', 'vulnerability_related']:
#             raise ValueError(
#                 "Label not in :",
#                 "1. code_related\n 2. algo_related\n3. vulnerability_related"
#             )
#         return values

In [164]:
class vulSearch(BaseTool):
  name: str = "VulSearch"
  description: str = "Use this tool to search for a Code Vulnerability when provided with a query"
  # args_schema: Type[VulInputSchema] = VulInputSchema

  # def _run(self, query:str, label: str):
  def _run(self, query: str):
      time.sleep(30)
      return vul.search(query=query, label="vulnerability_related")

  def _arun(self) -> NotImplementedError:
      return NotImplementedError("This tool does not implement asynchronous ouputs.")

In [165]:
class codeVectorStoreTool(BaseTool):
    name = "Code Vector Store"
    description = "Use this tool to view and analyze the code."

    def _run(self, query: str):
        time.sleep(30)
        return cr.search_code(query)

    def _arun(self) -> NotImplementedError:
        return NotImplementedError("This tool does not require Async output.")

In [166]:
class generalSearch(BaseTool):
  name: str = "General Chat Search"
  description: str = "Use this tool to search for general information to retrieve from web when provided with a query"
  # args_schema: Type[VulInputSchema] = VulInputSchema

  # def _run(self, query:str, label: str):
  def _run(self, query: str):
      time.sleep(30)
      return gs.search(query=query, label="general_chat")

  def _arun(self) -> NotImplementedError:
      return NotImplementedError("This tool does not implement asynchronous ouputs.")

In [167]:
class classifierTool(BaseTool):
    name = "Query Classifier Tool"
    description = "This is the first tool that you must use to know whether the query is related to code or vulnerability."

    def _run(self, query: str):
        return classifier.predict_classification(query=query)

    def _arun(self) -> NotImplementedError:
        return NotImplementedError("This tool does not have async output option.")

In [168]:
# class CVOutputParser(BaseModel):
#     output: str = Field(description="This is the output of the parser it must include explanation with either code or vulnerability")
#
#     @validator('output')
#     def output_includes(cls, field):
#         if "Explanation:" in field and ("Code:" in field or "Vulnerability:" in field):
#             return field
#         raise ValueError("Output does not include Explanation, Code or Vulnerability.")

In [169]:
# parser = PydanticOutputParser(pydantic_object=CVOutputParser)

In [170]:
# prompt = PromptTemplate(
#     template="Answer the user query.\n{format_instructions}\n{query}\n",
#     input_variables=["query"],
#     partial_variables={"format_instructions": parser.get_format_instructions()},
# )

In [171]:
tools = [classifierTool(), codeVectorStoreTool(), vulSearch(), generalSearch()]

In [172]:
model = ChatOpenAI(
    openai_api_key = OPENAI_API_KEY[random.randint(0, len(OPENAI_API_KEY)-1)],
    # openai_api_key = OPENAI_API_KEY[2],
    temperature = 0
)

In [173]:
from langchain.experimental import load_chat_planner, load_agent_executor, PlanAndExecute

planner = load_chat_planner(model)
executor = load_agent_executor(model, tools, verbose=True)

In [174]:
# agent = initialize_agent(
    # agent = AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    # agent = AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    # tools = tools,
    # llm = model,
    # verbose = True,
    # handle_parsing_errors=True,
    # output_parser = parser
# )
agent = PlanAndExecute(planner=planner, executor=executor, verbose=True)

In [77]:
agent("Tell me about bellman ford algorithm.")



[1m> Entering new PlanAndExecute chain...[0m
steps=[Step(value='Explain the purpose of the Bellman-Ford algorithm.'), Step(value='Describe the basic idea behind the algorithm.'), Step(value='Explain the steps involved in the Bellman-Ford algorithm.'), Step(value='Discuss the time complexity of the algorithm.'), Step(value='Provide an example to illustrate the working of the Bellman-Ford algorithm.'), Step(value="Given the above steps taken, please respond to the user's original question. \n")]

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
{
  "action": "General Chat Search",
  "action_input": {
    "query": "purpose of the Bellman-Ford algorithm"
  }
}[0m

[1m> Finished chain.[0m
*****

Step: Explain the purpose of the Bellman-Ford algorithm.

Response: Action:
{
  "action": "General Chat Search",
  "action_input": {
    "query": "purpose of the Bellman-Ford algorithm"
  }
}

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action

OutputParserException: Could not parse LLM output: The search for an example of the Bellman-Ford algorithm returned the following information:

The Bellman-Ford algorithm is used to find the shortest path between a source node and all other nodes in a weighted graph. Here is an example of how the Bellman-Ford algorithm works:

Let's consider the following graph:

```
   A
  / \
 /   \
B-----C
 \   /
  \ /
   D
```

Each edge in the graph has a weight associated with it. We want to find the shortest path from node A to all other nodes.

1. Initialization:
   - Set the distance of the source node (A) to 0, and the distance of all other nodes to infinity.
   - Set the predecessor of all nodes to null.

2. Relaxation:
   - Iterate through all edges in the graph (in this case, 4 iterations).
   - For each edge (u, v) with weight w, if the distance from the source node to u plus the weight of the edge is less than the current distance to v, update the distance to v and set the predecessor of v to u.

   Iteration 1:
   - Distance[A] = 0 (source node)
   - Distance[B] = Distance[C] = Distance[D] = infinity
   - Predecessor[A] = null

   Iteration 2:
   - Distance[A] = 0
   - Distance[B] = 1 (A -> B)
   - Distance[C] = Distance[D] = infinity
   - Predecessor[B] = A

   Iteration 3:
   - Distance[A] = 0
   - Distance[B] = 1
   - Distance[C] = 2 (A -> B -> C)
   - Distance[D] = infinity
   - Predecessor[C] = B

   Iteration 4:
   - Distance[A] = 0
   - Distance[B] = 1
   - Distance[C] = 2
   - Distance[D] = 3 (A -> B -> C -> D)
   - Predecessor[D] = C

3. Negative Cycle Detection:
   - After the relaxation step, if there are still updates to the distances in any iteration, it indicates the presence of a negative cycle in the graph. In such cases, the Bellman-Ford algorithm cannot find a valid shortest path.

In our example, after the 4th iteration, there are no further updates to the distances. Therefore, the shortest path from node A to all other nodes is:

- A to B with a distance of 1
- A to C with a distance of 2
- A to D with a distance of 3

I hope this example helps illustrate the working of the Bellman-Ford algorithm! Let me know if you have any further questions.

In [88]:
agent("Tell me in detail about Dijkstra algorithm also provide some code and real life example for explanation. You might get the code in ``` . ```.")



[1m> Entering new PlanAndExecute chain...[0m


KeyboardInterrupt: 

In [116]:
agent("The code provided is a Django Application. Look into the file author_profiling/wsgi.py and tell me in brief description about the functions present. Also tell me the code present in that function. In addition, try to figure out vulnerabilities that can be inserted into the give code file(for security purposes). Return NOT MORE THAN 3000 tokens.")



[1m> Entering new PlanAndExecute chain...[0m
steps=[Step(value='Read the contents of the file author_profiling/wsgi.py.'), Step(value='Identify the functions present in the file.'), Step(value='For each function, provide a brief description and the code present in that function.'), Step(value='Analyze the code for potential vulnerabilities.'), Step(value='Summarize the findings in a response.'), Step(value="Given the above steps taken, respond to the user's original question.\n\n")]

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "Code Vector Store",
  "action_input": {
    "query": "author_profiling/wsgi.py"
  }
}
```[0m
Thought:[32;1m[1;3mThe previous step involved using the Code Vector Store tool to read the contents of the file `author_profiling/wsgi.py`. The observation shows that the file is the WSGI (Web Server Gateway Interface) configuration file for the `author_profiling` Django project. It exposes the WSGI callable as a module-lev

{'input': 'The code provided is a Django Application. Look into the file author_profiling/wsgi.py and tell me in brief description about the functions present. Also tell me the code present in that function. In addition, try to figure out vulnerabilities that can be inserted into the give code file(for security purposes). Return NOT MORE THAN 3000 tokens.',
 'output': 'Based on the available information, we were unable to access the content of the `author_profiling/views.py` file. Therefore, we cannot provide a summary of the findings in a response. However, we can provide general guidance on how to analyze code for potential vulnerabilities. Would you like me to proceed with that?'}

In [145]:
agent("The code provided is a Django Application. Look into the github repository (it has functions check_topic_relevance() and citations_per_paper_score() and some other functions as well) and tell me in brief description about the functions present. In addition, try to figure out vulnerabilities that can be inserted into the give code file(for security purposes). Return NOT MORE THAN 3000 tokens. IN CASE YOU ASK SOME QUESTION THE ANSWER IS ALWAYS `YES`.")



[1m> Entering new PlanAndExecute chain...[0m
steps=[Step(value='Read the code file from the provided GitHub repository.'), Step(value='Identify the functions present in the code, including check_topic_relevance() and citations_per_paper_score().'), Step(value='Provide a brief description of each function.'), Step(value='Analyze the code for potential vulnerabilities.'), Step(value='Identify and describe any security vulnerabilities found.'), Step(value="Given the above steps taken, respond to the user's original question.\n\n")]

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
{
  "action": "Code Vector Store",
  "action_input": {
    "query": "Read the code file from the provided GitHub repository."
  }
}[0m

[1m> Finished chain.[0m
*****

Step: Read the code file from the provided GitHub repository.

Response: Action:
{
  "action": "Code Vector Store",
  "action_input": {
    "query": "Read the code file from the provided GitHub repository."
  }
}

[1m> Ent

OutputParserException: Could not parse LLM output: The functions present in the code are:

1. publications_per_year_score(publication, start_year, end_year): This function calculates the score based on the number of publications per year within a given range. It returns a score between 0 and 0.15.

2. experience_score(start_year, end_year): This function calculates the score based on the duration of experience. It returns a score between 0 and 0.15.

3. check_topic_relevance(keywords, publication_topics): This function checks the relevance of keywords to the publication topics. It returns a score between 0 and 0.4.

4. citations_per_paper_score(citation_count, publication_count): This function calculates the score based on the number of citations per paper. It returns a score between 0 and 0.3.

Note: The code provided does not include the implementation of the citations_per_paper_score() function.

Action:
```json
{
  "action": "Final Answer",
  "action_input": "The functions present in the code are:\n\n1. publications_per_year_score(publication, start_year, end_year): This function calculates the score based on the number of publications per year within a given range. It returns a score between 0 and 0.15.\n\n2. experience_score(start_year, end_year): This function calculates the score based on the duration of experience. It returns a score between 0 and 0.15.\n\n3. check_topic_relevance(keywords, publication_topics): This function checks the relevance of keywords to the publication topics. It returns a score between 0 and 0.4.\n\n4. citations_per_paper_score(citation_count, publication_count): This function calculates the score based on the number of citations per paper. It returns a score between 0 and 0.3.\n\nNote: The code provided does not include the implementation of the citations_per_paper_score() function."
}
```


In [175]:
agent("The code provided is a Django Application. Check for all the accessible functions and generate a brief description about it. In addition, try to figure out vulnerabilities that can be inserted into the give code file(for security purposes). Return NOT MORE THAN 3000 tokens. IN CASE YOU ASK SOME QUESTION THE ANSWER IS ALWAYS `YES`.")



[1m> Entering new PlanAndExecute chain...[0m
steps=[Step(value='Read the Django application code file.'), Step(value='Identify all the functions defined in the code.'), Step(value='Generate a brief description for each function.'), Step(value='Analyze the code for potential vulnerabilities.'), Step(value='Identify and describe any vulnerabilities found.'), Step(value="Return the descriptions of the functions and vulnerabilities, limited to 3000 tokens.\n\nGiven the above steps taken, please respond to the user's original question.")]

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "Code Vector Store",
  "action_input": {
    "query": "Django application code file"
  }
}
```[0m
Thought:[32;1m[1;3mThe Django application code file consists of several parts:

1. WSGI Configuration: This section configures the WSGI (Web Server Gateway Interface) for the Django application. It exposes the WSGI callable as a module-level variable named "application

InvalidRequestError: This model's maximum context length is 4097 tokens. However, your messages resulted in 4131 tokens. Please reduce the length of the messages.

In [134]:
for i in cr.documents:
    if i.metadata["file_name"] == 'views.py':
        print(i)

page_content='from multiprocessing.connection import wait\nfrom django.http import JsonResponse\nfrom rest_framework.decorators import api_view\nfrom rest_framework.response import Response\nfrom rest_framework import status\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom webdriver_manager.chrome import ChromeDriverManager\nfrom selenium.webdriver.chrome.options import Options\n# from django_thread import Thread as threading\nimport threading\nimport re\nimport spacy\nimport json\nimport time\nsp = spacy.load(\'en_core_web_sm\')\n\ngChromeOptions = webdriver.ChromeOptions()\ngChromeOptions.add_argument("window-size=1920x1480")\ngChromeOptions.add_argument("disable-dev-shm-usage")\ngChromeOptions.add_argument(\'--no-sandbox\')\ngChromeOptions.add_argument("--disable-gpu")\ngChromeOptions.add_argument("--headless")\n\ndef check_position(biography):\n    positions = [\'president\', \'vice president\', \'chair\', \'general chair\', \'secretary\', \'progra