In [3]:
# in this UI is changed in to a window 
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import openai
import tkinter as tk
from tkinter import scrolledtext
from openai import OpenAI


class Chatbot:
    def __init__(self, file_paths, api_key):
        self.file_paths = file_paths
        openai.api_key = api_key
        self.knowledge_base = self.load_data()
        self.documents = self.prepare_documents()
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)
        self.full_documents = self.knowledge_base
        self.cve_index = self.create_cve_index()
        self.conversation_history = []
        self.setup_interface()

    def load_data(self):    
        all_data = []
        for file_path in self.file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                all_data.extend(data)
        return all_data

    def concatenate_text(self, json_obj):
        fields = ['CVE_ID', 'Assigner', 'Description']
        concatenated_text = ' '.join(str(json_obj[field]) for field in fields if field in json_obj)
        return concatenated_text

    def prepare_documents(self):
        return [self.concatenate_text(item) for item in self.knowledge_base]

    def create_cve_index(self):
        cve_index = {}
        for idx, item in enumerate(self.knowledge_base):
            cve_id = item.get('CVE_ID')
            if cve_id:
                cve_index[cve_id] = idx
        return cve_index

    def answer_question(self, query):
        query_words = query.split()
        found_documents = []

        # Check if any word in the query is a CVE_ID
        for word in query_words:
            if word in self.cve_index:
                index = self.cve_index[word]
                found_documents.append(self.full_documents[index])

        # If any CVE_ID is found, return the corresponding documents
        if found_documents:
            return found_documents

        # If no CVE_ID is found, proceed with TF-IDF similarity search
        similarity_scores = np.zeros(len(self.documents))

        for word in query_words:
            query_vec = self.vectorizer.transform([word])
            similarities = cosine_similarity(query_vec, self.tfidf_matrix)[0]
            similarity_scores += similarities

        top_indices = np.argsort(similarity_scores)[-10:][::-1]
        top_documents = [self.full_documents[i] for i in top_indices]
        return top_documents

    def should_use_previous_context(self, question):
        client = OpenAI()
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Determine if the following question is a follow-up question."},
                {"role": "user", "content": question}
            ]
        )
        response = completion.choices[0].message.content.strip().lower()
        return 'yes' in response

    def handle_question(self):
        question = self.text_input.get()
        self.conversation_history.append({"role": "user", "content": question})

        # if self.should_use_previous_context(question):
        #     messages = [
        #         {"role": "system", "content": "You are an assistant that helps with CVE data. Respond with relevant CVE details. Use previous context as well"}
        #     ] + self.conversation_history
        # else:
        #     answers = self.answer_question(question)
        #     json_string = json.dumps(answers, indent=4)
        #     messages = [
        #         {"role": "system", "content": "You are an assistant that helps with CVE data. Respond with relevant CVE details."}
        #     ] + self.conversation_history + [
        #         {"role": "assistant", "content": json_string}
        #     ]
        answers = self.answer_question(question)
        json_string = json.dumps(answers, indent=4)
        messages =   [
            {"role": "assistant", "content": json_string}
        ]+ self.conversation_history
        
        
        prompt = f"""You are an assistant that helps with CVE data. Only use the relevant context from the conversation history. Respond with relevant CVE details recommend the this website and attach the CVE id in fron tof it https://nvd.nist.gov/vuln/detail/ \n,
        
        The CVE details should: \n
    
        - Provide all information from the context provided \n
        - Avoid introducing new topics or queries that deviate from the original query \n
        
        
        Conversation History:\n
        {messages}
        
        Original query:\n [{question}]
        """

            
        print(prompt)
        self.display_message(f'Question: {question}', 'red')

        for message in messages:
            print(message)

        try:
            client = OpenAI(
            base_url = 'http://localhost:11434/v1',
            api_key='ollama', # api_key is required, but unused for local models
             )
            completion = client.chat.completions.create(
                model="llama3",
                messages=[{"role": "system", "content": prompt}],
            )
            response = completion.choices[0].message.content
            self.conversation_history.append({"role": "assistant", "content": response})
            self.display_message(f'Answer:\n{response}', 'white')

        except openai.error.RateLimitError:
            self.display_message("API rate limit exceeded. Please try again later.", 'red')
        except openai.error.OpenAIError as e:
            self.display_message(f"An API error occurred: {e}", 'red')
        except openai.error.AuthenticationError:
            self.display_message("Authentication failed. Check your API key.", 'red')
        except Exception as e:
            self.display_message(f"An unexpected error occurred: {e}", 'red')

    def display_message(self, message, color):
        self.results_output.configure(state='normal')
        self.results_output.insert(tk.END, message + '\n', (color,))
        self.results_output.configure(state='disabled')
        self.results_output.see(tk.END)

    def setup_interface(self):
        root = tk.Tk()
        root.title("Chatbot Interface llama 3")

        self.text_input = tk.Entry(root, width=100)
        self.text_input.pack(pady=10)
        self.text_input.bind('<Return>', lambda event: self.handle_question())

        self.results_output = scrolledtext.ScrolledText(root, width=100, height=30, wrap=tk.WORD)
        self.results_output.pack(pady=10)
        self.results_output.tag_configure('blue', foreground='blue')
        self.results_output.tag_configure('green', foreground='green')
        self.results_output.tag_configure('red', foreground='red')
        self.results_output.configure(state='disabled')

        root.mainloop()

# Initialize the chatbot with multiple files
file_paths = [
    'nvdcve-1.1-recent_updated.json',
    'nvdcve-1.1-modified_updated.json',
    'nvdcve-1.1-2024_updated.json',
 'nvdcve-1.1-2023_updated.json',
 'nvdcve-1.1-2022_updated.json',
 'nvdcve-1.1-2021_updated.json',
 'nvdcve-1.1-2020_updated.json',
 'nvdcve-1.1-2019_updated.json',
 'nvdcve-1.1-2018_updated.json',
 'nvdcve-1.1-2017_updated.json',
 'nvdcve-1.1-2016_updated.json',
 'nvdcve-1.1-2015_updated.json',
 'nvdcve-1.1-2014_updated.json',
 'nvdcve-1.1-2013_updated.json',
 'nvdcve-1.1-2012_updated.json',
 'nvdcve-1.1-2011_updated.json',
 'nvdcve-1.1-2010_updated.json',
 'nvdcve-1.1-2009_updated.json',
 'nvdcve-1.1-2008_updated.json',
 'nvdcve-1.1-2007_updated.json',
 'nvdcve-1.1-2006_updated.json',
 'nvdcve-1.1-2005_updated.json',
 'nvdcve-1.1-2004_updated.json',
 'nvdcve-1.1-2003_updated.json',
 'nvdcve-1.1-2002_updated.json'       
]
api_key = 'your-api-key'
chatbot = Chatbot(file_paths, api_key)


KeyboardInterrupt: 

In [4]:
# Experiment code
import tkinter as tk
from tkinter import scrolledtext
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma
from langchain.text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.history_aware_retriever import create_history_aware_retriever

class ChatApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Chat with Model")

        self.chat_history = []

        self.cached_llm = Ollama(model="llama3")
        self.embedding = FastEmbedEmbeddings()

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024, chunk_overlap=80, length_function=len, is_separator_regex=False
        )

        self.raw_prompt = ChatPromptTemplate.from_messages([
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
            ("human", "Given the above conversation, generate a search query to lookup in order to get information relevant to the conversation"),
        ])

        # GUI setup
        self.text_area = scrolledtext.ScrolledText(root, wrap=tk.WORD, height=20, width=60)
        self.text_area.pack(pady=10)

        self.entry_field = tk.Entry(root, width=60)
        self.entry_field.pack(pady=10)
        self.entry_field.bind("<Return>", self.process_query)

        self.send_button = tk.Button(root, text="Send", command=self.process_query)
        self.send_button.pack()

    def process_query(self, event=None):
        user_query = self.entry_field.get()
        self.display_message(f"User: {user_query}", "blue")

        self.chat_history.append(HumanMessage(content=user_query))
        result = self.get_model_response(user_query)
        self.chat_history.append(AIMessage(content=result["answer"]))

        self.display_message(f"Model: {result['answer']}", "green")

        self.entry_field.delete(0, tk.END)

    def get_model_response(self, query):
        # Simulate loading vector store and creating the chain
        vector_store = Chroma(persist_directory="db", embedding_function=self.embedding)

        retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 20, "score_threshold": 0.1},
        )

        history_aware_retriever = create_history_aware_retriever(
            llm=self.cached_llm, retriever=retriever, prompt=self.raw_prompt
        )

        document_chain = create_stuff_documents_chain(self.cached_llm, self.raw_prompt)
        retrieval_chain = create_retrieval_chain(history_aware_retriever, document_chain)

        result = retrieval_chain.invoke({"input": query})

        sources = [{"source": doc.metadata["source"], "page_content": doc.page_content} for doc in result["context"]]

        return {"answer": result["answer"], "sources": sources}

    def display_message(self, message, color)


SyntaxError: expected ':' (3950997550.py, line 79)

In [2]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tkinter as tk
from tkinter import scrolledtext
from langchain_community.llms import Ollama
from langchain_core.messages import HumanMessage, SystemMessage

class Chatbot:
    def __init__(self, file_paths):
        self.file_paths = file_paths
        self.knowledge_base = self.load_data()
        self.documents = self.prepare_documents()
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)
        self.full_documents = self.knowledge_base
        self.cve_index = self.create_cve_index()
        self.conversation_history = []
        self.cached_llm = Ollama(model="llama3")
        self.setup_interface()

    def load_data(self):
        all_data = []
        for file_path in self.file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                all_data.extend(data)
        return all_data

    def concatenate_text(self, json_obj):
        fields = ['CVE_ID', 'Assigner', 'Description']
        concatenated_text = ' '.join(str(json_obj[field]) for field in fields if field in json_obj)
        return concatenated_text

    def prepare_documents(self):
        return [self.concatenate_text(item) for item in self.knowledge_base]

    def create_cve_index(self):
        cve_index = {}
        for idx, item in enumerate(self.knowledge_base):
            cve_id = item.get('CVE_ID')
            if cve_id:
                cve_index[cve_id] = idx
        return cve_index

    def answer_question(self, query):
        query_words = query.split()
        found_documents = []

        # Check if any word in the query is a CVE_ID
        for word in query_words:
            if word in self.cve_index:
                index = self.cve_index[word]
                found_documents.append(self.full_documents[index])

        # If any CVE_ID is found, return the corresponding documents
        if found_documents:
            return found_documents

        # If no CVE_ID is found, proceed with TF-IDF similarity search
        similarity_scores = np.zeros(len(self.documents))

        for word in query_words:
            query_vec = self.vectorizer.transform([word])
            similarities = cosine_similarity(query_vec, self.tfidf_matrix)[0]
            similarity_scores += similarities

        top_indices = np.argsort(similarity_scores)[-10:][::-1]
        top_documents = [self.full_documents[i] for i in top_indices]
        return top_documents

    def handle_question(self):
        question = self.text_input.get()
        self.conversation_history.append(HumanMessage(content=question))

        answers = self.answer_question(question)
        json_string = json.dumps(answers, indent=4)

        self.display_message(f'Question: {question}', 'blue')

        try:
            # Build the prompt with context and user input
            response = self.cached_llm.invoke([
                SystemMessage(content="You are an assistant that helps with CVE data. Respond with relevant CVE details."),
                HumanMessage(content=f"User query: {question}\nContext: {json_string}")
            ])
            
            # Store the response in conversation history
            self.conversation_history.append(response)
            self.display_message(f'Answer:\n{response.content}', 'green')

        except Exception as e:
            self.display_message(f"An unexpected error occurred: {e}", 'red')

    def display_message(self, message, color):
        self.results_output.configure(state='normal')
        self.results_output.insert(tk.END, message + '\n', (color,))
        self.results_output.configure(state='disabled')
        self.results_output.see(tk.END)

    def setup_interface(self):
        root = tk.Tk()
        root.title("Chatbot Interface")

        self.text_input = tk.Entry(root, width=100)
        self.text_input.pack(pady=10)
        self.text_input.bind('<Return>', lambda event: self.handle_question())

        self.results_output = scrolledtext.ScrolledText(root, width=100, height=30, wrap=tk.WORD)
        self.results_output.pack(pady=10)
        self.results_output.tag_configure('blue', foreground='blue')
        self.results_output.tag_configure('green', foreground='green')
        self.results_output.tag_configure('red', foreground='red')
        self.results_output.configure(state='disabled')

        root.mainloop()

# Initialize the chatbot with multiple files
file_paths = [
    
      'nvdcve-1.1-recent_updated.json',
    'nvdcve-1.1-modified_updated.json',
    'nvdcve-1.1-2024_updated.json',
 'nvdcve-1.1-2023_updated.json',
 'nvdcve-1.1-2022_updated.json',
 'nvdcve-1.1-2021_updated.json',
 'nvdcve-1.1-2020_updated.json',
 'nvdcve-1.1-2019_updated.json',
 'nvdcve-1.1-2018_updated.json',
 'nvdcve-1.1-2017_updated.json',
 'nvdcve-1.1-2016_updated.json',
 'nvdcve-1.1-2015_updated.json',
 'nvdcve-1.1-2014_updated.json',
 'nvdcve-1.1-2013_updated.json',
 'nvdcve-1.1-2012_updated.json',
 'nvdcve-1.1-2011_updated.json',
 'nvdcve-1.1-2010_updated.json',
 'nvdcve-1.1-2009_updated.json',
 'nvdcve-1.1-2008_updated.json',
 'nvdcve-1.1-2007_updated.json',
 'nvdcve-1.1-2006_updated.json',
 'nvdcve-1.1-2005_updated.json',
 'nvdcve-1.1-2004_updated.json',
 'nvdcve-1.1-2003_updated.json',
 'nvdcve-1.1-2002_updated.json'        
              
]
chatbot = Chatbot(file_paths)


ModuleNotFoundError: No module named 'langchain_community'

In [1]:
#this is for evaluation
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import openai
import tkinter as tk
from tkinter import scrolledtext
from openai import OpenAI


class Chatbot:
    def __init__(self, file_paths, api_key, questions_file, output_file):
        self.file_paths = file_paths
        openai.api_key = api_key
        self.knowledge_base = self.load_data()
        self.documents = self.prepare_documents()
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)
        self.full_documents = self.knowledge_base
        self.cve_index = self.create_cve_index()
        self.conversation_history = []
        self.questions_file = questions_file
        self.output_file = output_file
        self.setup_interface()

    def load_data(self):    
        all_data = []
        for file_path in self.file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                all_data.extend(data)
        return all_data

    def load_questions(self):
        """Loads questions and expected answers from a JSON file."""
        with open(self.questions_file, 'r', encoding='utf-8') as file:
            return json.load(file)

    def concatenate_text(self, json_obj):
        fields = ['CVE_ID', 'Assigner', 'Description']
        concatenated_text = ' '.join(str(json_obj[field]) for field in fields if field in json_obj)
        return concatenated_text

    def prepare_documents(self):
        return [self.concatenate_text(item) for item in self.knowledge_base]

    def create_cve_index(self):
        cve_index = {}
        for idx, item in enumerate(self.knowledge_base):
            cve_id = item.get('CVE_ID')
            if cve_id:
                cve_index[cve_id] = idx
        return cve_index

    def answer_question(self, query):
        query_words = query.split()
        found_documents = []

        for word in query_words:
            if word in self.cve_index:
                index = self.cve_index[word]
                found_documents.append(self.full_documents[index])

        if found_documents:
            return found_documents

        similarity_scores = np.zeros(len(self.documents))

        for word in query_words:
            query_vec = self.vectorizer.transform([word])
            similarities = cosine_similarity(query_vec, self.tfidf_matrix)[0]
            similarity_scores += similarities

        top_indices = np.argsort(similarity_scores)[-10:][::-1]
        top_documents = [self.full_documents[i] for i in top_indices]
        return top_documents

    def handle_question(self,question):
        # question = self.text_input.get()
        self.conversation_history.append({"role": "user", "content": question})

        answers = self.answer_question(question)
        json_string = json.dumps(answers, indent=4)
        # messages = [{"role": "assistant", "content": json_string}] + self.conversation_history # This has been removed to remove the conversation history
        messages = [{"role": "assistant", "content": json_string}]
        prompt = f"""You are an assistant that helps with CVE data. Only use the relevant context from the conversation history. Respond with relevant CVE details \n if answer is not available in the context say data not available.
        
        The CVE details should:
        - Provide all information from the context provided
        - Avoid introducing new topics or queries that deviate from the original query
        - Recommend the this website and attach the CVE id in front of it https://nvd.nist.gov/vuln/detail/
        
        Conversation History:
        {messages}

        Original query:
        [{question}]
        """

        self.display_message(f'Question: {question}', 'red')

        for message in messages:
            print(message)

        try:
            client = OpenAI(
                base_url='http://localhost:11434/v1',
                api_key='ollama',
            )
            completion = client.chat.completions.create(
                model="llama3",
                messages=[{"role": "system", "content": prompt}],
            )
            response = completion.choices[0].message.content
            #self.conversation_history.append({"role": "assistant", "content": response})
            self.display_message(f'Answer:\n{response}', 'white')
            return response

        # except openai.error.RateLimitError:
        #     self.display_message("API rate limit exceeded. Please try again later.", 'red')
        # except openai.error.OpenAIError as e:
        #     self.display_message(f"An API error occurred: {e}", 'red')
        # except openai.error.AuthenticationError:
        #     self.display_message("Authentication failed. Check your API key.", 'red')
        except Exception as e:
            self.display_message(f"An unexpected error occurred: {e}", 'red')

    def save_results(self, results):
        """Saves the results (questions, expected answers, actual answers) to a JSON file."""
        with open(self.output_file, 'w', encoding='utf-8') as file:
            json.dump(results, file, indent=4)

    def process_questions(self):
        """Processes questions from the loaded JSON and saves the results."""
        questions_data = self.load_questions()
        results = []

        for item in questions_data:
            question = item['question']
            expected_answer = item.get('expected_answer', 'No expected answer provided')
            # actual_answer = self.answer_question(question)
            actual_answer =  self.handle_question(question)

            result = {
                'question': question,
                'expected_answer': expected_answer,
                'actual_answer': actual_answer
            }

            results.append(result)

        self.save_results(results)

    def display_message(self, message, color):
        self.results_output.configure(state='normal')
        self.results_output.insert(tk.END, message + '\n', (color,))
        self.results_output.configure(state='disabled')
        self.results_output.see(tk.END)

    def setup_interface(self):
        root = tk.Tk()
        root.title("ChatNVD Interface llama 3")

        self.text_input = tk.Entry(root, width=100)
        self.text_input.pack(pady=10)
        self.text_input.bind('<Return>', lambda event: self.handle_question())

        self.results_output = scrolledtext.ScrolledText(root, width=100, height=30, wrap=tk.WORD)
        self.results_output.pack(pady=10)
        self.results_output.tag_configure('blue', foreground='blue')
        self.results_output.tag_configure('green', foreground='green')
        self.results_output.tag_configure('red', foreground='red')
        self.results_output.configure(state='disabled')

        # Button to process questions and save results
        process_button = tk.Button(root, text="Process Questions", command=self.process_questions)
        process_button.pack(pady=10)

        root.mainloop()


# Initialize the chatbot with multiple files and question file
file_paths = [
      'nvdcve-1.1-recent_updated.json',
    'nvdcve-1.1-modified_updated.json',
    'nvdcve-1.1-2024_updated.json',
 'nvdcve-1.1-2023_updated.json',
 'nvdcve-1.1-2022_updated.json',
 'nvdcve-1.1-2021_updated.json',
 'nvdcve-1.1-2020_updated.json',
 'nvdcve-1.1-2019_updated.json',
 'nvdcve-1.1-2018_updated.json',
 'nvdcve-1.1-2017_updated.json',
 'nvdcve-1.1-2016_updated.json',
 'nvdcve-1.1-2015_updated.json',
 'nvdcve-1.1-2014_updated.json',
 'nvdcve-1.1-2013_updated.json',
 'nvdcve-1.1-2012_updated.json',
 'nvdcve-1.1-2011_updated.json',
 'nvdcve-1.1-2010_updated.json',
 'nvdcve-1.1-2009_updated.json',
 'nvdcve-1.1-2008_updated.json',
 'nvdcve-1.1-2007_updated.json',
 'nvdcve-1.1-2006_updated.json',
 'nvdcve-1.1-2005_updated.json',
 'nvdcve-1.1-2004_updated.json',
 'nvdcve-1.1-2003_updated.json',
 'nvdcve-1.1-2002_updated.json'        
              ]
questions_file = 'questions_for_llama.json'  # JSON file with questions and expected answers
output_file = 'ollama_output_results_8.json'  # JSON file to save results
api_key = 'your-api-key'
chatbot = Chatbot(file_paths, api_key, questions_file, output_file)


{'role': 'assistant', 'content': '[\n    {\n        "CVE_ID": "CVE-2016-9733",\n        "Assigner": "psirt@us.ibm.com",\n        "Description": "IBM Team Concert (RTC) 4.0, 5.0 and 6.0 is vulnerable to cross-site scripting. This vulnerability allows users to embed arbitrary JavaScript code in the Web UI thus altering the intended functionality potentially leading to credentials disclosure within a trusted session. IBM X-Force ID: 119762.",\n        "References": [\n            {\n                "url": "https://exchange.xforce.ibmcloud.com/vulnerabilities/119762"\n            },\n            {\n                "url": "http://www.ibm.com/support/docview.wss?uid=swg22004611"\n            },\n            {\n                "url": "http://www.securityfocus.com/bid/99352"\n            },\n            {\n                "url": "http://www.securitytracker.com/id/1038912"\n            }\n        ],\n        "PublishedDate": "2017-07-05T17:29Z",\n        "LastModifiedDate": "2017-07-26T01:29Z",