<a href="https://colab.research.google.com/github/yenlung/AI-Demo/blob/master/%E3%80%90Demo06a%E3%80%91RAG01_%E6%89%93%E9%80%A0%E5%90%91%E9%87%8F%E8%B3%87%E6%96%99%E5%BA%AB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To Do list

- 透過 TrendMicro 的威脅情資報告與使用者端的情境建立 Embedding，讓 Chatbot 參考威脅情資對使用者的處境進行分析，並給出可能的攻擊面向或是應急處理方式。
  - 威脅情資的 Embedding 處理
  - Chatbot Prompting Stack
  - FrontEnd Output

# Build Vector Database

In [None]:
# Embedding
import pandas
import tqdm as notebook_tqdm

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document 
# Originally: "from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader"; not needed here because the data is read from JSON and wrapped in Document objects directly.

class ThreatIntelligenceEmbeddingModel:
    """Build a FAISS vector store from a threat-intelligence JSON file.

    Typical usage is ``embedding_execute()`` followed by ``vector_store()``.

    Attributes:
        threat_intelligence_data_path: Path of the JSON file read by pandas.
        vectorstore_path: Directory under which ``faiss_db`` is written.
        embedding_model: HuggingFace model name used for the embeddings.
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        pre_embedding_texts: Documents built from the JSON rows.
        split_pre_embedding_texts: ``pre_embedding_texts`` after splitting.
        vectorstore: The FAISS store, or the string ``"none"`` until
            ``embedding_execute()`` has run.
    """

    def __init__(self,
                 threat_intelligence_data_path: str,
                 embedding_model: str,
                 chunk_size: int,
                 chunk_overlap: int,
                 vectorstore_path: str):
        # data
        self.threat_intelligence_data_path = threat_intelligence_data_path
        # Placeholder; not written anywhere else in this class.
        self.threat_intelligence_embedding_db = "none"
        self.vectorstore_path = vectorstore_path

        # embedding parameter
        self.embedding_model = embedding_model  # e.g. "intfloat/multilingual-e5-small"
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.pre_embedding_texts = []
        self.split_pre_embedding_texts = []
        # The string "none" is kept as the "not built yet" sentinel so existing
        # checks against it keep working.
        self.vectorstore = "none"

    def embedding_execute(self) -> None:
        """Read the JSON file, build the documents and embed them into FAISS.

        Each row must provide ``title`` and ``content`` columns. Missing values
        are replaced by the string ``"none"``.
        NOTE(review): ``content`` is sliced, so it is assumed to be a string -
        confirm against the data file.
        """
        df = pandas.read_json(self.threat_intelligence_data_path)
        df = df.fillna("none")

        # Rebuild the list on every call so that running this method twice
        # does not index every document twice.
        self.pre_embedding_texts = [
            # Only the first 300 characters of the content are embedded (save money).
            Document(page_content=f"Title : {post_title} \n Content : {post_content[0:300]}")
            for post_title, post_content in zip(df["title"], df["content"])
        ]

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        self.split_pre_embedding_texts = splitter.split_documents(self.pre_embedding_texts)
        self.vectorstore = FAISS.from_documents(
            self.split_pre_embedding_texts,
            HuggingFaceEmbeddings(model_name=self.embedding_model),
        )

    def vector_store(self) -> None:
        """Save the FAISS store to ``<vectorstore_path>/faiss_db``.

        Raises:
            RuntimeError: If ``embedding_execute()`` has not been called yet.
        """
        import os

        # Fail with a clear message instead of an AttributeError on the
        # "none" placeholder string.
        if isinstance(self.vectorstore, str):
            raise RuntimeError(
                "No vector store to save: call embedding_execute() before vector_store()."
            )
        self.vectorstore.save_local(os.path.join(self.vectorstore_path, "faiss_db"))

if __name__ == "__main__":
    # Build the FAISS index from the threat-intelligence JSON and save it
    # under data/faiss_db.
    embedding_process = ThreatIntelligenceEmbeddingModel(
        threat_intelligence_data_path="data/cybersecurity_intelligence.json",
        embedding_model="intfloat/multilingual-e5-small",
        chunk_size=500,
        chunk_overlap=100,
        vectorstore_path="data",
    )
    embedding_process.embedding_execute()
    embedding_process.vector_store()
    print("Embedding Completed")

# ChatBot Stack 

In [46]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables (including OPENAI_API_KEY) from a local .env file.
load_dotenv()
# OpenAI client; api_key is None when OPENAI_API_KEY is not set.
client = OpenAI(
    api_key = os.environ.get("OPENAI_API_KEY")
)

# System prompt: chain-of-thought instructions combined with retrieved threat
# intelligence (RAG). The prompt text is runtime data and is kept in Chinese.
system_prompt_templete = '''
    你是一個優秀的資安分析師，現在需要幫助使用者分辨當前的情境屬於何種資安威脅。並參考威脅情資內容後，根據以下內容 Step by Step 分析。
    參考情資與使用者情境分析，
    1. 使用者可能是如何被入侵的
    2. 使用者目前可以先做何種緩解措施
'''

# Retrieval side: load the FAISS index from data/faiss_db with the same
# embedding model name that the build step uses.
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")
# NOTE(security): allow_dangerous_deserialization=True lets the loader unpickle
# the stored index data; only load an index that was produced locally and is
# trusted.
vectorstore = FAISS.load_local(
    "data/faiss_db",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)

# Initial conversation history: just the system prompt.
messages = [
            {"role": "system", "content": system_prompt_templete}
            ]

import gradio
with gradio.Blocks() as demo:
    gradio.Markdown("# Professional Security Consulting")
    chatbot = gradio.Chatbot(type="messages")
    msg = gradio.Textbox(placeholder="請輸入你的問題...")
    # Conversation history, initialised from the module-level `messages` list
    # (which holds the system prompt).
    state = gradio.State(messages)

    def main_chatbot(user_prompt, messages):
        """Answer one user turn using the most similar threat-intelligence entry.

        Args:
            user_prompt: Text submitted from the textbox.
            messages: Conversation history as a list of role/content dicts;
                it is extended in place with the new user and assistant turns.

        Returns:
            A tuple ``("", messages, messages)``: the cleared textbox value,
            the history shown in the chatbot, and the updated state.
        """
        # Ignore blank submissions instead of paying for a completion on them.
        if not user_prompt or not user_prompt.strip():
            return "", messages, messages

        results = vectorstore.similarity_search(user_prompt, k=1)
        # Guard against an empty result list; the fallback text means
        # "no related threat intelligence found".
        RAG_Post = results[0].page_content if results else "（查無相關威脅情資）"
        user_prompt_templete = '''
            使用者情境
                {user_prompt}
            威脅情資  
                {RAG_Post}
        '''

        messages.append({
            "role": "user",
            "content": user_prompt_templete.format(user_prompt=user_prompt, RAG_Post=RAG_Post),
        })

        chat_completion = client.chat.completions.create(
            model="gpt-4o-mini",  # save money
            messages=messages,
            max_tokens=500,  # chain-of-thought output is expensive, so cap the output tokens
        )

        # `content` may be None; store an empty string rather than None.
        reply = chat_completion.choices[0].message.content or ""
        # Appending the reply to the history is how the LLM "remembers" what was
        # said earlier; keeping the full history also makes debugging easier.
        messages.append({"role": "assistant", "content": reply})

        return "", messages, messages

    msg.submit(
        fn=main_chatbot,
        inputs=[msg, state],
        outputs=[msg, chatbot, state],
    )

# share=True asks Gradio for a public share link in addition to the local URL.
demo.launch(share=True, debug=True)

* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


Keyboard interruption in main thread... closing server.




In [None]:
# Sample user scenario (ransom countdown plus a suspicious folder in the
# Windows root directory). Not referenced by the visible cells; presumably
# kept for manually testing the chatbot.
user_prompt = '''
    My desktop is showing a 24-hour countdown, demanding that I pay in Bitcoin to unlock it, or else my data will be stolen.

    I just discovered a strange folder placed in the root directory of my Windows system. It's named "ysytem32
    '''
