# Threat Actor Source Knowledge Building - ETDA Database
This notebook is heavily inspired by the excellent research of Roberto Rodriguez (@Cyb3rWard0g) into the applicability of generative AI for threat intelligence purposes. It follows the same structure with small alterations for this specific use case.

Specifically, this notebook describes how to build a threat actor source-knowledge vector database from the ETDA threat actor database and its linked reports. We collect all the URLs and scrape the reports per threat actor. PDFs are currently not parsed. For data cleaning, we currently only remove some of the failed retrievals. We save the results as Markdown documents, after which we split them into chunks and embed the chunks in a Chroma vector database.


References:

- https://blog.openthreatresearch.com/demystifying-generative-ai-a-security-researchers-notes/
- https://github.com/OTRF/GenAI-Security-Adventures
- https://github.com/OTRF/GenAI-Security-Adventures/blob/main/experiments/RAG/Threat-Intelligence/ATTCK-Groups/source-knowledge/notebook.ipynb
- https://python.langchain.com/docs/get_started/introduction
- https://apt.etda.or.th/cgi-bin/aptgroups.cgi

# Improvement ideas
- [ ] Clean scraped data from failed loads
- [ ] Clean scraped data from irrelevant context (other webpage content)
- [ ] Integrate PDF scraping and parsing
- [ ] Add other threat actor information sources
- [ ] Create summaries of reports for more condensed context
- [ ] Experiment with other chunk sizes

# Import modules

In [10]:
import os
import glob
import openai
import tiktoken
import tqdm as notebook_tqdm
import urllib.request, json
import copy
import hashlib
import nest_asyncio
import tqdm as notebook_tqdm
from jsonpath_ng import jsonpath, parse
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from jinja2 import Template
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader
from langchain.document_loaders import UnstructuredMarkdownLoader

# Define initial variables and OpenAI API key

In [15]:
# Pull environment variables (such as OPENAI_API_KEY) in via python-dotenv
load_dotenv()
# Get your key: https://platform.openai.com/account/api-keys
openai.api_key = os.getenv("OPENAI_API_KEY")

# Base directory for every path below. Note that dirname() is applied to the
# literal string "__file__", which evaluates to "" - so all paths end up
# relative to the current working directory.
current_directory = os.path.dirname("__file__")

# current_directory has to exist before any path is derived from it; building
# chroma_db ahead of that assignment raises a NameError on a fresh kernel.
chroma_db = os.path.join(current_directory, "./knowledge/chroma_db")

# Input/output locations used throughout the notebook
documents_directory = os.path.join(current_directory, "documents")
reports_directory = os.path.join(documents_directory, "reports")
contrib_directory = os.path.join(current_directory, "contrib")
embeddings_directory = os.path.join(current_directory, "embeddings")

# Jinja2 templates used to render the markdown documents
templates_directory = os.path.join(current_directory, "templates")
document_template = os.path.join(templates_directory, "document.md")
group_template = os.path.join(templates_directory, "group.md")

# Load ETDA threat actor database
Fetch the JSON export from which we load the reports per threat actor. We distinguish between activity reports (operations) and information reports.

In [45]:
# download the complete ETDA threat actor export as JSON
with urllib.request.urlopen("https://apt.etda.or.th/cgi-bin/getcard.cgi?g=all&o=j") as response:
    data = json.load(response)

# JSONPath queries: every 'activity' field and every entry of every 'information' list
activity_query = parse('$..activity')
information_query = parse('$..information[*]')

activity_data = activity_query.find(data)
information_data = information_query.find(data)

# an activity entry is multi-line text; break it into its lines so the URL can be picked out
activity_lines = [found.value.split('\n') for found in activity_data]

# when an activity entry carries a URL it is the last line; information entries are used as-is
activity_urls = [lines[-1] for lines in activity_lines]
information_urls = [found.value for found in information_data]

# web reports vs. PDF reports (anything containing "pdf"); PDFs are collected separately.
# activity entries whose last line is not a URL are dropped by the "http" check
information_urls_cleaned = [link for link in information_urls if "pdf" not in link]
information_pdfs = [link for link in information_urls if "pdf" in link]
activity_urls_cleaned = [link for link in activity_urls if "http" in link and "pdf" not in link]
activity_pdfs = [link for link in activity_urls if "http" in link and "pdf" in link]

# summary of what was collected
for count, label in (
    (len(activity_urls_cleaned), "activity web reports"),
    (len(activity_pdfs), "activity pdf reports (unused)"),
    (len(information_urls_cleaned), "information references"),
    (len(information_pdfs), "information pdf references (unused)"),
):
    print(f"loaded threat actor documents: {count} {label}")

loaded threat actor documents: 2118 activity web reports
loaded threat actor documents: 109 activity pdf reports (unused)
loaded threat actor documents: 783 information references
loaded threat actor documents: 183 information pdf references (unused)


# Scrape reports
Use LangChain's `WebBaseLoader` to fetch the pages concurrently (asynchronously, via `aload()`).

In [47]:
# every web page to fetch: activity reports followed by information references
all_urls = activity_urls_cleaned + information_urls_cleaned

# works around asyncio clashing with the event loop Jupyter already runs
nest_asyncio.apply()

# keep going when individual pages cannot be retrieved
loader = WebBaseLoader(all_urls, continue_on_failure=True)
loader.requests_per_second = 50
# SSL certificates are deliberately not verified
loader.requests_kwargs = {'verify':False}

# asynchronous load of all pages
docs = loader.aload()
print(f"successfully loaded threat actor documents: {len(docs)} documents")

Fetching pages:   3%|##5                                                                                 | 87/2901 [00:13<08:08,  5.76it/s]Error fetching https://mycryptomag.com/2019/08/08/cryptocurrency-firms-are-targets-of-state-sponsored-hacking-group-from-china/ with attempt 1/3: Cannot connect to host mycryptomag.com:443 ssl:default [Name or service not known]. Retrying...
Fetching pages:   3%|##7                                                                                 | 96/2901 [00:18<11:36,  4.03it/s]Error fetching https://mycryptomag.com/2019/08/08/cryptocurrency-firms-are-targets-of-state-sponsored-hacking-group-from-china/ with attempt 2/3: Cannot connect to host mycryptomag.com:443 ssl:default [Name or service not known]. Retrying...
Fetching pages:   5%|####5                                                                              | 158/2901 [00:33<11:58,  3.82it/s]Error fetching https://mycryptomag.com/2019/08/08/cryptocurrency-firms-are-targets-of-state-sponsor

successfully loaded threat actor documents: 2901 documents


# Clean the data
Remove reports that are short in length.

In [12]:
# Remove documents with almost no text.
#
# The surviving documents are collected into a new list rather than calling
# docs.pop() while iterating over docs: popping during iteration shifts the
# remaining elements left, so the element following each removal is skipped
# and never checked.
min_content_length = 100  # characters; anything shorter is treated as empty

kept_docs = []
for index, doc in enumerate(docs):
    if len(doc.page_content) < min_content_length:
        # index refers to the position in the unfiltered list
        print(f"removing document {index}")
    else:
        kept_docs.append(doc)
docs = kept_docs

print(f"leftover loaded threat actor documents after filtering: {len(docs)} documents")

NameError: name 'docs' is not defined

# Create threat actor Markdown docs

In [90]:
def _ensure_directory(path, message):
    """Create `path` (with missing parents) and print `message`, unless it already exists."""
    if not os.path.exists(path):
        print(message)
        os.makedirs(path)


def _read_template(path):
    """Load a Jinja2 template from `path` (read as UTF-8), closing the file afterwards."""
    with open(path, encoding='utf-8') as handle:
        return Template(handle.read())


def _safe_file_name(name):
    """Replace spaces and '/' with '_' so `name` cannot add extra path components."""
    return name.replace(' ', '_').replace('/', '_')


def _write_markdown(directory, file_name, content):
    """Write `content` to `<directory>/<file_name>.md` as UTF-8, closing the file afterwards."""
    with open(os.path.join(directory, f'{file_name}.md'), encoding='utf-8', mode='w') as handle:
        handle.write(content)


_ensure_directory(documents_directory, "[+] Creating knowledge directory..")
_ensure_directory(reports_directory, "[+] Creating reports directory..")

print("[+] Creating markdown files for each group..")
markdown_group_template = _read_template(group_template)
markdown_document_template = _read_template(document_template)

for group in data['values']:
    actor = group['actor']

    # one folder per threat actor; only '/' is replaced in the folder name
    group_directory = os.path.join(reports_directory, actor.replace('/', '_'))
    _ensure_directory(group_directory, "  [>>] Creating folder for {}..".format(actor))

    print("  [>>] Creating markdown files for {}..".format(actor))

    # create threat actor metadata document (rendered from a copy of the group record)
    markdown_group = markdown_group_template.render(metadata=copy.deepcopy(group))
    _write_markdown(group_directory, _safe_file_name(actor), markdown_group)

    # gather the scraped documents linked to this group, information references first:
    #  - information entries are URLs and must equal the document source exactly
    #  - activity entries are free text, so the document source only has to occur in them
    matches = []
    for information in group.get('information', []):
        matches.extend(doc for doc in docs if doc.metadata['source'] == information)
    for activity in group.get('operations', []):
        matches.extend(doc for doc in docs if doc.metadata['source'] in activity['activity'])

    # Documents without a title get a numbered fallback name. The counter runs
    # across the whole group so that untitled reports from different operations
    # no longer overwrite each other, and untitled information documents no
    # longer raise a KeyError.
    untitled_reports = 0
    for match in matches:
        match = copy.deepcopy(match)
        markdown_information = markdown_document_template.render(
            metadata=match.metadata,
            group_name=actor,
            page_content=match.page_content,
        )
        title = match.metadata.get('title')
        if title:
            file_name = _safe_file_name(title)
        else:
            file_name = "Report " + str(untitled_reports)
            untitled_reports += 1
        _write_markdown(group_directory, file_name, markdown_information)

[+] Creating markdown files for each group..
  [>>] Creating markdown files for Big Panda..
  [>>] Creating markdown files for Boulder Bear..
  [>>] Creating markdown files for Clockwork Spider..
  [>>] Creating markdown files for Corsair Jackal..
  [>>] Creating markdown files for Dextorous Spider..
  [>>] Creating markdown files for Dizzy Panda..
  [>>] Creating markdown files for Electric Panda..
  [>>] Creating markdown files for Eloquent Panda..
  [>>] Creating markdown files for Flyfox..
  [>>] Creating markdown files for Foxy Panda..
  [>>] Creating markdown files for Ghost Jackal..
  [>>] Creating markdown files for Gibberish Panda..
  [>>] Creating markdown files for HolyWater..
  [>>] Creating markdown files for Impersonating Panda..
  [>>] Creating markdown files for Knockout Spider..
  [>>] Creating markdown files for Kumsong121..
  [>>] Creating markdown files for Magnetic Spider..
  [>>] Creating markdown files for Outlaw Spider..
  [>>] Creating markdown files for Overlo

# Load documents for indexing

In [16]:
# markdown files located one folder below the reports directory
report_pattern = os.path.join(reports_directory, "*/*.md")
report_files = glob.glob(report_pattern, recursive=True)

print("[+] Loading group markdown files..")
md_docs = []
for report in report_files:
    print(f' [*] Loading {os.path.basename(report)}')
    # each file is parsed by its own loader; the resulting documents are appended in order
    loader = UnstructuredMarkdownLoader(report)
    md_docs += loader.load()

print(f'[+] Number of .md documents processed: {len(md_docs)}')

[+] Loading Group markdown files..
 [*] Loading 8220_Gang.md
 [*] Loading 8220_Gang_Cloud_Botnet_Targets_Misconfigured_Cloud_Workloads_-_SentinelOne.md
 [*] Loading 8220_Gang_Evolves_With_New_Strategies.md
 [*] Loading 8220_Gangs_Recent_use_of_Custom_Miner_and_Botnet.md
 [*] Loading From_the_Front_Lines_|_8220_Gang_Massively_Expands_Cloud_Botnet_to_30,000_Infected_Hosts_-_SentinelOne.md
 [*] Loading Imperva_Detects_Undocumented_8220_Gang_Activities_|_Imperva.md
 [*] Loading Radware_Page.md
 [*] Loading Achilles.md
 [*] Loading Another_Hacker_Selling_Access_to_Charity,_Antivirus_Firm_Networks.md
 [*] Loading Iranian_hackers_suspected_in_cyber_breach_and_extortion_attempt_on_Navy_shipbuilder_Austal_-_ABC_News.md
 [*] Loading AeroBlade.md
 [*] Loading AeroBlade_on_the_Hunt_Targeting_the_U.S._Aerospace_Industry.md
 [*] Loading APT_or_not_APT?_What's_Behind_the_Aggah_Campaign_-_Yoroi.md
 [*] Loading Aggah.md
 [*] Loading Aggah:_How_to_run_a_botnet_without_renting_a_Server_(for_more_than_a_y

KeyboardInterrupt: 

In [14]:
# sanity check: show the text of the first loaded markdown document
first_doc_text = md_docs[0].page_content
print(first_doc_text)

Threat actor: Big Panda

UUID: 4777aa1a-1bc4-4f81-8c1d-9c50c739d314

First seen:

Source last modified: 2020-04-19

Threat actor aliases

Big Panda (CrowdStrike)

Description

A threat actor mentioned in a summary report only, so we don't know who they are yet.

Sponsor type and motivation

Sponsor:

Motivation:

Country of origin

China

Observed attacked sectors where victims operate in

Observed attacked countries where victims operate in

Observed usage of tools

Reported hacking operations

Reported counter operations against threat actor


# Split documents
Check token counts of documents.

In [15]:
# Token counts are measured with the OpenAI tokenizer before chunking,
# because an OpenAI LLM is used later on as well
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`."""
    # an empty disallowed_special disables the check for all special tokens
    return len(tokenizer.encode(text, disallowed_special=()))

# token count of every loaded markdown document
token_counts = []
for doc in md_docs:
    token_counts.append(tiktoken_len(doc.page_content))

# summary statistics over all documents
shortest = min(token_counts)
average = int(sum(token_counts) / len(token_counts))
longest = max(token_counts)
print("\n".join([
    "[+] Token Counts:",
    f"Min: {shortest}",
    f"Avg: {average}",
    f"Max: {longest}",
]))

[+] Token Counts:
Min: 46
Avg: 3041
Max: 45237


# Chunk text
Using Langchain text splitter.

In [16]:
print('[+] Initializing RecursiveCharacterTextSplitter..')

# lengths are measured with tiktoken_len, i.e. in tokens rather than characters
splitter_options = {
    'chunk_size': 500,
    'chunk_overlap': 50,  # number of tokens overlap between chunks
    'length_function': tiktoken_len,
    'separators': ['\n\n', '\n', ' ', ''],  # listed from coarsest to finest
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)


[+] Initializing RecursiveCharacterTextSplitter..


In [17]:
print('[+] Splitting documents in chunks..')
chunks = text_splitter.split_documents(md_docs)

# report how many chunks the documents were split into
document_total, chunk_total = len(md_docs), len(chunks)
print(f'[+] Number of documents: {document_total}')
print(f'[+] Number of chunks: {chunk_total}')

[+] Splitting documents in chunks..
[+] Number of documents: 2777
[+] Number of chunks: 23207


# Contribute dataset to .jsonl file (optional)
https://huggingface.co/datasets/TomTheAnalyst/ETDA-Threat-Actors

In [18]:
# Build one JSON record per chunk: {'id': '<document uid>-<chunk index>', 'text', 'source'}.
json_documents = []
for doc in md_docs:
    source_path = doc.metadata['source']
    doc_name = os.path.basename(source_path)

    # A fresh hash is computed for every document. Re-using a single md5 object
    # and calling update() on it accumulates all earlier names, which makes each
    # id depend on the order in which the documents were loaded.
    # The parent folder name is part of the hashed key so that files with the
    # same name in different folders still receive different ids.
    parent_folder = os.path.basename(os.path.dirname(source_path))
    uid = hashlib.md5(f'{parent_folder}/{doc_name}'.encode('utf-8')).hexdigest()[:12]

    for i, chunk in enumerate(text_splitter.split_text(doc.page_content)):
        # Add JSON object to array
        json_documents.append({
            'id': f'{uid}-{i}',
            'text': chunk,
            'source': doc_name
        })


In [19]:
print('[+] Exporting groups as .jsonl file..')

# the contrib directory is not created anywhere else, so make sure it exists
# before opening the export file inside it
os.makedirs(contrib_directory, exist_ok=True)
export_path = os.path.join(contrib_directory, "ETDA-threat-actors-index-reports.jsonl")

# JSON Lines: one JSON object per line
with open(export_path, 'w', encoding='utf-8') as f:
    for doc in json_documents:
        f.write(json.dumps(doc) + '\n')

[+] Exporting groups as .jsonl file..


# Generate embeddings using open-source function

In [21]:
# open-source sentence-transformers model used as the embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

# embed the chunks into a Chroma collection and save it to disk
db = Chroma.from_documents(
    chunks,
    embedding_function,
    collection_name="groups_collection",
    persist_directory="./chroma_db",
)

# Testing query

In [None]:
# example question to check retrieval from the vector database
query = "What vulnerabilities has Cl0p exploited?"
relevant_docs = db.similarity_search(query)

# number of results, followed by the text of the first result
print(len(relevant_docs))
top_hit = relevant_docs[0]
print(top_hit.page_content)