# TTI: Python For Defenders
## 3-5: Lab - IoC Extractor

In [1]:
import ipywidgets as widgets
from IPython.display import display
import json
import csv
import pdfplumber
import re

### Acquire Files

In [2]:
# Multi-file picker restricted to the formats the extractor handles.
upload = widgets.FileUpload(
    accept=".csv, .txt, .json, .pdf, .html",
    multiple=True,
)
label = widgets.Label(value="Upload Files: ")

# Show the caption and the picker side by side.
box = widgets.HBox(children=[label, upload])
display(box)

HBox(children=(Label(value='Upload Files: '), FileUpload(value=(), accept='.csv, .txt, .json, .pdf, .html', de…

### Extract Content

In [3]:
def get_pdf_text(pdf_path: str) -> str:
    """
    Extract the text of every page of a PDF and return it as one string.

    pdf_path is passed straight to pdfplumber.open. Page texts are joined
    with a newline so that a token ending one page cannot fuse with the
    token starting the next (which would corrupt hash/IP/domain matching).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None for a page with no text layer on some
        # pdfplumber releases; treat such pages as empty instead of crashing.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
    
def get_file_contents(filename: str) -> str:
    """
    Look up an uploaded file by name in the upload widget and return its contents.

    PDFs (type "application/pdf") are run through get_pdf_text and come back
    as a str. Any other file returns the widget's "content" value unchanged,
    so the caller is responsible for decoding it.

    Returns None when no uploaded file has the given name.
    """
    import io  # local import: only the PDF branch needs it

    matches = [file for file in upload.value if file["name"] == filename]

    # A PDF entry takes priority over a same-named non-PDF entry.
    for file in matches:
        if file["type"] == "application/pdf":
            # The upload is held by the widget, and a same-named file on disk
            # is not guaranteed to exist, so parse the uploaded bytes directly
            # instead of re-opening `filename` as a path.
            return get_pdf_text(io.BytesIO(file["content"]))

    return matches[0]["content"] if matches else None

In [None]:
# Map every uploaded filename to its contents. Building the dict in a single
# expression keeps all files (rebinding `data` inside a loop would retain only
# the last one) and yields an empty dict when nothing has been uploaded.
data: dict = {
    file["name"]: get_file_contents(file["name"])
    for file in upload.value
}

### Match Patterns

In [4]:
# IoC Regexes
# Hash patterns: a fixed-length run of lowercase hex digits. The lookarounds
# reject runs embedded in a longer hex string, so e.g. a SHA-256 value is not
# also reported as an MD5, SHA-1 or truncated SHA-512.
md5_pattern = re.compile(r"(?<![0-9a-f])[0-9a-f]{32}(?![0-9a-f])")
sha1_pattern = re.compile(r"(?<![0-9a-f])[0-9a-f]{40}(?![0-9a-f])")
sha256_pattern = re.compile(r"(?<![0-9a-f])[0-9a-f]{64}(?![0-9a-f])")
sha512_pattern = re.compile(r"(?<![0-9a-f])[0-9a-f]{128}(?![0-9a-f])")

# Dotted-quad IPv4: each octet is limited to 0-255 (leading zeros allowed) and
# the address may not start or end inside a longer run of digits.
ipv4_pattern = re.compile(
    r"(?<![0-9])"
    r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]{1,2})\.){3}"
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]{1,2})"
    r"(?![0-9])"
)

# One or more dot-terminated labels followed by an alphabetic TLD (2+ letters).
domain_pattern = re.compile(r"(?:[A-Za-z0-9\-]+\.)+[A-Za-z]{2,}")

# http(s) URL: host, optional port, then path/query characters.
url_pattern = re.compile(r"https?://(?:[A-Za-z0-9\-]+\.)+[A-Za-z0-9]{2,}(?::\d{1,5})?[/A-Za-z0-9\-%?=\+\.]+")

In [None]:
results = {}

# IoC type label -> compiled pattern used to find it.
ioc_patterns = {
    'md5': md5_pattern,
    'sha1': sha1_pattern,
    'sha256': sha256_pattern,
    'sha512': sha512_pattern,
    'ipv4': ipv4_pattern,
    'domain': domain_pattern,
    'url': url_pattern,
}

for name, raw in data.items():
    # PDF entries are already extracted text (str); every other entry is the
    # upload widget's bytes-like buffer and still needs decoding.
    if isinstance(raw, str):
        content = raw
    else:
        # Replace undecodable bytes so one badly encoded upload does not
        # abort extraction for every file.
        content = bytes(raw).decode("utf-8", errors="replace")

    # De-duplicate matches and sort them so repeated runs and exports are
    # deterministic.
    results[name] = {
        ioc_type: sorted(set(pattern.findall(content)))
        for ioc_type, pattern in ioc_patterns.items()
    }

# Bare expression so the notebook renders the extracted IoCs as cell output.
results

### Deliver

In [5]:
# Names of the files the export handlers write to.
CSV_RESULTS: str = 'results.csv'
JSON_RESULTS: str = 'results.json'

# One button per export format, plus an Output area for status messages.
csv_button = widgets.Button(description="CSV Export")
json_button = widgets.Button(description="JSON Export")
output = widgets.Output()

# Lay the controls out in a single row: CSV, JSON, then the message area.
box = widgets.HBox(children=[csv_button, json_button, output])
display(box)
    
def csv_export(b):
    """
    Button callback: write every extracted IoC to CSV_RESULTS.

    The file gets a header row followed by one row per IoC, in the form
    (filename, type, value). `b` is the clicked button and is unused.
    """
    header = ["filename", "type", "value"]
    # Route the status message (and any traceback) into the output widget,
    # matching json_export.
    with output:
        # newline='' is what the csv module requires of files it writes to;
        # without it, platforms that translate "\n" emit blank rows.
        with open(CSV_RESULTS, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for filename, result in results.items():
                for ioc_type, iocs in result.items():
                    writer.writerows([filename, ioc_type, ioc] for ioc in iocs)
        print(f"{CSV_RESULTS} written")
    
def json_export(b):
    """
    Button callback: serialise `results` to JSON_RESULTS as JSON.

    The confirmation message is shown in the `output` widget. `b` is the
    clicked button and is unused.
    """
    with output, open(JSON_RESULTS, 'w') as handle:
        json.dump(results, handle)
        print(f"{JSON_RESULTS} written")
            
# Hook each export button up to its handler.
csv_button.on_click(callback=csv_export)
json_button.on_click(callback=json_export)

HBox(children=(Button(description='CSV Export', style=ButtonStyle()), Button(description='JSON Export', style=…