# MS Office Forensics

## Imports

In [60]:
from pathlib import Path
import json
import zipfile
from oletools.olevba import VBA_Parser
import shutil
import pytesseract
from PIL import Image, ImageOps, ImageFilter

MSOFFICE_DIR = Path('msoffice')
OUTPUTS = Path('outputs')
OUTPUT_DIR_1 = OUTPUTS / Path('macros')
OUTPUT_DIR_2 = OUTPUTS / Path('embedded_images')
OUTPUT_DIR_3 = OUTPUTS / Path('OCR')

OUTPUTS.mkdir(exist_ok=True)
OUTPUT_DIR_1.mkdir(exist_ok=True)
OUTPUT_DIR_2.mkdir(exist_ok=True)
OUTPUT_DIR_3.mkdir(exist_ok=True)

OCR_CONFIG = "--psm 3"

## Checking for Macros

In [61]:
def list_zip_contents(path):
    try:
        with zipfile.ZipFile(path, 'r') as z:
            return z.namelist()
    except zipfile.BadZipFile:
        return None

In [62]:
def detect_macros(path):
    info = {
        "file": str(path),
        "is_zip": False,
        "zip_members": [],
        "has_macros": False,
        "macro_streams": [],
        "macro_preview": []
    }

    zip_members = list_zip_contents(path)
    if zip_members is not None:
        info["is_zip"] = True
        info["zip_members"] = zip_members
    
    try:
        vb = VBA_Parser(str(path))
    except Exception as e:
        info["error"] = f"VBA_Parser error: {e}"
        return info
    
    try:
        if vb.detect_vba_macros():
            info["has_macros"] = True
            for (subfile, stream_path, vba_filename, vba_code) in vb.extract_all_macros():
                info["macro_streams"].append({
                    "subfile": subfile,
                    "stream_path": stream_path,
                    "vba_filename": vba_filename
                })
                preview = (vba_code or "")[:800]
                info["macro_preview"].append({
                    "stream_path": stream_path,
                    "vba_filename": vba_filename,
                    "preview": preview
                })
        else:
            info["has_macros"] = False

    except Exception as e:
        info["error_analysis"] = f"Error analyzing macros: {e}"
    finally:
        try:
            vb.close()
        except Exception:
            pass

    return info

In [63]:
results = []
files = sorted(MSOFFICE_DIR.glob("*.docx"))
if not files:
    print(".DOCX not encountered in", MSOFFICE_DIR)

for f in files:
    print(f"Analyzing: {f.name}")
    r = detect_macros(f)
    results.append(r)

out_json = OUTPUT_DIR_1 / "macros.json"
with open(out_json, "w", encoding="utf-8") as fh:
    json.dump(results, fh, indent=2, ensure_ascii=False)
    print(f"Results stored in {out_json}")

Analyzing: 1bdbb2a88fc9b48a4d29ae76aafadf16.docx
Analyzing: 240c5875a9ba744f6c61ff42a4d7d999.docx
Analyzing: 46b138cf8645b457b2a8c4ebc79e06f1.docx
Results stored in outputs/macros/macros.json


## Extracting Embedded Images

In [64]:
def extract_images(docx_path, dest_root):
    extracted = []
    dest_dir = dest_root / docx_path.stem
    dest_dir.mkdir(parents=True, exist_ok = True)

    try:
        with zipfile.ZipFile(docx_path, "r") as zf:
            for member in zf.namelist():
                if member.lower().startswith("word/media/"):
                    filename = Path(member).name
                    out_path = dest_dir / filename
                    with zf.open(member) as src, open(out_path, "wb") as dst:
                        shutil.copyfileobj(src, dst)
                    extracted.append(str(out_path))
    except zipfile.BadZipFile:
        print(f"{docx_path.name} is not a valid file.")
    return extracted

In [65]:
files = sorted(MSOFFICE_DIR.glob("*.docx"))
if not files:
    print(".DOCX not encountered in", MSOFFICE_DIR)

for f in files:
    print("Processing...")
    imgs = extract_images(f, OUTPUT_DIR_2)
    if imgs:
        print(f"Extracted {len(imgs)} images:")
        for img in imgs:
            print(f"{img}\n")
    else:
        print("No images found in word/media/")

print("Extraction complete.")


Processing...
Extracted 2 images:
outputs/embedded_images/1bdbb2a88fc9b48a4d29ae76aafadf16/image2.png

outputs/embedded_images/1bdbb2a88fc9b48a4d29ae76aafadf16/image1.png

Processing...
Extracted 4 images:
outputs/embedded_images/240c5875a9ba744f6c61ff42a4d7d999/image3.wmf

outputs/embedded_images/240c5875a9ba744f6c61ff42a4d7d999/image4.png

outputs/embedded_images/240c5875a9ba744f6c61ff42a4d7d999/image2.wmf

outputs/embedded_images/240c5875a9ba744f6c61ff42a4d7d999/image1.png

Processing...
Extracted 1 images:
outputs/embedded_images/46b138cf8645b457b2a8c4ebc79e06f1/image1.jpeg

Extraction complete.


## Applying OCR on Images

In [66]:
def preprocess_image(img_path):
    img = Image.open(img_path)
    img = ImageOps.exif_transpose(img)
    gray = img.convert("L")

    w, h = gray.size
    if max(w, h) < 1500:
        gray = gray.resize((w*2, h*2), Image.LANCZOS)

    gray = gray.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3))
    bw = gray.point(lambda x: 0 if x < 160 else 255, "1")
    return gray, bw

def perform_ocr(img_path):
    gray, bw = preprocess_image(img_path)
    text_gray = pytesseract.image_to_string(gray, config=OCR_CONFIG)
    text_bw = pytesseract.image_to_string(bw, config=OCR_CONFIG)

    return text_gray if len(text_gray) >= len(text_bw) else text_bw

In [None]:
results = {}
image_files = list(OUTPUT_DIR_2.rglob("*.*"))
image_files = [f for f in image_files if f.suffix.lower() in (".png", ".jpeg", ".wmf")]
if not image_files:
    print(f"No images in {OUTPUT_DIR_2}")

for img in image_files:
    print(f"Applying OCR: {img}")
    try:
        text = perform_ocr(img).strip()
        results[str(img)] = {"text": text}
    except Exception as e:
        print (f"Error in {img.name}: {e}")
        results[str(img)] = {"error": str(e)}

Applying OCR: outputs/embedded_images/240c5875a9ba744f6c61ff42a4d7d999/image4.png


Applying OCR: outputs/embedded_images/240c5875a9ba744f6c61ff42a4d7d999/image1.png
Applying OCR: outputs/embedded_images/240c5875a9ba744f6c61ff42a4d7d999/image2.wmf
Error in image2.wmf: cannot find loader for this WMF file
Applying OCR: outputs/embedded_images/240c5875a9ba744f6c61ff42a4d7d999/image3.wmf
Error in image3.wmf: cannot find loader for this WMF file
Applying OCR: outputs/embedded_images/46b138cf8645b457b2a8c4ebc79e06f1/image1.jpeg
Applying OCR: outputs/embedded_images/1bdbb2a88fc9b48a4d29ae76aafadf16/image2.png
Applying OCR: outputs/embedded_images/1bdbb2a88fc9b48a4d29ae76aafadf16/image1.png

OCR completed Results in: outputs/OCR/ocr_results.json


In [72]:
out_file = OUTPUT_DIR_3 / "ocr_results.json"
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nOCR completed Results in: {out_file}")


OCR completed Results in: outputs/OCR/ocr_results.json
