In [1]:
import os
#from dotenv import load_dotenv
from pymongo import MongoClient
from transformers import pipeline
from datetime import datetime

# === Init ===
# The MongoDB connection string embeds a username and password, so it is read
# from the MONGO_URI environment variable instead of being stored in the notebook.
# NOTE(review): a password was previously hard-coded here; it should be rotated.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    # Fail early with a clear message rather than with an opaque driver error.
    raise RuntimeError("MONGO_URI environment variable is not set")
client = MongoClient(MONGO_URI)
db = client["osint_db"]

# === Load NER model ===
# `aggregation_strategy="simple"` is the current spelling of the deprecated
# `grouped_entities=True`: sub-word tokens are merged into whole entities and
# each result carries an `entity_group`, `word` and `score`.
ner_pipe = pipeline(
    "token-classification",
    model="bnsapa/cybersecurity-ner",
    aggregation_strategy="simple",
)

# === Config ===
# Maps each raw source collection to the collection that receives its enriched documents.
SOURCES = {
    "newsapi_data": "enriched_newsapi_data",
    "reddit_data": "enriched_reddit_data",
    "rss_data": "enriched_rss_data"
}

# NER labels (compared upper-cased) that count towards a document's useful entities.
RELEVANT_LABELS = {"MALWARE", "ORG", "TOOL", "THREAT_ACTOR", "CVE", "VULNERABILITY"}

# Keywords are searched for as substrings of the lower-cased document text,
# so every entry must itself be lower-case or it can never match.
CYBER_KEYWORDS = [
    "malware", "ransomware", "phishing", "spyware", "keylogger", "trojan", "worm",
    "backdoor", "rootkit", "exploit", "vulnerability", "zero-day", "payload", "breach",
    "attack", "hack", "ddos", "sql injection", "bruteforce", "xss", "mitm",
    "botnet", "apt", "cve", "cisa", "fbi", "nsa", "threat actor", "indicator of compromise",
    "ioc", "dropper", "command and control", "c2 server", "esxi", "lockbit", "medusa",
    "data leak", "dark web", "cybercrime", "cyber attack", "cybersecurity", "infostealer",
    "exploit kit", "rce", "lpe", "privilege escalation"
]

def is_relevant(text, keywords=None):
    """Return True when `text` contains at least one cyber-security keyword.

    Matching is a case-insensitive substring test: both the text and each
    keyword are lower-cased before comparison, so mixed-case keywords such as
    "APT" still match.

    Args:
        text: The document text to inspect. Anything that is not a non-empty
            string is treated as irrelevant.
        keywords: Optional iterable of keyword strings. Defaults to the
            module-level CYBER_KEYWORDS list.

    Returns:
        bool: True if any keyword occurs in the text, otherwise False.
    """
    if not isinstance(text, str) or not text:
        return False
    if keywords is None:
        keywords = CYBER_KEYWORDS
    haystack = text.lower()  # lower-case once instead of once per keyword
    return any(keyword.lower() in haystack for keyword in keywords)

def doc_already_processed(target_col, source_id):
    """Tell whether `target_col` already holds a document for `source_id`.

    Looks up a single document whose `source_id` field equals the given
    value and returns True if one is found, False otherwise.
    """
    existing = target_col.find_one({"source_id": source_id})
    if existing is None:
        return False
    return True

# === Main Loop ===
def _to_entity_record(ent):
    """Reduce one NER pipeline entity to the fields stored with the enriched document."""
    return {
        "text": ent["word"],
        "label": ent["entity_group"],
        # Convert to a Python float *before* rounding. Rounding a NumPy float32
        # and widening it afterwards re-introduces binary noise
        # (e.g. 0.9990000128746033 instead of 0.999).
        "score": round(float(ent["score"]), 3),
    }


# For every source collection: keep keyword-relevant documents, run NER on them,
# and store those with at least two relevant entities in the target collection.
for source_name, target_name in SOURCES.items():
    source_col = db[source_name]
    target_col = db[target_name]

    print(f"\n🔍 Processing source: {source_name}")

    # Fetch only the fields read below; `_id` is returned by default.
    for doc in source_col.find({}, {"text": 1, "timestamp": 1}):
        text = doc.get("text")
        # A missing, null or non-string `text` is skipped here; otherwise
        # `.strip()` would raise outside the try block and abort the whole run.
        if not isinstance(text, str) or not text.strip() or not is_relevant(text):
            continue

        source_id = doc.get("_id")
        if doc_already_processed(target_col, source_id):
            continue

        try:
            entities = ner_pipe(text)
            useful = [ent for ent in entities if ent["entity_group"].upper() in RELEVANT_LABELS]

            # Documents with fewer than two relevant entities are not stored.
            if len(useful) < 2:
                continue

            structured = {
                "original_text": text,
                "entities": [_to_entity_record(ent) for ent in useful],
                "entity_count": len(useful),
                "high_quality": True,
                "fetched_from": source_name.replace("_data", ""),
                "original_timestamp": doc.get("timestamp"),
                "processed_at": datetime.utcnow().isoformat(),
                "source_id": source_id  # ← to avoid reprocessing later
            }

            target_col.insert_one(structured)
            print(f"✅ {source_name} → saved doc with {len(useful)} entities")

        except Exception as e:
            # Best effort: report the failure and carry on with the next document.
            print(f"❌ Error processing from {source_name}: {e}")


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cpu



🔍 Processing source: newsapi_data
✅ newsapi_data → saved doc with 4 entities
✅ newsapi_data → saved doc with 11 entities
✅ newsapi_data → saved doc with 2 entities
✅ newsapi_data → saved doc with 3 entities
✅ newsapi_data → saved doc with 2 entities
✅ newsapi_data → saved doc with 2 entities
✅ newsapi_data → saved doc with 5 entities
✅ newsapi_data → saved doc with 3 entities
✅ newsapi_data → saved doc with 2 entities
✅ newsapi_data → saved doc with 2 entities
✅ newsapi_data → saved doc with 3 entities
✅ newsapi_data → saved doc with 2 entities
✅ newsapi_data → saved doc with 4 entities
✅ newsapi_data → saved doc with 2 entities
✅ newsapi_data → saved doc with 2 entities
✅ newsapi_data → saved doc with 14 entities
✅ newsapi_data → saved doc with 9 entities
✅ newsapi_data → saved doc with 8 entities
✅ newsapi_data → saved doc with 3 entities
✅ newsapi_data → saved doc with 11 entities
✅ newsapi_data → saved doc with 4 entities
✅ newsapi_data → saved doc with 5 entities
✅ newsapi_data →

In [5]:
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
from bson import ObjectId
from pymongo import MongoClient

# --- Mongo Connection ---
# The connection string embeds a username and password, so it is read from the
# MONGO_URI environment variable instead of being stored in the notebook.
# NOTE(review): a password was previously hard-coded here; it should be rotated.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    # Fail early with a clear message rather than with an opaque driver error.
    raise RuntimeError("MONGO_URI environment variable is not set")
client = MongoClient(MONGO_URI)
db = client["osint_db"]
col = db["enriched_newsapi_data"]  # ← Change to reddit/rss if needed

# --- Cleaner ---
def _clean_value(value):
    """Convert one value into a JSON-friendly equivalent, recursing into dicts and lists."""
    if isinstance(value, ObjectId):
        return str(value)
    if isinstance(value, datetime):
        return value.isoformat()
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, dict):
        return {key: _clean_value(item) for key, item in value.items()}
    if isinstance(value, list):
        # Clean every element, not only dicts: lists may hold ObjectIds,
        # datetimes, NumPy scalars or further nested lists.
        return [_clean_value(item) for item in value]
    return value


def clean_mongo_doc(doc):
    """Return a copy of a MongoDB document that uses only JSON-friendly values.

    ObjectId values become strings, datetimes become ISO-8601 strings and
    NumPy floating/integer scalars become Python floats/ints. Nested dicts
    and lists are cleaned recursively at any depth; every other value is
    passed through unchanged. The input document is not modified.

    Args:
        doc: A dict as returned by a pymongo query.

    Returns:
        dict: A new dict with the same keys and cleaned values.
    """
    return {key: _clean_value(value) for key, value in doc.items()}

# --- Sample + Clean ---
# Take at most 50 documents from the collection, clean each one, and load the
# cleaned records into a DataFrame.
sample = list(map(clean_mongo_doc, col.find().limit(50)))
df = pd.DataFrame(sample)

# --- Export ---
# Written as one JSON record per line (orient="records", lines=True).
df.to_json(
    "clean_enriched_sample.json",
    orient="records",
    lines=True,
)
print("✅ JSON file saved: clean_enriched_sample.json")


✅ JSON file saved: clean_enriched_sample.json
