# In [None]:
import os
#from dotenv import load_dotenv
from pymongo import MongoClient
from transformers import pipeline
from datetime import datetime

# === Init ===
# The MongoDB connection string embeds credentials, so it is read from the
# MONGO_URI environment variable instead of being committed with the source
# (export it in the shell, or load a .env file before running this cell).
# SECURITY: any password that was ever committed in this file must be treated
# as compromised and rotated.
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    # Fail early with a clear message rather than letting the driver fall
    # back to its default host.
    raise RuntimeError("MONGO_URI environment variable is not set")
client = MongoClient(MONGO_URI)
db = client["osint_db"]

# === Load NER model ===
# aggregation_strategy="simple" is the supported replacement for the
# deprecated grouped_entities=True flag: sub-word tokens are merged into whole
# entities, and each result exposes "entity_group", "word" and "score".
ner_pipe = pipeline(
    "token-classification",
    model="bnsapa/cybersecurity-ner",
    aggregation_strategy="simple",
)

# === Config ===
# Maps each raw collection name to the collection that receives its
# enriched documents: "<feed>_data" -> "enriched_<feed>_data".
SOURCES = {
    f"{feed}_data": f"enriched_{feed}_data"
    for feed in ("newsapi", "reddit", "rss")
}

# Entity groups worth keeping; membership is tested against the upper-cased
# group name reported by the NER pipeline.
RELEVANT_LABELS = set(
    ("MALWARE", "ORG", "TOOL", "THREAT_ACTOR", "CVE", "VULNERABILITY")
)
# Keywords that mark a text as worth sending through the NER model.
# is_relevant() compares case-insensitively, so the casing used here does not
# matter (acronyms such as "APT" may stay upper-case for readability).
CYBER_KEYWORDS = [
    "malware", "ransomware", "phishing", "spyware", "keylogger", "trojan", "worm",
    "backdoor", "rootkit", "exploit", "vulnerability", "zero-day", "payload", "breach",
    "attack", "hack", "ddos", "sql injection", "bruteforce", "xss", "mitm",
    "botnet", "APT", "cve", "cisa", "fbi", "nsa", "threat actor", "indicator of compromise",
    "ioc", "dropper", "command and control", "c2 server", "esxi", "lockbit", "medusa",
    "data leak", "dark web", "cybercrime", "cyber attack", "cybersecurity", "infostealer",
    "exploit kit", "rce", "lpe", "privilege escalation"
]


def is_relevant(text):
    """Return True if *text* contains any entry of CYBER_KEYWORDS.

    The comparison is a case-insensitive substring test: both the text and
    each keyword are lower-cased before matching, so upper-case entries such
    as "APT" match as well. Non-string input (for example ``None``) is
    treated as not relevant instead of raising.

    NOTE: because this is a substring test, short keywords also match inside
    longer words (e.g. "rce" inside "source").
    """
    if not isinstance(text, str):
        return False
    # Lower-case the text once rather than once per keyword.
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in CYBER_KEYWORDS)

def doc_already_processed(target_col, source_id):
    """Tell whether an enriched record for *source_id* is already stored.

    Asks *target_col* for one document whose "source_id" field equals
    *source_id* and returns True when such a document exists, else False.
    """
    match = target_col.find_one({"source_id": source_id})
    if match is None:
        return False
    return True

# === Main Loop ===
# For every raw collection in SOURCES: pick the documents whose text passes the
# keyword filter and has not been enriched yet, run NER on them, and store the
# ones with at least two relevant entities in the matching enriched collection.
#
# NOTE: documents that pass the keyword filter but yield fewer than two
# relevant entities are not recorded anywhere, so they go through the NER
# model again on every run.


def _to_entity_record(ent):
    """Convert one aggregated NER result into the entity dict that is stored."""
    return {
        "text": ent["word"],
        "label": ent["entity_group"],
        # Widen to a Python float *before* rounding. Pipeline scores usually
        # arrive as numpy float32; rounding first and converting afterwards
        # reintroduces binary noise (0.999 -> 0.9990000128746033).
        "score": round(float(ent["score"]), 3),
    }


def _build_enriched_doc(doc, text, useful, source_name, source_id):
    """Assemble the document written to the enriched collection."""
    return {
        "original_text": text,
        "entities": [_to_entity_record(ent) for ent in useful],
        "entity_count": len(useful),
        "high_quality": True,
        "fetched_from": source_name.replace("_data", ""),
        "original_timestamp": doc.get("timestamp"),
        # Naive UTC time in ISO-8601 form (no offset suffix); kept in this
        # format so new records match the ones already stored.
        "processed_at": datetime.utcnow().isoformat(),
        "source_id": source_id,  # ← to avoid reprocessing later
    }


for source_name, target_name in SOURCES.items():
    source_col = db[source_name]
    target_col = db[target_name]

    print(f"\n🔍 Processing source: {source_name}")

    for doc in source_col.find():
        text = doc.get("text", "")
        # "text" can be stored as null or as a non-string value; skip such
        # documents instead of letting .strip() abort the whole run.
        if not isinstance(text, str) or not text.strip() or not is_relevant(text):
            continue

        source_id = doc.get("_id")
        if doc_already_processed(target_col, source_id):
            continue

        # Best effort per document: a failure in the model or in the insert is
        # reported and the loop moves on to the next document.
        try:
            entities = ner_pipe(text)
            useful = [ent for ent in entities if ent["entity_group"].upper() in RELEVANT_LABELS]

            # A single entity is considered too weak a signal to keep.
            if len(useful) < 2:
                continue

            structured = _build_enriched_doc(doc, text, useful, source_name, source_id)
            target_col.insert_one(structured)
            print(f"✅ {source_name} → saved doc with {len(useful)} entities")

        except Exception as e:
            print(f"❌ Error processing from {source_name}: {e}")
