# In [None]:
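# Install the necessary libraries and data before running:
# pip install spacy gensim nltk
# python -m spacy download en_core_web_sm
# python -m nltk.downloader stopwords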

import re
import string

import gensim
import nltk
import spacy
from gensim import corpora
from nltk.corpus import stopwords

# Load the spaCy "en_core_web_sm" pipeline once; extract_threat_entities reuses it for NER
nlp = spacy.load(name="en_core_web_sm")

# Sample threat-intelligence reports; each string is treated as one document
intelligence_feed = [
    # Threat-actor activity report
    "Advanced persistent threat (APT) group APT28 launched a spear-phishing attack against critical infrastructure.",
    # Vulnerability disclosure
    "New vulnerability discovered in Apache Struts, CVE-2024-12345, is being actively exploited in the wild.",
    # Botnet / DDoS campaign report
    "DDoS attacks from a botnet using Mirai malware have been targeting cloud service providers in Asia.",
]

# English stopwords removed during preprocessing; kept as a set for fast membership tests.
# The stopwords corpus ships separately from the nltk package, so fetch it on
# first use instead of failing with LookupError on a fresh environment.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))
def preprocess_text(text, *, stopword_set=None):
    """Lowercase *text* and return its tokens with stopwords removed.

    Tokens come from whitespace splitting; punctuation is then stripped from
    both ends of each token so that "infrastructure." and "(APT)" become
    "infrastructure" and "apt" rather than separate vocabulary entries.
    Interior punctuation is kept, so identifiers such as "cve-2024-12345"
    survive intact.

    Args:
        text: Raw feed entry.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words``.

    Returns:
        List of cleaned, non-empty, non-stopword tokens in original order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Strip only the edges: a bare split() leaves trailing commas/periods and
    # wrapping brackets attached, which also lets "the," dodge the stopword test.
    cleaned = (raw.strip(string.punctuation) for raw in text.lower().split())
    # Tokens that were pure punctuation (e.g. "--") are now empty; drop them.
    return [token for token in cleaned if token and token not in stopword_set]

# Tokenize every feed entry; the result holds one token list per document
processed_feed = list(map(preprocess_text, intelligence_feed))

# Build the token <-> id mapping, then encode each document as a bag-of-words for LDA
dictionary = corpora.Dictionary(documents=processed_feed)
corpus = list(map(dictionary.doc2bow, processed_feed))

# Use Latent Dirichlet Allocation (LDA) for topic modeling.
# random_state seeds the model so the discovered topics are repeatable between
# runs; without it every execution starts from a different random
# initialization. (Worker scheduling in the multicore trainer can still
# introduce some variation.)
lda_model = gensim.models.LdaMulticore(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Report each topic found by LDA; a negative num_topics asks for all of them
topic_summaries = lda_model.print_topics(num_topics=-1)
for idx, topic in topic_summaries:
    print(f"Topic {idx}: {topic}")

# Named Entity Recognition (NER) plus pattern matching to extract specific
# threat entities (APT groups, CVE identifiers).
# The general-purpose spaCy model has no entity types for these identifiers,
# so their well-known textual formats are matched explicitly.
_THREAT_ENTITY_PATTERNS = (
    # CVE IDs: "CVE-<4-digit year>-<4 or more digits>"
    ("CVE", re.compile(r"\bCVE-\d{4}-\d{4,}\b", re.IGNORECASE)),
    # Numbered APT group designations such as "APT28"
    ("APT_GROUP", re.compile(r"\bAPT\d+\b")),
)


def extract_threat_entities(text):
    """Return the ``(entity_text, label)`` pairs found in *text*.

    The entities recognized by the spaCy pipeline come first, in document
    order and unchanged. Pattern-based matches labelled ``"CVE"`` and
    ``"APT_GROUP"`` are appended after them, so the identifiers this function
    is meant to surface are always reported.

    Args:
        text: A single threat-intelligence feed entry.

    Returns:
        List of ``(text, label)`` tuples.
    """
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Track what has been emitted so a repeated identifier is listed once.
    seen = set(entities)
    for label, pattern in _THREAT_ENTITY_PATTERNS:
        for match in pattern.finditer(text):
            candidate = (match.group(0), label)
            if candidate not in seen:
                seen.add(candidate)
                entities.append(candidate)
    return entities

# Demo: show the entities found in each feed entry, one list per line
for text in intelligence_feed:
    threat_entities = extract_threat_entities(text)
    print(threat_entities)