# In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns

# Pipeline: load threat-intel text, vectorize it with TF-IDF, reduce it with
# PCA, flag outliers with an Isolation Forest, and plot the first two PCA
# components coloured by the anomaly label.

# Compiled once: matches every run of non-word characters.
_NON_WORD_RUN = re.compile(r'\W+')


def _clean_text(value):
    """Return *value* lower-cased with non-word runs collapsed to a space.

    Missing cells (None/NaN) yield an empty string and non-string cells are
    converted with ``str()`` first, so a blank or numeric row in the CSV
    does not abort the whole run with an AttributeError on ``.lower()``.
    """
    if pd.isna(value):
        return ''
    return _NON_WORD_RUN.sub(' ', str(value).lower())


# Load dataset
df = pd.read_csv('threat_intel_data.csv')  # Replace with actual dataset path
df['clean_text'] = df['text'].apply(_clean_text)

# TF-IDF transformation (vocabulary limited to at most 500 terms); the sparse
# result is converted to a dense array before being handed to PCA.
tfidf = TfidfVectorizer(max_features=500)
X = tfidf.fit_transform(df['clean_text']).toarray()

# Dimensionality reduction for Isolation Forest.
# PCA cannot extract more components than min(n_samples, n_features), so the
# target of 10 is capped for small datasets or small vocabularies.
n_components = min(10, *X.shape)
if n_components < 2:
    # The scatter plot below reads components 0 and 1, so fewer than two
    # components cannot be visualized; fail early with a clear message.
    raise ValueError(
        "Need at least 2 documents and 2 distinct terms to compute the two "
        "plotted PCA components; TF-IDF matrix has shape {}.".format(X.shape)
    )
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Anomaly detection with Isolation Forest.
# fit_predict labels each row 1 (inlier) or -1 (outlier); contamination=0.1
# sets the expected share of outliers, and random_state fixes the seed so
# repeated runs give the same labels.
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
df['anomaly'] = isolation_forest.fit_predict(X_pca)

# Visualize anomalies: first two PCA components, inliers blue, outliers red.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=df['anomaly'],
    palette={1: 'blue', -1: 'red'},
)
plt.title("Anomaly Detection in Threat Intelligence Data")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()