In [1]:
pip install sentence-transformers scikit-learn pandas matplotlib


Note: you may need to restart the kernel to use updated packages.


In [4]:
# Select matplotlib's inline backend so figures are rendered in the notebook
# output below the cell that draws them.
%matplotlib inline


In [5]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# -------- Configuration --------
# Forward slashes resolve on Windows as well as Linux/macOS; the previous
# backslash-separated path only worked on Windows.
DATA_PATH = "dataset/synthetic_logs.csv"
# Name of the CSV column holding the text to embed and cluster.
TEXT_COLUMN = "message"

# Load your CSV
df = pd.read_csv(DATA_PATH)

# Fail early with an actionable message instead of a bare KeyError when the
# CSV header does not contain the expected column.
if TEXT_COLUMN not in df.columns:
    raise KeyError(
        f"Column {TEXT_COLUMN!r} not found in {DATA_PATH}; "
        f"available columns: {list(df.columns)}. "
        "Set TEXT_COLUMN to the column that contains the log text."
    )

# Cast to str so every entry handed to the vectorizer/encoder is a string.
messages = df[TEXT_COLUMN].astype(str).tolist()

# -------- Method 1: TF-IDF Embeddings --------
# fit_transform returns a sparse document-term matrix (one row per message).
tfidf_vectorizer = TfidfVectorizer()
tfidf_embeddings = tfidf_vectorizer.fit_transform(messages)

# -------- Method 2: SentenceTransformer Embeddings --------
# Dense sentence embeddings from a pretrained model (one row per message).
model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_embeddings = model.encode(messages)

# -------- Clustering using DBSCAN --------
# eps is the neighbourhood radius and min_samples the number of neighbours
# required to form a dense region; the two embedding spaces get separate eps
# values. NOTE(review): both eps values look hand-tuned — confirm they still
# suit the data if the input file changes.
dbscan_tfidf = DBSCAN(eps=1.2, min_samples=3).fit(tfidf_embeddings)
dbscan_sentence = DBSCAN(eps=1.0, min_samples=3).fit(sentence_embeddings)

# Add cluster labels to the DataFrame (one label per row, in row order).
# DBSCAN assigns the label -1 to samples it considers noise.
df['Cluster_TFIDF'] = dbscan_tfidf.labels_
df['Cluster_Transformer'] = dbscan_sentence.labels_

# -------- Optional: Plot Clusters (for visualization) --------
def plot_clusters(embeddings, labels, title):
    """Show a 2-D scatter plot of embeddings coloured by cluster label.

    The embeddings are reduced to two dimensions with PCA, drawn as a
    scatter plot on a new 8x6 figure, and the figure is shown immediately.

    Parameters
    ----------
    embeddings : array-like or sparse matrix
        One row per sample. Sparse input (any object exposing ``toarray()``,
        such as a TF-IDF matrix) is converted to a dense array first.
    labels : sequence
        One cluster id per row of ``embeddings``; used as the colour value
        of each point.
    title : str
        Title placed above the plot.
    """
    # Accept sparse matrices directly so callers do not have to densify them.
    if hasattr(embeddings, "toarray"):
        embeddings = embeddings.toarray()

    reduced = PCA(n_components=2).fit_transform(embeddings)

    # Explicit figure/axes objects instead of the implicit pyplot state.
    fig, ax = plt.subplots(figsize=(8, 6))
    scatter = ax.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap='tab10', s=15)
    ax.set_title(title)
    # Label the axes and colorbar so the figure can be read on its own.
    ax.set_xlabel("Principal component 1")
    ax.set_ylabel("Principal component 2")
    fig.colorbar(scatter, ax=ax, label="Cluster label")
    plt.show()

# Visualise the TF-IDF clustering; the sparse matrix is densified for PCA.
plot_clusters(
    embeddings=tfidf_embeddings.toarray(),
    labels=dbscan_tfidf.labels_,
    title="TF-IDF Clustering",
)

# Visualise the SentenceTransformer clustering.
plot_clusters(
    embeddings=sentence_embeddings,
    labels=dbscan_sentence.labels_,
    title="SentenceTransformer Clustering",
)


KeyError: 'message'

In [3]:
# NOTE(review): plot_clusters() already calls plt.show() after drawing each
# figure, so this stand-alone call looks redundant — TODO confirm and remove
# this cell.
plt.show()
