In [None]:
# 02_Comparison_TFIDF_SBERT.ipynb

import pandas as pd
import pickle
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the titles.
# pandas parses empty CSV cells as NaN, and a NaN entry is not a valid
# document for TfidfVectorizer (nor a valid sentence for SBERT). Drop rows
# without a title and coerce the remainder to str so both encoders receive
# plain strings. The index is reset so positions in `titles` keep matching
# row positions in `df`.
df = pd.read_csv('../data/cleaned_titles.csv')
df = df.dropna(subset=['Judul']).reset_index(drop=True)
titles = df['Judul'].astype(str).tolist()

# Generate TF-IDF embeddings.
# .toarray() converts the sparse TF-IDF matrix to a dense array; memory use
# grows with (number of titles) x (vocabulary size).
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(titles).toarray()

# Generate SBERT embeddings with the pretrained paraphrase model.
sbert = SentenceTransformer('paraphrase-mpnet-base-v2')
X_sbert = sbert.encode(titles)

# PCA visualization
def plot_pca(embedding, title):
    """Show a 2-D PCA scatter plot of *embedding*.

    Args:
        embedding: Data passed straight to ``PCA.fit_transform``
            (presumably one row per sample -- confirm against callers).
        title: Label inserted after the "PCA: " prefix of the figure title.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    # Project onto the first two principal components.
    coords = PCA(n_components=2).fit_transform(embedding)
    first_pc, second_pc = coords[:, 0], coords[:, 1]

    plt.figure(figsize=(8, 6))
    # Semi-transparent markers keep overlapping points distinguishable.
    plt.scatter(first_pc, second_pc, alpha=0.5)
    plt.title(f"PCA: {title}")
    plt.show()

# Draw one PCA scatter plot per embedding type for a side-by-side comparison.
plot_pca(embedding=X_tfidf, title="TF-IDF")
plot_pca(embedding=X_sbert, title="SBERT")

# Top-K similarity example: rank all titles against a free-text query using
# cosine similarity between SBERT embeddings.
from sklearn.metrics.pairwise import cosine_similarity

query = "Sistem keamanan berbasis kamera CCTV dan IoT"
# encode() receives a one-element list, so the query is embedded as a batch of one.
query_embed = sbert.encode([query])
# Only one query was encoded, so take the first (and only) row of the result.
scores = cosine_similarity(query_embed, X_sbert)[0]
# argsort is ascending: reverse it, then keep the first five positions to get
# the indices of the five highest scores, best match first.
top_k = scores.argsort()[::-1][:5]

print("Query:", query)
print("Top-5 Rekomendasi:")
for idx in top_k:
    print("-", titles[idx])
