In [9]:
import ast
import os
import pickle

import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer

# === Load CSV ===
df = pd.read_csv("complete_cleaned.csv")

# Make sure the output directory exists before anything is written into it.
os.makedirs("model", exist_ok=True)

# =================================
#  Parse the tokens column
# =================================
# The CSV stores each token list as the string form of a Python list, e.g.
# "['jakarta', 'kompas', 'com']". It has to be parsed back into a real list:
# calling str.split() on that text produces tokens such as "['jakarta'," that
# no query term can ever match. Missing (non-string) cells become empty
# documents; a malformed string raises so the problem is noticed immediately.
tokenized_corpus = [
    ast.literal_eval(raw) if isinstance(raw, str) else []
    for raw in df["tokens"]
]

# =================================
#  TF-IDF (uses the parsed tokens)
# =================================
# TfidfVectorizer expects one string per document, so re-join the tokens.
corpus_tokens = [" ".join(tokens) for tokens in tokenized_corpus]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus_tokens)

# Context managers guarantee every file is flushed and closed.
with open("model/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
with open("model/tfidf_matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)

# =================================
#  BM25 (uses the same tokens)
# =================================
bm25 = BM25Okapi(tokenized_corpus)
with open("model/bm25.pkl", "wb") as f:
    pickle.dump(bm25, f)

# Save the original data
with open("model/news_data.pkl", "wb") as f:
    pickle.dump(df, f)

print("Semua model berhasil dibuat!")


Semua model berhasil dibuat!


In [10]:
import pandas as pd

# Reload the dataset and grab the "tokens" column for a quick sanity check.
df = pd.read_csv("complete_cleaned.csv")
token_col = df["tokens"]

# Show a few sample token values
print("=== Sample tokens ===")
print(token_col.head(3))
print()

# Count rows whose tokens are missing (NaN) or an empty string.
# NOTE(review): the values look like stringified lists, so an empty list
# would appear as "[]" and is not counted by either check below.
n_rows = len(df)
n_missing = token_col.isna().sum()
n_blank = token_col.eq('').sum()

print("=== Cek tokens kosong ===")
print(f"Total rows: {n_rows}")
print(f"Empty tokens: {n_missing}")
print(f"Blank tokens: {n_blank}")

=== Sample tokens ===
0    ['jakarta', 'kompas', 'com', 'tentara', 'nasio...
1    ['jakarta', 'kompas', 'com', 'kepala', 'dinas'...
2    ['jakarta', 'kompas', 'com', 'panglima', 'tni'...
Name: tokens, dtype: object

=== Cek tokens kosong ===
Total rows: 1980
Empty tokens: 0
Blank tokens: 0


In [15]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import ast  # ← PENTING!

# Load CSV
df = pd.read_csv("complete_cleaned.csv")

# =================================
#  CONVERT STRING → LIST
# =================================
# The tokens column holds the string representation of a list
# Example: "['jakarta', 'kompas', 'com']"
# It has to be converted into a real Python list

def safe_literal_eval(val):
    """Convert one raw ``tokens`` cell into a list of tokens.

    Parameters
    ----------
    val : object
        Usually the string form of a Python list, e.g.
        ``"['jakarta', 'kompas', 'com']"``. Already-parsed lists/tuples,
        plain whitespace-separated text and non-string values (``None``,
        ``NaN``) are accepted too.

    Returns
    -------
    list
        The parsed tokens. Text that is not a list/tuple literal falls back
        to ``val.split()``; non-string, non-sequence input yields ``[]``.
    """
    # Already tokenised (e.g. the conversion is applied a second time):
    # keep the tokens instead of discarding them.
    if isinstance(val, (list, tuple)):
        return list(val)

    # Missing values (None / NaN) carry no tokens.
    if not isinstance(val, str):
        return []

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal: treat it as whitespace-separated text.
        # Only the errors literal_eval documents are caught, so things like
        # KeyboardInterrupt still propagate.
        return val.split()

    if isinstance(parsed, (list, tuple)):
        return list(parsed)

    # A valid literal that is not a sequence (number, dict, ...): fall back
    # to splitting the raw text so callers always receive a list.
    return val.split()

# Parse every raw tokens cell into a real Python list.
df['tokens'] = df['tokens'].apply(safe_literal_eval)

# Debug: check the conversion
print("=== After conversion ===")
print(f"Type: {type(df['tokens'].iloc[0])}")
print(f"Sample: {df['tokens'].iloc[0][:10]}")
print()

# =================================
#  TF-IDF (join each list into a string)
# =================================
# TfidfVectorizer needs one string per document. Anything that is not a
# token sequence is coerced with str() so a stray non-string value cannot
# break the vectorizer.
corpus_tokens_str = df["tokens"].apply(
    lambda x: ' '.join(map(str, x)) if isinstance(x, (list, tuple)) else str(x)
).tolist()

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus_tokens_str)

# Context managers guarantee every file is flushed and closed.
with open("model/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
with open("model/tfidf_matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)

# =================================
#  BM25 (uses the lists directly)
# =================================
tokenized_corpus = df['tokens'].tolist()  # already lists, no split needed

# Debug
print("=== Tokenized corpus check ===")
print(f"Type: {type(tokenized_corpus[0])}")
print(f"Sample: {tokenized_corpus[0][:10]}")
print(f"Length: {len(tokenized_corpus[0])}")
print()

bm25 = BM25Okapi(tokenized_corpus)
with open("model/bm25.pkl", "wb") as f:
    pickle.dump(bm25, f)

# Save the data (the tokens column now holds parsed lists)
with open("model/news_data.pkl", "wb") as f:
    pickle.dump(df, f)

print("✅ Model berhasil dibuat!")

# =================================
#  QUICK TEST
# =================================
print("\n=== Quick Test ===")
test_query = ['donald', 'trump', 'gaza']
scores = bm25.get_scores(test_query)
print(f"Query: {test_query}")
print(f"Max score: {scores.max()}")
print(f"Non-zero scores: {(scores > 0).sum()}")

# argsort yields positions into the corpus, so look titles up by position
# (.iloc) rather than by index label (.loc); the two only coincide while
# the frame keeps its default RangeIndex.
top_idx = scores.argsort()[::-1][:3]
for i, idx in enumerate(top_idx):
    print(f"\n{i+1}. Score: {scores[idx]:.4f}")
    print(f"   Title: {df['title'].iloc[idx]}")

=== After conversion ===
Type: <class 'list'>
Sample: ['jakarta', 'kompas', 'com', 'tentara', 'nasional', 'indonesia', 'tni', 'angkatan', 'darat', 'ad']

=== Tokenized corpus check ===
Type: <class 'list'>
Sample: ['jakarta', 'kompas', 'com', 'tentara', 'nasional', 'indonesia', 'tni', 'angkatan', 'darat', 'ad']
Length: 296

✅ Model berhasil dibuat!

=== Quick Test ===
Query: ['donald', 'trump', 'gaza']
Max score: 6.467162573812388
Non-zero scores: 1943

1. Score: 6.4672
   Title: [POPULER GLOBAL] Trump Ingin Membeli Gaza | Tak Ada yang Bisa Mengusir Warga Palestina

2. Score: 6.3288
   Title: Apa yang Dibicarakan Prabowo dan Trump di KTT Gaza yang Terekam Mikrofon?

3. Score: 6.3190
   Title: Serba-serbi Prabowo "Si Pria Tangguh" Saksikan Trump Teken Perdamaian Gaza
