In [4]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors


In [5]:
# ========== CONFIGURATION ==========
# Input: ETFPOS-IDF weight table (feature columns are words).
ETFPOS_CSV = "collected_etfposidf.csv"
# Pretrained GoogleNews Word2Vec binary.
W2V_BIN = "GoogleNews-vectors-negative300.bin"
# Output: final document vectors (300 dims).
OUT_CSV = "collected_etfposidf_x_w2v.csv"


In [6]:
# ========== LOAD ETFPOS-IDF ==========
df = pd.read_csv(ETFPOS_CSV)

# Meta columns are optional: keep only those actually present in the CSV.
meta_cols = [name for name in ("soal", "label") if name in df.columns]
# Every remaining column is treated as a term feature, in file order.
feat_cols = [name for name in df.columns if name not in meta_cols]

if len(feat_cols) == 0:
    raise ValueError(
        "Tidak ditemukan kolom fitur di CSV ETFPOS-IDF. "
        "Pastikan kolom fitur berisi nama kata."
    )

# ETFPOS-IDF weight matrix, one row per document and one column per term
# (assumed to be ALREADY L2-normalized per document when it was created).
X_etf = df.loc[:, feat_cols].to_numpy(dtype=float)
n_docs, n_feats = X_etf.shape
print("Jumlah dokumen :", n_docs)
print("Jumlah fitur   :", n_feats)


Jumlah dokumen : 141
Jumlah fitur   : 399


In [7]:
# ========== LOAD WORD2VEC ==========
# Loads the pretrained vectors from W2V_BIN in binary word2vec format and
# records the embedding width for later cells.
print("Loading GoogleNews Word2Vec...")
w2v = KeyedVectors.load_word2vec_format(W2V_BIN, binary=True)
# Embedding dimensionality as reported by the loaded model.
EMB_DIM = w2v.vector_size
print("Word2Vec loaded. dim =", EMB_DIM)


Loading GoogleNews Word2Vec...
Word2Vec loaded. dim = 300


In [8]:
# ========== PREPARE TERMS PRESENT IN THE W2V VOCAB ==========
# Column indices of the ETFPOS-IDF features whose term has a Word2Vec vector;
# indices (rather than names) let the weight matrix be sliced directly.
in_vocab_idx = [j for j, term in enumerate(feat_cols) if term in w2v.key_to_index]
print(f"Term dikenali W2V: {len(in_vocab_idx)} dari {len(feat_cols)} fitur")

if not in_vocab_idx:
    # Fail here with a clear message: otherwise final_mat is never defined and
    # the save cell breaks later with an unrelated-looking NameError.
    raise ValueError(
        "Tidak ada term fitur yang ditemukan di vocab Word2Vec; "
        "vektor dokumen tidak dapat dibentuk."
    )

# W_sub: weights of the in-vocab terms, shape (n_docs, k)
W_sub = X_etf[:, in_vocab_idx].astype(np.float32)
print(W_sub.shape)
# T: embeddings of the same terms, in the same column order as W_sub, shape (k, dim)
T = np.vstack([w2v[feat_cols[j]] for j in in_vocab_idx]).astype(np.float32)
print(T.shape)

# Per-document weighted sum of term embeddings:
#   final_mat[d] = sum_t W_sub[d, t] * T[t]        -> shape (n_docs, dim)
# A matrix product computes exactly this sum without materializing the
# (n_docs, k, dim) broadcast intermediate.
final_mat = W_sub @ T



Term dikenali W2V: 390 dari 399 fitur
(141, 390)
(390, 300)


In [9]:
# ========== ASSEMBLE & SAVE ==========
# One named column per embedding dimension: final_dim_0 .. final_dim_{EMB_DIM-1}.
final_feat = pd.DataFrame(
    final_mat,
    columns=[f"final_dim_{i}" for i in range(EMB_DIM)],
)
# Meta columns first, then the vector columns; the meta index is reset so both
# frames align row-by-row on a fresh 0..n-1 index.
out_df = pd.concat(
    [df[meta_cols].reset_index(drop=True), final_feat],
    axis=1,
)
out_df.to_csv(OUT_CSV, index=False)

# ========== SUMMARY ==========
print(
    "\nSelesai: ETFPOS-IDF × Word2Vec (berbasis CSV, mengikuti rumus paper).",
    f"Input ETFPOS-IDF : {ETFPOS_CSV}",
    f"Output           : {OUT_CSV}",
    f"Jumlah dokumen   : {out_df.shape[0]}",
    f"Dimensi vektor   : {EMB_DIM} (final_dim_*)",
    f"Total kolom      : {out_df.shape[1]} (termasuk meta: {meta_cols})",
    f"Term dikenali W2V: {len(in_vocab_idx)} dari {len(feat_cols)} fitur",
    sep="\n",
)



Selesai: ETFPOS-IDF × Word2Vec (berbasis CSV, mengikuti rumus paper).
Input ETFPOS-IDF : collected_etfposidf.csv
Output           : collected_etfposidf_x_w2v.csv
Jumlah dokumen   : 141
Dimensi vektor   : 300 (final_dim_*)
Total kolom      : 302 (termasuk meta: ['soal', 'label'])
Term dikenali W2V: 390 dari 399 fitur


In [None]:
# ========== ASSEMBLE & SAVE ==========
# NOTE(review): this cell repeats the assemble/save/summary steps of the
# preceding cell (only the first summary line differs), so running both writes
# OUT_CSV twice. Consider keeping just one of the two cells.
final_dim_names = [f"final_dim_{i}" for i in range(EMB_DIM)]
final_feat = pd.DataFrame(final_mat, columns=final_dim_names)
# Reset the meta index so rows line up with final_feat's fresh 0..n-1 index.
meta_part = df[meta_cols].reset_index(drop=True)
out_df = pd.concat([meta_part, final_feat], axis=1)
out_df.to_csv(OUT_CSV, index=False)

# ========== SUMMARY ==========
summary_lines = [
    "\nSelesai: ETFPOS-IDF × Word2Vec.",
    f"Input ETFPOS-IDF : {ETFPOS_CSV}",
    f"Output           : {OUT_CSV}",
    f"Jumlah dokumen   : {out_df.shape[0]}",
    f"Dimensi vektor   : {EMB_DIM} (final_dim_*)",
    f"Total kolom      : {out_df.shape[1]} (termasuk meta: {meta_cols})",
    f"Term dikenali W2V: {len(in_vocab_idx)} dari {len(feat_cols)} fitur",
]
print("\n".join(summary_lines))



Selesai: ETFPOS-IDF × Word2Vec (berbasis CSV, mengikuti rumus paper).
Input ETFPOS-IDF : collected_etfposidf.csv
Output           : collected_etfposidf_x_w2v.csv
Jumlah dokumen   : 141
Dimensi vektor   : 300 (final_dim_*)
Total kolom      : 302 (termasuk meta: ['soal', 'label'])
Term dikenali W2V: 390 dari 399 fitur
