In [10]:
import re
import numpy as np
import pandas as pd
from pathlib import Path
from gensim.models import KeyedVectors
import nltk, spacy
from nltk.stem import PorterStemmer

In [11]:
# ========== CONFIGURATION ==========
# Input: TFPOS-IDF matrix whose feature columns are named by stem.
# (The original note also says "L2" - presumably L2-normalised rows; confirm upstream.)
TFPOS_CSV = "yahya_tfpos_words.csv"

# Input: pretrained GoogleNews Word2Vec vectors in binary format.
W2V_BIN = "GoogleNews-vectors-negative300.bin"

# Output: metadata columns plus the weighted embedding, one row per question.
OUT_CSV = "yahya_tfpos_words_x_w2v.csv"

In [12]:
# ========== SETUP ==========
# Fetch the NLTK "punkt" resource without console output.
# NOTE(review): no NLTK tokenizer call is visible in these cells - confirm this download is still needed.
nltk.download("punkt", quiet=True)
# Porter stemmer: produces the stems that are matched against the feature column names.
ps  = PorterStemmer()
# spaCy English pipeline with the NER component disabled; it supplies the tokens and lemmas.
nlp = spacy.load("en_core_web_sm", disable=["ner"])  # POS + lemma


In [13]:
# ========== LOAD TFPOS-IDF ==========
df = pd.read_csv(TFPOS_CSV)

# Split the columns into metadata ("soal" = question text, "label") and
# everything else, which is treated as a stem feature column.
meta_cols = [name for name in ("soal", "label") if name in df.columns]
feat_cols = [name for name in df.columns if name not in meta_cols]
# Set form for fast membership tests: weights are only ever read from feature columns.
feat_set = set(feat_cols)

# Fail early when the CSV has no features or no question text to lemmatize.
if len(feat_cols) == 0:
    raise ValueError("Tidak ditemukan kolom fitur (STEM) pada CSV TFPOS-IDF.")
if "soal" not in meta_cols:
    raise ValueError("CSV harus punya kolom 'soal' untuk membuat lemma (Path B).")

df.head()


Unnamed: 0,soal,label,abc,abl,abnorm,abolit,absolut,absorpt,abstract,accept,...,year,yemen,yield,you,young,your,z,zener,zone,zoo
0,About what proportion of the population of the...,Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Correctly label the brain lobes indicated on t...,Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Define compound interest.,Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Define four types of traceability.,Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Define mercantilism.,Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# ========== LOAD WORD2VEC ==========
# Load the pretrained vectors from the binary word2vec-format file named by W2V_BIN.
print("Loading GoogleNews Word2Vec (.bin)...")
w2v = KeyedVectors.load_word2vec_format(W2V_BIN, binary=True)
# Embedding width reported by the loaded model; it sizes every output vector below.
EMB_DIM = w2v.vector_size
print(f"Word2Vec loaded. dim={EMB_DIM}")


Loading GoogleNews Word2Vec (.bin)...
Word2Vec loaded. dim=300


In [15]:
# ========== PREPROCESSING FOR PATH B ==========
def preprocess_for_dualpath(text: str):
    """
    Tokenize one question and return two parallel token lists.

    Parameters
    ----------
    text : str
        Raw question text. A missing value (None, NaN, pd.NA) is treated as
        an empty question instead of being stringified into the word "nan".

    Returns
    -------
    (stems_for_weights, lemmas_for_w2v) : tuple of two lists of str
        - stems_for_weights: Porter stems of the lowercased tokens, used to
          look up weights in the stem-named feature columns.
        - lemmas_for_w2v: spaCy lemmas taken from the text without any
          lower() call, used for the case-sensitive Word2Vec lookup.
        Both lists are built from the same filtered tokens, so position k in
        one corresponds to position k in the other.
    """
    # A missing cell has no tokens; str(NaN) would otherwise yield the token "nan".
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return [], []

    # Replace everything that is not an ASCII letter or whitespace with a space.
    # Deliberately no lower() here: casing is preserved for the Word2Vec lookup.
    clean = re.sub(r"[^A-Za-z\s]", " ", str(text))
    # Tokens are purely alphabetic by construction; stopwords are NOT removed.
    toks = clean.split()
    if not toks:
        # Nothing left to parse, so skip the spaCy call entirely.
        return [], []

    doc = nlp(" ".join(toks))

    # Stems are lowercased first so they match the lowercase feature column names.
    stems = [ps.stem(t.text.lower()) for t in doc if t.is_alpha]
    # Lemmas are used as-is for the case-sensitive Word2Vec vocabulary.
    lemmas = [t.lemma_ for t in doc if t.is_alpha]
    return stems, lemmas

# Sanity check on a single row: show the first 10 stems and lemmas of the first question.
stems_sample, lemmas_sample = preprocess_for_dualpath(df["soal"].iloc[0])
stems_sample[:10], lemmas_sample[:10]


(['about', 'what', 'proport', 'of', 'the', 'popul', 'of', 'the', 'us', 'is'],
 ['about',
  'what',
  'proportion',
  'of',
  'the',
  'population',
  'of',
  'the',
  'US',
  'be'])

In [16]:
# ========== COMBINATION ==========
# Build one EMB_DIM-sized vector per question. For every stem that is a feature
# column with a non-zero weight in that row, average the Word2Vec vectors of the
# lemmas that produced the stem, scale the average by the weight, and sum.
final_mat = np.zeros((len(df), EMB_DIM), dtype=np.float32)

# enumerate() supplies the positional row number for final_mat. The label that
# iterrows() yields only equals the position when df has a default RangeIndex,
# so it must not be used to index the NumPy matrix.
for i, (_label, row) in enumerate(df.iterrows()):
    stems, lemmas = preprocess_for_dualpath(row["soal"])
    if not stems:
        continue  # no usable tokens: the row stays all-zero

    acc = np.zeros((EMB_DIM,), dtype=np.float32)

    # Group lemmas by stem, keeping only stems that are feature columns.
    stem2lemmas = {}
    for s, l in zip(stems, lemmas):
        if s in feat_set:
            stem2lemmas.setdefault(s, []).append(l)

    for s, Ls in stem2lemmas.items():
        # A missing weight counts as zero; zero-weight stems contribute nothing.
        w = float(row[s]) if pd.notna(row[s]) else 0.0
        if w == 0.0:
            continue

        # Representative vector for stem s = mean of the vectors of its lemmas
        # that exist in the Word2Vec vocabulary.
        vecs = [w2v[l] for l in Ls if l in w2v.key_to_index]
        if not vecs:
            continue
        stem_vec = np.mean(vecs, axis=0, dtype=np.float32)

        acc += w * stem_vec

    final_mat[i] = acc

final_mat.shape


(600, 300)

In [17]:
# ========== SAVE RESULTS ==========
# Wrap the embedding matrix in a frame with one named column per dimension.
final_feat = pd.DataFrame(
    final_mat,
    columns=[f"final_dim_{i}" for i in range(EMB_DIM)],
)

# Put the metadata columns (when present) in front of the embedding columns.
if meta_cols:
    out_df = pd.concat([df[meta_cols].reset_index(drop=True), final_feat], axis=1)
else:
    out_df = final_feat

out_df.to_csv(OUT_CSV, index=False)

out_df.head()


Unnamed: 0,soal,label,final_dim_0,final_dim_1,final_dim_2,final_dim_3,final_dim_4,final_dim_5,final_dim_6,final_dim_7,...,final_dim_290,final_dim_291,final_dim_292,final_dim_293,final_dim_294,final_dim_295,final_dim_296,final_dim_297,final_dim_298,final_dim_299
0,About what proportion of the population of the...,Knowledge,0.11191,-0.125665,0.040693,0.105511,-0.176248,-0.139479,-0.061115,-0.144234,...,-0.137788,0.098694,-0.104026,-0.031107,-0.016135,-0.029388,-0.206583,-0.038634,0.411892,-0.330857
1,Correctly label the brain lobes indicated on t...,Knowledge,-0.158814,-0.180729,-0.172508,0.141274,-0.416018,0.01752,0.031361,-0.209833,...,-0.020739,0.071669,0.164683,-0.180633,0.141969,-0.236496,0.076394,-0.346795,-0.039931,-0.070151
2,Define compound interest.,Knowledge,0.031325,0.070836,0.006008,0.139391,-0.099626,-0.057863,0.083762,-0.447282,...,-0.097697,0.089579,-0.111823,-0.21517,-0.273833,-0.020077,0.003684,0.077878,-0.117951,-0.143707
3,Define four types of traceability.,Knowledge,-0.212526,-0.098651,-0.058775,0.22912,-0.097411,0.052221,0.163444,-0.205331,...,-0.208911,0.110989,-0.158474,0.132877,-0.03269,-0.059411,0.26404,-0.18639,0.181119,-0.336466
4,Define mercantilism.,Knowledge,0.044627,-0.125948,-0.130819,0.465302,-0.083729,0.095738,0.093815,-0.059816,...,-0.240597,0.062159,0.134576,-0.151465,-0.182574,-0.230233,0.305019,0.159757,0.169673,-0.085027


In [18]:
# ========== SUMMARY ==========
# Emit the run report with a single print call; sep="\n" puts each entry on its own line.
print(
    "\nSelesai: TFPOS-IDF (STEM dari CSV) × Word2Vec (LEMMA, case-sensitive, tanpa stopword).",
    f"Input TFPOS-IDF : {Path(TFPOS_CSV).resolve()}",
    f"Model W2V (.bin): {Path(W2V_BIN).resolve()}",
    f"Output          : {Path(OUT_CSV).resolve()}",
    f"Jumlah dokumen  : {out_df.shape[0]}",
    f"Dimensi vektor  : {EMB_DIM} (final_dim_*)",
    f"Total kolom     : {out_df.shape[1]} (termasuk meta: {meta_cols})",
    sep="\n",
)



Selesai: TFPOS-IDF (STEM dari CSV) × Word2Vec (LEMMA, case-sensitive, tanpa stopword).
Input TFPOS-IDF : C:\Users\Marcell\Documents\Skripsi\NEW PART\testing1 - Copy\yahya_tfpos_words.csv
Model W2V (.bin): C:\Users\Marcell\Documents\Skripsi\NEW PART\testing1 - Copy\GoogleNews-vectors-negative300.bin
Output          : C:\Users\Marcell\Documents\Skripsi\NEW PART\testing1 - Copy\yahya_tfpos_words_x_w2v.csv
Jumlah dokumen  : 600
Dimensi vektor  : 300 (final_dim_*)
Total kolom     : 302 (termasuk meta: ['soal', 'label'])
