In [1]:
import re
import numpy as np
import pandas as pd
from pathlib import Path
from gensim.models import KeyedVectors
import nltk, spacy
from nltk.stem import PorterStemmer

In [2]:
# ========== CONFIGURATION ==========
# Input: TFPOS-IDF matrix as CSV. Per the original note the feature columns are
# stems and the values are "L2" (presumably L2-normalised - confirm upstream).
TFPOS_CSV = "collected_tfpos_words.csv"
# Input: pretrained GoogleNews Word2Vec vectors in binary format.
W2V_BIN = "GoogleNews-vectors-negative300.bin"
# Output: CSV that receives the combined TFPOS-IDF x Word2Vec features.
OUT_CSV = "collected_tfpos_x_w2v.csv"

In [3]:
# ========== SETUP ==========
# Fetch the NLTK "punkt" resource quietly (no progress output).
nltk.download("punkt", quiet=True)

# Porter stemmer: its stems are what get matched against the CSV feature columns.
ps = PorterStemmer()

# spaCy English pipeline with the "ner" component disabled; it is used for
# tokenisation and lemmas (POS + lemma).
nlp = spacy.load(
    "en_core_web_sm",
    disable=["ner"],
)


In [4]:
# ========== LOAD TFPOS-IDF ==========
df = pd.read_csv(TFPOS_CSV)

# Metadata columns are "soal" (question text) and "label" when present;
# every other column is treated as a feature (stem) column.
meta_cols = [name for name in ("soal", "label") if name in df.columns]
feat_cols = [name for name in df.columns if name not in meta_cols]
# Set form of the feature names: weights may only be read from feature columns.
feat_set = set(feat_cols)

# Fail early when the CSV cannot drive the rest of the notebook.
if len(feat_cols) == 0:
    raise ValueError("Tidak ditemukan kolom fitur (STEM) pada CSV TFPOS-IDF.")
if "soal" not in meta_cols:
    raise ValueError("CSV harus punya kolom 'soal' untuk membuat lemma (Path B).")

df.head()


Unnamed: 0,soal,label,abstract,accept,accord,action,add,address,adopt,advertis,...,well,wolf,women,word,work,would,write,x,you,your
0,List two reference parameters in the setHour f...,Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"""Explain briefly the meaning of the following ...",Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Label the parts of the diagram,Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"""Based on the above dataType class, list all t...",Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Define morphology,Knowledge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# ========== LOAD WORD2VEC ==========
# Read the pretrained vectors from the binary word2vec file and remember their
# dimensionality for the cells below.
print("Loading GoogleNews Word2Vec (.bin)...")
w2v = KeyedVectors.load_word2vec_format(
    W2V_BIN,
    binary=True,
)
EMB_DIM = w2v.vector_size
print(f"Word2Vec loaded. dim={EMB_DIM}")


Loading GoogleNews Word2Vec (.bin)...
Word2Vec loaded. dim=300


In [6]:
# ========== PREPROCESSING FOR PATH B ==========
def preprocess_for_dualpath(text: str):
    """
    Turn one question into two aligned token lists.

    Args:
        text: raw question text; it is passed through ``str()`` first.

    Returns:
        ``(stems, lemmas)`` - two lists of equal length, one entry per
        alphabetic spaCy token:
          - stems : Porter stem of the lower-cased token text, used to look up
                    weights in the TFPOS-IDF feature columns (which, per the
                    original note, were built from lower-cased text).
          - lemmas: spaCy lemma of the token with no explicit lower-casing,
                    used for the case-sensitive Word2Vec lookup.
    """
    # Replace everything that is not an ASCII letter or whitespace with a space.
    # Case is left untouched here on purpose (needed for the Word2Vec lookup).
    letters_only = re.sub(r"[^A-Za-z\s]", " ", str(text))
    # Stop words are NOT removed.
    words = [piece for piece in letters_only.split() if piece.isalpha()]
    parsed = nlp(" ".join(words))

    stems, lemmas = [], []
    for token in parsed:
        if not token.is_alpha:
            continue
        # Lower-case before stemming so the stem matches the feature column names.
        stems.append(ps.stem(token.text.lower()))
        lemmas.append(token.lemma_)
    return stems, lemmas

# Quick check on one row: first ten stems and lemmas of the first question.
stems_sample, lemmas_sample = preprocess_for_dualpath(df.iloc[0]["soal"])
(stems_sample[:10], lemmas_sample[:10])


(['list', 'two', 'refer', 'paramet', 'in', 'the', 'sethour', 'function'],
 ['list', 'two', 'reference', 'parameter', 'in', 'the', 'sethour', 'function'])

In [7]:
# ========== COMBINATION ==========
# Build one EMB_DIM-sized vector per question: the sum, over the feature stems
# occurring in the question, of
#     (TFPOS-IDF weight of the stem) * (mean Word2Vec vector of the lemmas
#                                       that produced that stem).
# Rows without any usable term keep an all-zero vector.
final_mat = np.zeros((len(df), EMB_DIM), dtype=np.float32)

# enumerate() supplies the row *position*. The index label yielded by
# iterrows() is ignored so the write into final_mat stays correct even when
# df's index is not 0..n-1 (e.g. after filtering, shuffling or re-indexing).
for i, (_, row) in enumerate(df.iterrows()):
    stems, lemmas = preprocess_for_dualpath(row["soal"])
    if not stems:
        continue

    acc = np.zeros((EMB_DIM,), dtype=np.float32)

    # Group lemmas by stem, keeping only stems that are feature columns.
    stem2lemmas = {}
    for s, l in zip(stems, lemmas):
        if s in feat_set:
            stem2lemmas.setdefault(s, []).append(l)

    for s, Ls in stem2lemmas.items():
        # A missing (NaN) weight counts as zero; zero-weight stems add nothing.
        w = float(row[s]) if pd.notna(row[s]) else 0.0
        if w == 0.0:
            continue

        # Representative vector for stem s = mean of the vectors of those of
        # its lemmas that exist in the Word2Vec vocabulary.
        vecs = [w2v[l] for l in Ls if l in w2v.key_to_index]
        if not vecs:
            continue
        stem_vec = np.mean(vecs, axis=0, dtype=np.float32)

        acc += w * stem_vec

    final_mat[i] = acc

final_mat.shape


(141, 300)

In [8]:
# ========== SAVE RESULT ==========
# Wrap the matrix in a frame whose columns are final_dim_0 .. final_dim_{EMB_DIM-1}.
final_feat = pd.DataFrame(
    final_mat,
    columns=[f"final_dim_{k}" for k in range(EMB_DIM)],
)

# Put the metadata columns (if any) in front of the embedding columns. The
# metadata index is reset so both frames align on 0..n-1 when concatenated.
if meta_cols:
    out_df = pd.concat([df[meta_cols].reset_index(drop=True), final_feat], axis=1)
else:
    out_df = final_feat

out_df.to_csv(OUT_CSV, index=False)

out_df.head()


Unnamed: 0,soal,label,final_dim_0,final_dim_1,final_dim_2,final_dim_3,final_dim_4,final_dim_5,final_dim_6,final_dim_7,...,final_dim_290,final_dim_291,final_dim_292,final_dim_293,final_dim_294,final_dim_295,final_dim_296,final_dim_297,final_dim_298,final_dim_299
0,List two reference parameters in the setHour f...,Knowledge,-0.012698,0.024411,0.030172,0.150096,-0.164437,-0.17569,-0.002758,-0.186234,...,0.124501,0.274607,-0.103673,-0.02314,0.058509,-0.077336,-0.27481,0.019185,-0.271133,-0.033751
1,"""Explain briefly the meaning of the following ...",Knowledge,0.143909,-0.043262,-0.194532,0.069309,0.060251,-0.210685,0.357005,-0.34867,...,-0.321069,0.16524,-0.272739,-0.056932,0.166543,-0.240697,0.153198,-0.155883,0.30122,-0.0153
2,Label the parts of the diagram,Knowledge,0.006664,-0.151776,0.017053,0.148326,-0.264359,-0.020119,-0.082583,-0.210287,...,-0.236564,0.017802,-0.087913,-0.103051,-0.207167,-0.098627,-0.010768,-0.06784,-0.027426,-0.061461
3,"""Based on the above dataType class, list all t...",Knowledge,0.024061,0.080847,0.098576,0.151878,-0.420657,-0.193058,-0.074147,-0.35536,...,0.142138,0.001338,-0.266019,-0.140756,0.071725,0.031577,-0.161328,0.222009,0.022842,-0.100767
4,Define morphology,Knowledge,-0.08756,0.076639,-0.022108,0.142363,-0.283068,-0.185488,0.098833,-0.447847,...,-0.128331,0.250379,-0.013811,-0.175989,-0.307885,-0.048754,0.103221,-0.079569,0.167235,-0.136869


In [9]:
# ========== SUMMARY ==========
# Report the resolved input/output paths and the shape of the result, one
# item per line.
print(
    "\nSelesai: TFPOS-IDF × Word2Vec.",
    f"Input TFPOS-IDF : {Path(TFPOS_CSV).resolve()}",
    f"Model W2V (.bin): {Path(W2V_BIN).resolve()}",
    f"Output          : {Path(OUT_CSV).resolve()}",
    f"Jumlah dokumen  : {out_df.shape[0]}",
    f"Dimensi vektor  : {EMB_DIM} (final_dim_*)",
    f"Total kolom     : {out_df.shape[1]} (termasuk meta: {meta_cols})",
    sep="\n",
)



Selesai: TFPOS-IDF × Word2Vec.
Input TFPOS-IDF : C:\Users\Marcell\Documents\Skripsi\NEW PART\testing1 - Copy\collected_tfpos_words.csv
Model W2V (.bin): C:\Users\Marcell\Documents\Skripsi\NEW PART\testing1 - Copy\GoogleNews-vectors-negative300.bin
Output          : C:\Users\Marcell\Documents\Skripsi\NEW PART\testing1 - Copy\collected_tfpos_x_w2v.csv
Jumlah dokumen  : 141
Dimensi vektor  : 300 (final_dim_*)
Total kolom     : 302 (termasuk meta: ['soal', 'label'])
