In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from gensim.models import KeyedVectors

In [2]:
# === SETUP ===
# nltk.word_tokenize needs the Punkt sentence-tokenizer data. NLTK >= 3.9
# loads it from the "punkt_tab" resource and raises LookupError when only the
# legacy "punkt" package is installed, so fetch both to work on old and new
# NLTK releases. "punkt" stays last so the cell's displayed value is unchanged.
nltk.download("punkt_tab")
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Marcell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# === LOAD DATASET ===
# Read the CSV and, in the same expression, drop rows whose "soal" value is missing.
df = pd.read_csv("yahya_et_al_dataset.csv").dropna(subset=["soal"])

In [4]:
# === PREPROCESSING ===
def preprocess_no_lowercase(text: str):
    """Clean and tokenize ``text`` while keeping its original letter case.

    Every character that is not an ASCII letter or whitespace is replaced by
    a space, the cleaned string is tokenized with ``nltk.word_tokenize``, and
    tokens that are empty after stripping whitespace are discarded.

    Args:
        text: The raw string to process.

    Returns:
        A list of token strings, in order of appearance.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", text)
    return [tok for tok in nltk.word_tokenize(letters_only) if tok.strip()]

# Tokenize every question; each cell of the new column holds a list of tokens.
df["tokens_w2v"] = df["soal"].map(preprocess_no_lowercase)

In [5]:
# === LOAD WORD2VEC ===
# The vectors are read from a binary word2vec-format file in the working directory.
print("Loading GoogleNews Word2Vec...")
w2v = KeyedVectors.load_word2vec_format(fname="GoogleNews-vectors-negative300.bin", binary=True)
print("Word2Vec loaded.")

Loading GoogleNews Word2Vec...
Word2Vec loaded.


In [6]:
# === FUNCTION: CONVERT TOKENS -> VECTOR (MEAN) ===
# Embedding dimensionality, read from the loaded model; used below as the
# length of the zero-vector fallback and as the number of dim_* columns.
EMB_DIM = w2v.vector_size  

def tokens_to_vec(tokens):
    """Average the embeddings of the tokens found in the model vocabulary.

    Tokens that are not keys of ``w2v.key_to_index`` are skipped.

    Args:
        tokens: Iterable of token strings.

    Returns:
        The element-wise mean of the matching word vectors, or a float32
        zero vector of length ``EMB_DIM`` when no token is in the vocabulary.
    """
    known = [w2v[tok] for tok in tokens if tok in w2v.key_to_index]
    if known:
        return np.mean(known, axis=0)
    # No in-vocabulary token: fall back to an all-zero vector.
    return np.zeros(EMB_DIM, dtype=np.float32)

# One averaged embedding per row, computed from that row's token list.
df["w2v_vector"] = df["tokens_w2v"].map(tokens_to_vec)

In [7]:
# === TURN THE LIST OF VECTORS INTO A DATAFRAME (ONE COLUMN PER DIMENSION) ===
# No index is passed, so the new frame gets a default 0..n-1 index.
w2v_vectors = pd.DataFrame(
    data=df["w2v_vector"].tolist(),
    columns=[f"dim_{i}" for i in range(EMB_DIM)],
)

In [8]:
# === COMBINE WITH THE TEXT / LABEL COLUMNS (WHEN PRESENT) ===
cols_exist = [name for name in ("soal", "label") if name in df.columns]
# df's index is reset before concatenating so its rows pair up with the
# rows of w2v_vectors instead of being aligned on the old index labels.
final_df = pd.concat(
    [df[cols_exist].reset_index(drop=True), w2v_vectors],
    axis=1,
)


In [9]:
# === SAVE RESULTS ===
out_path = "yahya_word2vec.csv"
# index=False keeps the row index out of the written file.
final_df.to_csv(path_or_buf=out_path, index=False)
print("Selesai: fitur Word2Vec (tanpa lowercase) disimpan.")

# === RESULT SUMMARY ===
# shape is (rows, columns): rows are the document vectors, columns are
# everything written to the output file.
num_vectors, total_cols = final_df.shape
num_features = EMB_DIM  # embedding dimensionality (number of features)

print(
    f"\nFile: '{out_path}'",
    f"Jumlah vektor (baris dokumen): {num_vectors}",
    f"Jumlah fitur (dimensi embedding): {num_features}",
    f"Total kolom keseluruhan (termasuk soal & label bila ada): {total_cols}",
    sep="\n",
)


Selesai: fitur Word2Vec (tanpa lowercase) disimpan.

File: 'yahya_word2vec.csv'
Jumlah vektor (baris dokumen): 600
Jumlah fitur (dimensi embedding): 300
Total kolom keseluruhan (termasuk soal & label bila ada): 302
