In [1]:
import re
import json
import numpy as np
import pandas as pd
import spacy, nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize


In [2]:
# ===== SETUP =====
# Fetch the NLTK stop-word corpus (a no-op message is printed when it is already present).
nltk.download("stopwords")

# Load the small English spaCy model with the NER component disabled;
# POS tags and sentence boundaries are sufficient for this notebook.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# English stop words, minus a handful of words that are deliberately kept.
exceptions = {"your", "own", "how", "you"}
stop_words = set(stopwords.words("english")) - exceptions


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marcell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# ===== BT VERB LIST =====
# Verbs grouped by level name; a verb may appear under more than one level.
bt_verb_list = {
    "Knowledge": [
        "define", "duplicate", "list", "memorize", "recall", "repeat",
        "state", "identify", "recognize",
    ],
    "Comprehension": [
        "classify", "describe", "discuss", "explain", "identify", "locate",
        "recognize", "report", "select", "translate", "summarize", "interpret",
    ],
    "Application": [
        "apply", "choose", "demonstrate", "illustrate", "interpret", "operate",
        "schedule", "sketch", "solve", "use", "implement",
    ],
    "Analysis": [
        "analyze", "compare", "contrast", "differentiate", "discriminate",
        "distinguish", "examine", "experiment", "question", "test", "investigate",
    ],
    "Evaluation": [
        "appraise", "argue", "assess", "choose", "defend", "estimate",
        "evaluate", "judge", "justify", "rate", "support", "value", "critique",
    ],
    "Synthesis": [
        "assemble", "construct", "create", "design", "develop", "formulate",
        "write", "invent", "generate", "plan", "produce", "compose",
    ],
}

# Flat set of every listed verb (duplicates across levels collapse), used for membership tests.
ALL_BT = {verb for verbs in bt_verb_list.values() for verb in verbs}


In [4]:
# ===== LOAD DATASET =====
# Read the CSV, drop rows whose "soal" column is missing, and renumber the index from 0.
df = (
    pd.read_csv("yahya_et_al_dataset.csv")
    .dropna(subset=["soal"])
    .reset_index(drop=True)
)

In [5]:
# ===== PIPELINE =====
def preprocess_pipeline(text: str, log=False):
    """Clean, tag and POS-weight a single text for ETFPOS feature extraction.

    Stages: (1) replace every character that is not an ASCII letter or
    whitespace with a space, (2) lowercase, (3) tokenize and (4) POS-tag with
    the module-level spaCy pipeline ``nlp``, (5) flag BT keywords by position,
    (6) drop stop words, (7) lemmatize, then assign a weight to each kept token.

    A token is flagged as a BT keyword when its lemma is in ``ALL_BT`` and it
    is the first alphabetic token of its sentence, or the preceding alphabetic
    token is "and" or an adverb ending in "ly".

    Parameters
    ----------
    text : str
        Input text; it is passed through ``str()`` before cleaning.
    log : bool, default False
        When True, a trace line is appended to ``logs`` for every stage.

    Returns
    -------
    weighted : list of (str, float)
        ``(lemma, weight)`` for every kept token, in document order.
    lemmas : list of str
        Lowercased lemmas of the kept tokens, in document order.
    logs : list of str
        Stage traces; empty when ``log`` is False.
    """
    logs = []

    # (1) Cleaning + (2) Lowercase
    clean = re.sub(r"[^A-Za-z\s]", " ", str(text))
    lower = clean.lower()

    # (3) Tokenization (spaCy)  &  (4) POS Tagging
    doc = nlp(lower)
    if log:
        tokens_raw = [t.text for t in doc if t.is_alpha]
        logs.append(f"[tokenize] -> {tokens_raw}")
        logs.append(f"[pos] -> {[(t.text, t.pos_) for t in doc if t.is_alpha]}")

    # (5) Identification of BT keywords (POSITION-based).
    # Positions are counted over alphabetic tokens only. The cleaning step
    # turns digits/punctuation into spaces, and spaCy can emit whitespace-only
    # tokens for such runs; counting those would push a leading keyword off
    # index 0 (e.g. "1. Define ...") or hide an "and"/adverb predecessor.
    bt_flags = {}  # spaCy token index -> True/False
    for sent in doc.sents:
        prev_tok = None  # previous alphabetic token within this sentence
        for tok in sent:
            if not tok.is_alpha:
                continue

            is_candidate = (
                prev_tok is None
                or prev_tok.lower_ == "and"
                or (prev_tok.pos_ == "ADV" and prev_tok.text.endswith("ly"))
            )
            bt_flags[tok.i] = bool(is_candidate and tok.lemma_.lower() in ALL_BT)
            prev_tok = tok

    if log:
        logs.append(f"[bt-ident] -> {[(t.text, bt_flags.get(t.i, False)) for t in doc if t.is_alpha]}")

    # (6) Stop-word removal (done after identification so BT positions are not disturbed)
    kept = [t for t in doc if t.is_alpha and t.text not in stop_words]
    if log:
        logs.append(f"[stopwords] -> {[t.text for t in kept]} (exceptions={sorted(exceptions)})")

    # (7) Lemmatization (on the kept tokens)
    lemmas = [t.lemma_.lower() for t in kept]
    if log:
        logs.append(f"[lemmatize] -> {lemmas}")

    # Feature set (unigram) + proposed weighting (the POS part of ETFPOS):
    # verb flagged as BT keyword -> 5, other verb -> 3, noun/proper noun/adjective -> 2, else 1.
    # NOTE(review): the 5.0 weight is only reachable when spaCy tags the token
    # as VERB; a flagged keyword with another tag falls into the lower tiers.
    weighted = []
    detailed = []
    for t in kept:
        lemma = t.lemma_.lower()
        is_bt = bt_flags.get(t.i, False)

        if t.pos_ == "VERB":
            w = 5.0 if is_bt else 3.0
        elif t.pos_ in ("NOUN", "PROPN", "ADJ"):
            w = 2.0
        else:
            w = 1.0

        weighted.append((lemma, w))
        if log:
            detailed.append((t.text, t.pos_, "BT" if is_bt else "non-BT", w))

    if log:
        logs.append(f"[pos-weight] -> {detailed}")
    return weighted, lemmas, logs


In [6]:
# ===== PROCESS ALL DOCUMENTS =====
# Run the pipeline once per entry of the "soal" column (logging off) and keep
# the weighted tokens and the lemmas; the log output is discarded.
pipeline_out = [preprocess_pipeline(question, log=False) for question in df["soal"]]
weighted_docs = [result[0] for result in pipeline_out]
lemma_docs = [result[1] for result in pipeline_out]

df["weighted_tokens"] = weighted_docs
df["lemmas"] = lemma_docs


In [7]:
# ===== IDF (based on unigram lemmas) =====
# Sorted vocabulary of every distinct lemma, and each term's column position.
vocab = sorted({lemma for lemma_list in df["lemmas"] for lemma in lemma_list})
idx = dict(zip(vocab, range(len(vocab))))

def compute_idf(list_of_docs):
    """Compute a smoothed inverse document frequency for each term.

    Parameters
    ----------
    list_of_docs : sequence of iterables of hashable terms
        One iterable of terms per document.

    Returns
    -------
    dict
        Maps each term to ``1 + log((N + 1) / (df + 1))``, where ``N`` is the
        number of documents and ``df`` the number of documents containing the
        term. Empty when no document contains any term.
    """
    n_docs = len(list_of_docs)

    # Document frequency: each term counts at most once per document.
    doc_freq = {}
    for tokens in list_of_docs:
        for term in set(tokens):
            if term in doc_freq:
                doc_freq[term] += 1
            else:
                doc_freq[term] = 1

    # The +1 in numerator and denominator keeps the ratio stable.
    idf_values = {}
    for term, freq in doc_freq.items():
        idf_values[term] = 1.0 + np.log((n_docs + 1) / (freq + 1))
    return idf_values

# IDF lookup (term -> value) computed over the per-document lemma lists.
idf = compute_idf(list(df["lemmas"]))


In [8]:
# ===== ETFPOS-IDF vektor per dokumen =====
def etfposidf_vector(weighted_tokens):
    """Build the ETFPOS-IDF vector of one document over the global vocabulary.

    ETFPOS(t, q) is the summed POS weight of term ``t`` divided by the summed
    POS weight of all tokens in the document; it is then multiplied by the
    term's IDF (1.0 when the term has no IDF entry).

    Parameters
    ----------
    weighted_tokens : iterable of (term, weight)
        One pair per token occurrence.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocab)``; all zeros when the total weight
        is 0. Terms missing from ``idx`` are skipped.
    """
    # Accumulate the per-term weight and the document-wide total in one pass.
    term_weight = {}
    weight_sum = 0.0
    for term, pos_w in weighted_tokens:
        weight_sum += pos_w
        term_weight[term] = term_weight.get(term, 0.0) + pos_w

    features = np.zeros(len(vocab), dtype=float)
    if weight_sum != 0:
        for term, summed in term_weight.items():
            col = idx.get(term)
            if col is not None:
                # ETFPOS(t, q) x IDF -> ETFPOS-IDF
                features[col] = (summed / weight_sum) * idf.get(term, 1.0)
    return features

# One ETFPOS-IDF vector per document, stored alongside its source row.
df["vec"] = list(map(etfposidf_vector, df["weighted_tokens"]))


In [9]:
# ===== MATRIX + L2 NORMALIZATION (per the paper's formula) =====
# Stack the per-document vectors; with no documents, use an empty (0, |vocab|) matrix.
if len(df):
    X = np.vstack(df["vec"].values)
else:
    X = np.zeros((0, len(vocab)))

# Normalized ETFPOS-IDF; a matrix without rows is passed through unchanged.
if X.shape[0] > 0:
    X_norm = normalize(X, norm="l2")
else:
    X_norm = X


In [10]:
# ===== SAVE =====
# Metadata columns are kept only if present and are placed before the features.
# NOTE(review): a vocabulary term equal to a metadata column name (e.g. "label")
# would produce duplicate column names in the output — confirm this cannot occur.
meta_cols = [col for col in ("soal", "label") if col in df.columns]
feat_df = pd.DataFrame(X_norm, columns=vocab)
final_df = pd.concat([df[meta_cols].reset_index(drop=True), feat_df], axis=1)

out_path = "yahya_etfposidf.csv"
final_df.to_csv(out_path, index=False)

# ===== SUMMARY =====
for summary_line in (
    "Selesai membentuk fitur ETFPOS-IDF.",
    f"File: {out_path}",
    f"Jumlah dokumen (baris): {final_df.shape[0]}",
    f"Jumlah fitur (unigram): {len(vocab)}",
    f"Total kolom (termasuk meta): {final_df.shape[1]}",
):
    print(summary_line)


Selesai membentuk fitur ETFPOS-IDF.
File: yahya_etfposidf.csv
Jumlah dokumen (baris): 600
Jumlah fitur (unigram): 1436
Total kolom (termasuk meta): 1438
