In [1]:
import re
import numpy as np
import pandas as pd
import nltk, spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import normalize


In [2]:
# === SETUP ===
nltk.download("stopwords")

# English stopwords, minus a few words that are deliberately kept as tokens
exceptions = {"your", "own", "how", "you"}
stop_words = set(stopwords.words("english")) - exceptions

# Only POS tags are needed, so the NER component is disabled
nlp = spacy.load("en_core_web_sm", disable=["ner"])
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marcell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# === LOAD DATASET ===
# Expected columns: 'soal' (+ optional 'label'); rows without 'soal' are dropped
df = (
    pd.read_csv("collected_dataset.csv")
    .dropna(subset=["soal"])
    .reset_index(drop=True)
)

df.head()


Unnamed: 0,soal,label
0,List two reference parameters in the setHour f...,Knowledge
1,"""Explain briefly the meaning of the following ...",Knowledge
2,Label the parts of the diagram,Knowledge
3,"""Based on the above dataType class, list all t...",Knowledge
4,Define morphology,Knowledge


In [4]:
# === PREPROCESS ===
def preprocess_for_tfpos(text: str):
    """Convert one question into parallel lists of stems and POS tags.

    The text is coerced to ``str``; every character that is not an ASCII
    letter or whitespace is replaced by a space and the result is lowercased.
    Words found in the module-level ``stop_words`` set are dropped, the
    remaining words are re-joined and run through the spaCy pipeline ``nlp``,
    and each alphabetic token is stemmed with ``ps``.

    Returns:
        A ``(stems, pos_tags)`` tuple of two equally long lists; position i of
        ``pos_tags`` is the ``pos_`` value of the token that produced stem i.
    """
    # Normalise: strip non-letters, then lowercase
    lowered = re.sub(r"[^A-Za-z\s]", " ", str(text)).lower()

    # Selective stopword removal (non-alphabetic leftovers are skipped too)
    kept_words = []
    for word in lowered.split():
        if not word.isalpha() or word in stop_words:
            continue
        kept_words.append(word)

    # NOTE(review): the tagger sees lowercased text with stopwords already
    # removed rather than the original sentence, so tags may differ from
    # full-sentence tagging — confirm this is intended.
    stems, pos_tags = [], []
    for token in nlp(" ".join(kept_words)):
        if token.is_alpha:
            stems.append(ps.stem(token.text))
            pos_tags.append(token.pos_)  # VERB, NOUN, ADJ, PROPN, etc.
    return stems, pos_tags

# Preprocess every question once, then split the (stems, pos_tags) pairs
# into two separate columns.
processed = df["soal"].apply(preprocess_for_tfpos)
df["stems"] = processed.map(lambda pair: pair[0])
df["pos_tags"] = processed.map(lambda pair: pair[1])

df[["soal", "stems", "pos_tags"]].head()


Unnamed: 0,soal,stems,pos_tags
0,List two reference parameters in the setHour f...,"[list, two, refer, paramet, sethour, function]","[VERB, NUM, NOUN, NOUN, ADJ, NOUN]"
1,"""Explain briefly the meaning of the following ...","[explain, briefli, mean, follow, term, demogra...","[VERB, ADV, NOUN, VERB, NOUN, NOUN, NOUN, NOUN..."
2,Label the parts of the diagram,"[label, part, diagram]","[PROPN, NOUN, NOUN]"
3,"""Based on the above dataType class, list all t...","[base, datatyp, class, list, function, member]","[VERB, NOUN, NOUN, NOUN, NOUN, NOUN]"
4,Define morphology,"[defin, morpholog]","[VERB, NOUN]"


In [5]:
# === BUILD VOCABULARY (stem-based) ===
# Alphabetically sorted unique stems; idx maps each stem to its column position
vocab = sorted({stem for stem_list in df["stems"] for stem in stem_list})
idx = dict(zip(vocab, range(len(vocab))))

len(vocab), vocab[:20]  # check the vocabulary size and a sample of entries


(390,
 ['abstract',
  'accept',
  'accord',
  'action',
  'add',
  'address',
  'adopt',
  'advertis',
  'advic',
  'advis',
  'affect',
  'air',
  'along',
  'alpha',
  'altern',
  'analysi',
  'analyz',
  'anim',
  'anoth',
  'answer'])

In [6]:
# === IDF: 1 + log(D / df) ===
def compute_idf(list_of_docs):
    """Compute an IDF weight for every term that occurs in the corpus.

    Args:
        list_of_docs: sized sequence of token lists, one list per document.

    Returns:
        dict mapping each term to ``1 + ln(D / df)``, where ``D`` is the number
        of documents and ``df`` is the number of documents containing the
        term. An empty corpus yields an empty dict.
    """
    D = len(list_of_docs)
    df_count = {}
    for doc in list_of_docs:
        # set() so a term repeated inside one document is counted once
        for w in set(doc):
            df_count[w] = df_count.get(w, 0) + 1
    # Every key was incremented at least once, so df >= 1 and the division
    # needs no zero guard.
    return {w: 1.0 + np.log(D / n_docs) for w, n_docs in df_count.items()}

# Fit the IDF table on the stemmed corpus
idf = compute_idf(list(df["stems"]))

len(idf)


390

In [7]:
# === TFPOS-IDF ===
def tfposidf_vector(stems, pos_tags, *, w_verb=5.0, w_content=3.0, w_other=1.0):
    """Build the TFPOS-IDF vector of one document over the global vocabulary.

    Each occurrence of a stem contributes a weight chosen by its POS tag
    (VERB > NOUN/ADJ/PROPN > everything else). A stem's summed weight is
    divided by the document's total weight and multiplied by the stem's IDF.

    Args:
        stems: stems of the document.
        pos_tags: POS tag of each stem, paired with ``stems`` by position.
        w_verb: weight of a VERB occurrence (default 5.0).
        w_content: weight of a NOUN, ADJ or PROPN occurrence (default 3.0).
        w_other: weight of any other tag (default 1.0).

    Returns:
        1-D float array of length ``len(vocab)``; all zeros when the document
        has no weighted tokens. Reads the module-level ``vocab``, ``idx`` and
        ``idf``.
    """
    pos_weight = {
        "VERB": w_verb,
        "NOUN": w_content,
        "ADJ": w_content,
        "PROPN": w_content,
    }

    counts, total_w = {}, 0.0
    for stem, pos in zip(stems, pos_tags):
        wp = pos_weight.get(pos, w_other)
        counts[stem] = counts.get(stem, 0.0) + wp
        total_w += wp

    vec = np.zeros(len(vocab), dtype=float)
    if total_w == 0:
        return vec

    for stem, cw in counts.items():
        j = idx.get(stem)
        if j is None:
            # Stem is not in the vocabulary: it has no column, but its weight
            # still counts toward total_w.
            continue
        tfpos = cw / total_w  # within-document normalisation
        vec[j] = tfpos * idf.get(stem, 1.0)
    return vec

# One TFPOS-IDF vector per question, stored as a column of arrays
df["tfposidf_vec"] = list(map(tfposidf_vector, df["stems"], df["pos_tags"]))

df["tfposidf_vec"].head()


0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: tfposidf_vec, dtype: object

In [8]:
# === MATRIX & L2 NORMALIZATION (optional) ===
# Stack the per-row vectors into a (documents x vocabulary) matrix
if len(df) > 0:
    X = np.vstack(df["tfposidf_vec"].values)
else:
    X = np.zeros((0, len(vocab)))

# Replace normalize(...) with X here if L2 scaling is not wanted
X_norm = X if X.shape[0] == 0 else normalize(X, norm="l2")

X_norm.shape


(141, 390)

In [9]:
# === SAVE 1: feature columns = WORDS ===
meta_cols = [c for c in ["soal", "label"] if c in df.columns]
# A stem can be identical to a metadata column name (e.g. the stem "label"
# produced by "Label the parts of the diagram"), which would put two columns
# with the same header into the CSV and make final_words_df["label"] ambiguous.
# Only colliding feature columns get a "_feat" suffix; all other feature
# names are unchanged. Preprocessing strips every non-letter, so a suffixed
# name should not clash with another stem.
word_cols = [f"{w}_feat" if w in meta_cols else w for w in vocab]
feat_words_df = pd.DataFrame(X_norm, columns=word_cols)
final_words_df = pd.concat([df[meta_cols].reset_index(drop=True), feat_words_df], axis=1)
final_words_df.to_csv("collected_tfpos_words.csv", index=False)

# === SAVE 2: feature columns = dim_i ===
feat_dims_df = pd.DataFrame(X_norm, columns=[f"dim_{i}" for i in range(len(vocab))])
final_dims_df = pd.concat([df[meta_cols].reset_index(drop=True), feat_dims_df], axis=1)
final_dims_df.to_csv("collected_tfpos_dims.csv", index=False)

# Dictionary file: line i (0-based) holds the stem behind column dim_i
with open("tfpos_vocab.txt", "w", encoding="utf-8") as f:
    for w in vocab:
        f.write(w + "\n")


In [10]:
# === RESULT SUMMARY ===
# One print call; sep="\n" puts each report line on its own row
print(
    "\nSelesai: TFPOS-IDF",
    f"Dokumen (baris)         : {final_words_df.shape[0]}",
    f"Jumlah fitur (|vocab|)   : {len(vocab)}",
    f"Output fitur = kata      : collected_tfpos_words.csv",
    f"Output fitur = dim_i     : collected_tfpos_dims.csv",
    f"Kamus dim_i -> kata      : tfpos_vocab.txt",
    f"Total kolom (words ver.) : {final_words_df.shape[1]} (termasuk meta: {meta_cols})",
    f"Total kolom (dims ver.)  : {final_dims_df.shape[1]} (termasuk meta: {meta_cols})",
    sep="\n",
)



Selesai: TFPOS-IDF
Dokumen (baris)         : 141
Jumlah fitur (|vocab|)   : 390
Output fitur = kata      : collected_tfpos_words.csv
Output fitur = dim_i     : collected_tfpos_dims.csv
Kamus dim_i -> kata      : tfpos_vocab.txt
Total kolom (words ver.) : 392 (termasuk meta: ['soal', 'label'])
Total kolom (dims ver.)  : 392 (termasuk meta: ['soal', 'label'])
