In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score


In [12]:
# Read the dataset CSV and print how many rows carry each value of "label".
df = pd.read_csv("collected_tfpos_x_w2v.csv")

label_counts = df["label"].value_counts()
print("=== Distribusi Label Asli ===")
print(label_counts)


=== Distribusi Label Asli ===
label
Synthesis        30
Knowledge        26
Evaluation       24
Comprehension    23
Analysis         23
Application      15
Name: count, dtype: int64


In [13]:
# Every column except "soal" and "label" is used as a model feature;
# "label" is the classification target.
feature_frame = df.drop(columns=["soal", "label"])
X = feature_frame.values
y = df["label"]

# Hold out 20% of the rows for testing, stratified on the label, with a fixed
# seed so the partition is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [14]:
# Fit a linear-kernel SVM on the training split (fit() returns the estimator).
model = SVC(kernel="linear", C=1.0, random_state=42).fit(X_train, y_train)

# Predict on both splits so the train and test scores can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Weighted F1-score for each split.
f1_train = f1_score(y_train, y_train_pred, average="weighted")
f1_test = f1_score(y_test, y_test_pred, average="weighted")

print(
    f"F1-score (Train): {f1_train:.3f}\n"
    f"F1-score (Test):  {f1_test:.3f}"
)


F1-score (Train): 1.000
F1-score (Test):  0.848


In [15]:
# Predict the held-out split and compute its accuracy and weighted F1-score.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1_weighted = f1_score(y_test, y_pred, average="weighted")

print(
    f"Accuracy: {accuracy:.3f}\n"
    f"F1-score (weighted): {f1_weighted:.3f}"
)


Accuracy: 0.862
F1-score (weighted): 0.848


In [16]:
# Per-class precision, recall and F1 for the test predictions, three decimals.
report_text = classification_report(y_test, y_pred, digits=3)

print("\n=== Classification Report ===")
print(report_text)



=== Classification Report ===
               precision    recall  f1-score   support

     Analysis      0.800     0.800     0.800         5
  Application      1.000     0.333     0.500         3
Comprehension      1.000     1.000     1.000         5
   Evaluation      0.833     1.000     0.909         5
    Knowledge      0.833     1.000     0.909         5
    Synthesis      0.833     0.833     0.833         6

     accuracy                          0.862        29
    macro avg      0.883     0.828     0.825        29
 weighted avg      0.874     0.862     0.848        29



In [17]:
# One row per test sample: its "soal" value, its true label and the SVM prediction.
results_df = pd.DataFrame(
    {
        "soal": df.loc[y_test.index, "soal"],
        "label_asli": y_test.values,
        "prediksi_svm": y_pred,
    }
)

# "Benar" when the prediction equals the true label, "Salah" otherwise.
label_pairs = zip(results_df["label_asli"], results_df["prediksi_svm"])
results_df["status"] = [
    "Benar" if actual == predicted else "Salah"
    for actual, predicted in label_pairs
]


In [18]:
# Append a one-row summary (correct/incorrect counts, accuracy, weighted F1)
# to the per-sample results and export everything to CSV.
SUMMARY_MARKER = "--- Summary ---"

# This cell reassigns results_df with the summary row appended, so running it
# twice used to stack a second summary row into the frame and the CSV.
# Discard any summary row left by an earlier run before counting and appending.
prediction_rows = results_df[results_df["soal"] != SUMMARY_MARKER]

jumlah_benar = (prediction_rows["status"] == "Benar").sum()
jumlah_salah = (prediction_rows["status"] == "Salah").sum()

summary_row = pd.DataFrame({
    "soal": [SUMMARY_MARKER],
    "label_asli": ["-"],
    "prediksi_svm": ["-"],
    "status": [
        f"Benar: {jumlah_benar}, Salah: {jumlah_salah}, "
        f"Akurasi: {accuracy:.3f}, F1-weighted:  {f1_weighted:.3f}"
    ]
})

# ignore_index=True renumbers the rows 0..n-1, with the summary row last.
results_df = pd.concat([prediction_rows, summary_row], ignore_index=True)

# NOTE(review): the output name says "etfposidf" while the input file read by
# this notebook is "collected_tfpos_x_w2v.csv" -- confirm the name is intended.
results_df.to_csv("hasil_collected_etfposidf_x_w2v_linear.csv", index=False)
print("Hasil prediksi beserta status dan summary sudah disimpan.")


Hasil prediksi beserta status dan summary sudah disimpan.
