In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score
)


In [2]:
# === LOAD DATA ===
# Read the feature table from the CSV file in the working directory.
df = pd.read_csv("collected_etfposidf_x_w2v.csv")

# Count rows per label first, then print the header and the counts.
label_counts = df["label"].value_counts()
print("=== Distribusi Label Asli ===")
print(label_counts, "\n")


=== Distribusi Label Asli ===
label
Synthesis        30
Knowledge        26
Evaluation       24
Comprehension    23
Analysis         23
Application      15
Name: count, dtype: int64 



In [3]:
# === SPLIT FEATURES & LABEL ===
# Everything except the "soal" and "label" columns is used as a feature.
features_df = df.drop(["soal", "label"], axis="columns")
X = features_df.values

# The target keeps its pandas index so rows can be traced back to df later.
y = df.loc[:, "label"]

In [4]:
# === TRAIN-TEST SPLIT ===
# Hold out 20% of the rows for testing, stratifying on the labels and
# fixing the seed so the split is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# === TRAIN SVM MODEL WITH RBF KERNEL ===
# fit() returns the estimator itself, so construction and training are chained.
model = SVC(kernel="rbf", random_state=42).fit(X_train, y_train)

# Predict on the training split first, then on the test split.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

# Weighted F1 for each split, so train and test scores can be compared.
f1_train, f1_test = (
    f1_score(true_labels, predicted_labels, average='weighted')
    for true_labels, predicted_labels in (
        (y_train, y_train_pred),
        (y_test, y_test_pred),
    )
)

print(f"F1-score (Train): {f1_train:.3f}")
print(f"F1-score (Test):  {f1_test:.3f}")


F1-score (Train): 0.954
F1-score (Test):  0.757


In [6]:
# === PREDICTION ===
# The test-set predictions were already computed as y_test_pred when the
# model was trained; reuse them instead of running inference a second time,
# so the metrics below are based on exactly the same predictions.
y_pred = y_test_pred


In [7]:
# === COMPUTE METRICS ===
# Overall accuracy and weighted F1 on the held-out test split.
# zero_division=0 makes the handling of undefined per-class scores explicit:
# they count as 0.0, which matches the default numeric result.
accuracy = accuracy_score(y_test, y_pred)
f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# A class that is never predicted has undefined precision. With the default
# setting the report shows 0.0 for it but emits a warning for each averaging
# pass; zero_division=0 reports the same 0.0 without the warnings.
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=3, zero_division=0))

print(f"Accuracy: {accuracy:.3f}")
print(f"F1-score (weighted): {f1_weighted:.3f}")



=== Classification Report ===
               precision    recall  f1-score   support

     Analysis      0.600     0.600     0.600         5
  Application      0.000     0.000     0.000         3
Comprehension      1.000     0.800     0.889         5
   Evaluation      1.000     1.000     1.000         5
    Knowledge      1.000     1.000     1.000         5
    Synthesis      0.600     1.000     0.750         6

     accuracy                          0.793        29
    macro avg      0.700     0.733     0.706        29
 weighted avg      0.745     0.793     0.757        29

Accuracy: 0.793
F1-score (weighted): 0.757


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# === BUILD DATAFRAME OF CORRECT / INCORRECT PREDICTIONS ===
# One row per test sample: the question text, its true label, and the SVM
# prediction. The question text is looked up in df through the index labels
# that y_test carries.
results_df = pd.DataFrame({
    "soal": df.loc[y_test.index, "soal"],
    "label_asli": y_test.values,
    "prediksi_svm": y_pred
})

# Status column: compare the two label columns in one vectorized step
# instead of a row-wise apply, then map the boolean mask to the labels.
is_correct = results_df["label_asli"] == results_df["prediksi_svm"]
results_df["status"] = is_correct.map({True: "Benar", False: "Salah"})

# Summary counts come straight from the boolean mask.
jumlah_benar = is_correct.sum()
jumlah_salah = (~is_correct).sum()

# Single trailing row that carries the summary text in the "status" column;
# the other columns hold placeholders.
summary_row = pd.DataFrame({
    "soal": ["--- Summary ---"],
    "label_asli": ["-"],
    "prediksi_svm": ["-"],
    "status": [
        f"Benar: {jumlah_benar}, Salah: {jumlah_salah}, "
        f"Akurasi: {accuracy:.3f}, F1-weighted: {f1_weighted:.3f}"
    ]
})

# Append the summary row; ignore_index renumbers the rows from 0.
results_df = pd.concat([results_df, summary_row], ignore_index=True)


In [9]:
# === SAVE TO CSV ===
# Write the results table (including the summary row) without the index column.
output_path = "hasil_rbf_collected_etfposidf_x_w2v.csv"
results_df.to_csv(output_path, index=False)

print("\nHasil prediksi SVM (RBF) beserta status dan summary telah disimpan.")



Hasil prediksi SVM (RBF) beserta status dan summary telah disimpan.
