#  📌 **Loading and vectorizing the text dataset**

In [None]:
from time import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

def size_mb(docs):
    """Return the combined UTF-8 encoded size of ``docs`` in megabytes (1 MB = 1e6 bytes)."""
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode("utf-8"))
    return total_bytes / 1e6

def load_wbl_dataset(filepath, verbose=False):
    """
    Load the WBL mentee-mentor matching dataset and vectorize it with TF-IDF.

    Param:
        filepath (str): Path to the CSV file holding the dataset.
        verbose (bool): When True, print the document/feature counts and the
            time spent on vectorization.

    Return:
        X: sparse TF-IDF matrix built from the ``combined_text`` column.
        y: the ``mentee_status`` column (labels).
        feature_names: vocabulary terms, aligned with the columns of ``X``.
        target_names: the distinct labels in sorted order.
        df: the loaded DataFrame, with NaN replaced by '' and the extra
            ``combined_text`` column added.
    """
    # 1. Load the data from CSV
    df = pd.read_csv(filepath)

    # 2. Merge every relevant text column into a single 'combined_text' column
    text_cols = [
        'spv_mentee', 'job_position', 'required_tools', 'required_skills',
        'required_role_title', 'mentee_name', 'mentee_title', 'mentee_skill',
        'mentee_tools', 'mentee_position'
    ]
    df.fillna('', inplace=True)  # Avoid NaN
    # Cast to str first: ' '.join raises TypeError on any column that pandas
    # parsed as numeric.
    df['combined_text'] = df[text_cols].astype(str).agg(' '.join, axis=1)

    # 3. Target labels
    y = df['mentee_status']
    # Sorted so the order agrees with the sorted class order scikit-learn
    # uses (e.g. ``classes_``), instead of order of first appearance.
    target_names = np.unique(y.to_numpy())

    # 4. TF-IDF vectorization
    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(df['combined_text'])
    feature_names = vectorizer.get_feature_names_out()
    duration = time() - t0

    if verbose:
        print(f"Dokumen: {len(df)}, Fitur: {X.shape[1]}")
        print(f"TF-IDF selesai dalam {duration:.2f} detik")

    return X, y, feature_names, target_names, df

# 2. Load and split the data (stratified on the labels)
X, y, feature_names, target_names, df_raw = load_wbl_dataset("/Users/sonya/Downloads/JobMatching/transformed_src/df_lowercased.csv", verbose=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 3. Train classifier
clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# 4. Confusion Matrix
# The row/column order and the tick labels both come from clf.classes_, so
# each label is guaranteed to sit on the row/column it describes (a separately
# ordered label list could silently mislabel the axes).
fig, ax = plt.subplots(figsize=(10, 5))
ConfusionMatrixDisplay.from_predictions(
    y_test,
    pred,
    labels=clf.classes_,
    display_labels=clf.classes_,
    xticks_rotation=45,
    ax=ax,
)
_ = ax.set_title(f"Confusion Matrix for {clf.__class__.__name__}\non WBL Job Matching")

# 5. Plot top keywords per class
def plot_feature_effects():
    """
    Plot the average effect of the most predictive terms for each class.

    Reads the module-level ``clf`` (fitted linear classifier), ``X_train``
    and ``feature_names``. A term's average effect is its coefficient
    multiplied by the term's mean value over ``X_train``. The five terms with
    the largest effect are collected per class, printed as a table and drawn
    as grouped horizontal bars.

    Return:
        The matplotlib Axes holding the bar chart.
    """
    # A two-class linear model may expose a single coefficient row (or a flat
    # vector); normalise to 2-D so both layouts share one code path.
    avg_feature_effects = np.atleast_2d(
        clf.coef_ * np.asarray(X_train.mean(axis=0)).ravel()
    )
    if avg_feature_effects.shape[0] == 1 and len(clf.classes_) == 2:
        # The single row is reported under the positive class, classes_[1].
        labels = [clf.classes_[1]]
    else:
        labels = list(clf.classes_)

    names = np.asarray(feature_names)
    top = pd.DataFrame()
    top_indices = []
    for row, label in zip(avg_feature_effects, labels):
        top5 = np.argsort(row)[-5:][::-1]
        top[label] = names[top5]
        top_indices.extend(top5)

    top_indices = np.unique(top_indices)
    predictive_words = names[top_indices]

    # Horizontal bar chart: one group per term, one bar per class.
    bar_size = 0.25
    padding = 0.75
    group_height = len(labels) * bar_size + padding
    y_locs = np.arange(len(top_indices)) * group_height

    fig, ax = plt.subplots(figsize=(10, 8))
    for i, label in enumerate(labels):
        ax.barh(
            # Centre the group of bars on the tick, whatever the class count.
            y_locs + (i - (len(labels) - 1) / 2) * bar_size,
            avg_feature_effects[i, top_indices],
            height=bar_size,
            label=str(label),
        )
    ax.set(
        yticks=y_locs,
        yticklabels=predictive_words,
        ylim=[-group_height / 2, (len(top_indices) - 0.5) * group_height],
    )
    ax.legend(loc="lower right")

    print("Top 5 keywords per class:")
    print(top)

    return ax




In [None]:
# List the column names of the loaded dataset.
print(df_raw.columns)


In [None]:
# Bare expression: presumably relies on the notebook rendering the cell's last value.
df_raw


In [None]:
# Print the label counts of the training split, then of the test split.
for caption, labels in (("Label di training:", y_train), ("Label di testing:", y_test)):
    print(caption, pd.Series(labels).value_counts())


# Analysis of a bag-of-words document classifier

## Model without metadata stripping

In [None]:
# 3. Train classifier
clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# 4. Confusion Matrix
# The row/column order and the tick labels both come from clf.classes_, so
# each label is guaranteed to sit on the row/column it describes (a separately
# ordered label list could silently mislabel the axes).
fig, ax = plt.subplots(figsize=(10, 5))
ConfusionMatrixDisplay.from_predictions(
    y_test,
    pred,
    labels=clf.classes_,
    display_labels=clf.classes_,
    xticks_rotation=45,
    ax=ax,
)
_ = ax.set_title(f"Confusion Matrix for {clf.__class__.__name__}\non WBL Job Matching")

# 5. Plot top keywords per class
def plot_feature_effects():
    """
    Plot the average effect of the most predictive terms.

    Reads the module-level ``clf`` (fitted linear classifier), ``X_train``
    and ``feature_names``. A term's average effect is its coefficient
    multiplied by the term's mean value over ``X_train``.

    With one coefficient row per class, the top five terms of every class are
    drawn as grouped bars. With a single coefficient row (two-class model),
    the top five terms of that row are drawn as one bar series.

    Return:
        The matplotlib Axes holding the bar chart.
    """
    # Normalise to 2-D: a two-class model may expose a single row (or a flat
    # vector), which must not be indexed once per entry of clf.classes_.
    average_feature_effects = np.atleast_2d(
        clf.coef_ * np.asarray(X_train.mean(axis=0)).ravel()
    )
    top = pd.DataFrame()

    if average_feature_effects.shape[0] > 1:
        # Multiclass case: one coefficient row per class
        top_indices = []
        for i, label in enumerate(clf.classes_):
            top5 = np.argsort(average_feature_effects[i])[-5:][::-1]
            top[label] = [feature_names[j] for j in top5]
            top_indices.extend(top5)
        top_indices = np.unique(top_indices)

        fig, ax = plt.subplots(figsize=(10, 8))
        bar_size = 0.25
        padding = 0.75
        n_classes = len(clf.classes_)
        group_height = n_classes * bar_size + padding
        y_locs = np.arange(len(top_indices)) * group_height

        for i, label in enumerate(clf.classes_):
            ax.barh(
                # Centre the group of bars on the tick, whatever the class count.
                y_locs + (i - (n_classes - 1) / 2) * bar_size,
                average_feature_effects[i][top_indices],
                height=bar_size,
                label=str(label),
            )
        # The limits must follow the group spacing, otherwise the upper
        # groups fall outside the visible range.
        ylim = [-group_height / 2, (len(top_indices) - 0.5) * group_height]
    else:
        # Binary case: a single coefficient row
        binary_effects = average_feature_effects[0]
        top_indices = np.argsort(binary_effects)[-5:][::-1]
        top["Top Features"] = [feature_names[j] for j in top_indices]

        fig, ax = plt.subplots(figsize=(10, 6))
        bar_size = 0.35
        y_locs = np.arange(len(top_indices))

        ax.barh(
            y_locs,
            binary_effects[top_indices],
            height=bar_size,
            label="Average Feature Effect",
        )
        ylim = [-1, len(top_indices)]

    predictive_words = [feature_names[j] for j in top_indices]
    ax.set(
        yticks=y_locs,
        yticklabels=predictive_words,
        ylim=ylim,
    )
    ax.legend(loc="lower right")

    print("Top 5 keywords per class:")
    print(top)

    return ax



## Model with metadata stripping

In [None]:

# 2. Train model
clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# 3. Confusion matrix
# The row/column order and the tick labels both come from clf.classes_, so
# each label is guaranteed to sit on the row/column it describes (a separately
# ordered label list could silently mislabel the axes).
fig, ax = plt.subplots(figsize=(10, 5))
ConfusionMatrixDisplay.from_predictions(
    y_test,
    pred,
    labels=clf.classes_,
    display_labels=clf.classes_,
    xticks_rotation=45,
    ax=ax,
)
_ = ax.set_title(f"Confusion Matrix for {clf.__class__.__name__}\non filtered metadata WBL documents")

# 4. Feature effects
def plot_feature_effects():
    """
    Plot the average effect of the most predictive terms.

    Reads the module-level ``clf`` (fitted linear classifier), ``X_train``
    and ``feature_names``. A term's average effect is its coefficient
    multiplied by the term's mean value over ``X_train``.

    With one coefficient row per class, the top five terms of every class are
    drawn as grouped bars. With a single coefficient row (two-class model),
    the top five terms of that row are drawn as one bar series.

    Return:
        The matplotlib Axes holding the bar chart.
    """
    # Normalise to 2-D first. A two-class model may expose coef_ with a single
    # row (or as a flat vector); looping over clf.classes_ would then index a
    # row that does not exist.
    average_feature_effects = np.atleast_2d(
        clf.coef_ * np.asarray(X_train.mean(axis=0)).ravel()
    )
    n_rows = average_feature_effects.shape[0]
    top = pd.DataFrame()

    if n_rows == 1:
        # Binary classifier: a single coefficient row
        row_effects = average_feature_effects[0]
        top_indices = np.argsort(row_effects)[-5:][::-1]
        top["Top Features"] = [feature_names[j] for j in top_indices]

        fig, ax = plt.subplots(figsize=(10, 6))
        bar_size = 0.35
        y_locs = np.arange(len(top_indices))

        ax.barh(
            y_locs,
            row_effects[top_indices],
            height=bar_size,
            label="Feature Importance",
        )
        ylim = [-1, len(top_indices)]
    else:
        # Multiclass: one coefficient row per entry of clf.classes_
        top_indices = []
        for i, label in enumerate(clf.classes_):
            top5 = np.argsort(average_feature_effects[i])[-5:][::-1]
            top[label] = [feature_names[j] for j in top5]
            top_indices.extend(top5)
        top_indices = np.unique(top_indices)

        fig, ax = plt.subplots(figsize=(10, 8))
        bar_size = 0.25
        padding = 0.75
        group_height = n_rows * bar_size + padding
        y_locs = np.arange(len(top_indices)) * group_height

        for i, label in enumerate(clf.classes_):
            ax.barh(
                # Centre the group of bars on the tick, whatever the class count.
                y_locs + (i - (n_rows - 1) / 2) * bar_size,
                average_feature_effects[i][top_indices],
                height=bar_size,
                label=str(label),
            )
        # Limits follow the group spacing so no group is cut off.
        ylim = [-group_height / 2, (len(top_indices) - 0.5) * group_height]

    predictive_words = [feature_names[j] for j in top_indices]
    ax.set(
        yticks=y_locs,
        yticklabels=predictive_words,
        ylim=ylim,
    )
    ax.legend(loc="lower right")

    print("Top 5 keywords per class:")
    print(top)

    return ax




# Benchmarking classifiers

In [None]:
from sklearn import metrics
from sklearn.utils.extmath import density
from time import time

def benchmark(clf, custom_name=False):
    """
    Fit ``clf`` on the training split, time it, and report its test accuracy.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Param:
        clf: estimator exposing ``fit`` and ``predict``.
        custom_name: label to report for the estimator; any falsy value (the
            default) falls back to ``clf.__class__.__name__``.

    Return:
        tuple: (description, accuracy, train time in seconds,
        test time in seconds)
    """
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print(f"train time: {train_time:.3f}s")

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print(f"test time:  {test_time:.3f}s")

    score = metrics.accuracy_score(y_test, pred)
    print(f"accuracy:   {score:.3f}")

    if hasattr(clf, "coef_"):
        # The feature count is the last axis whether coef_ is 1-D or 2-D;
        # shape[1] would raise IndexError on a 1-D coef_.
        print(f"dimensionality: {clf.coef_.shape[-1]}")
        print(f"density: {density(clf.coef_)}")

    print()
    clf_descr = str(custom_name) if custom_name else clf.__class__.__name__
    return clf_descr, score, train_time, test_time

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Collects the tuple returned by benchmark() for each model evaluated below.
results = []

# Optional helper: density of the weights
def density(coef):
    """Return the fraction of entries in ``coef`` that are non-zero."""
    is_nonzero = coef != 0
    return np.mean(is_nonzero)

# Benchmarking helper
def benchmark(clf, custom_name):
    """
    Fit ``clf`` on the training split and score it on the test split.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints macro-averaged precision, recall and F1 and, for estimators that
    expose ``coef_``, the coefficient dimensionality and density.

    Param:
        clf: estimator exposing ``fit`` and ``predict``.
        custom_name (str): display name printed and returned for the model.

    Return:
        tuple: (custom_name, accuracy, precision, recall, f1)
    """
    # Imported locally: accuracy_score is not among the metric names imported
    # alongside this function.
    from sklearn.metrics import accuracy_score

    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Previously `accuracy` was returned without ever being computed,
    # which raised NameError on every call.
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, average='macro', zero_division=0)
    recall = recall_score(y_test, pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, pred, average='macro', zero_division=0)

    print(f"Model: {custom_name}")
    print(f"Precision:  {precision:.3f}")
    print(f"Recall:     {recall:.3f}")
    print(f"F1-Score:   {f1:.3f}")

    if hasattr(clf, "coef_"):
        shape = clf.coef_.shape
        if len(shape) == 2:
            print(f"Dimensionality: {shape[1]}")
        elif len(shape) == 1:
            print(f"Dimensionality: {shape[0]}")
        print(f"Density: {density(clf.coef_)}")

    print()
    return (custom_name, accuracy, precision, recall, f1)

# Models to benchmark, each paired with the display name passed to benchmark()
models = [
    (LogisticRegression(C=5, max_iter=1000), "Logistic Regression"),
    (RidgeClassifier(alpha=1.0, solver="sparse_cg"), "Ridge Classifier"),
    (KNeighborsClassifier(n_neighbors=5), "k-Nearest Neighbors"),
    (RandomForestClassifier(n_estimators=100, random_state=42), "Random Forest"),
    (LinearSVC(C=0.1, dual=False, max_iter=1000), "Linear SVC"),
    (
        SGDClassifier(loss="log_loss", alpha=1e-4, n_iter_no_change=3, early_stopping=True),
        "Log-loss SGD",
    ),
    (NearestCentroid(), "Nearest Centroid"),
    (ComplementNB(alpha=0.1), "Complement Naive Bayes"),
]

print("=" * 80)
print("Benchmarking berbagai model klasifikasi untuk prediksi mentee_status\n")

# Evaluate every model in turn and keep the tuple benchmark() returns.
# Note: the loop rebinds the module-level names `clf` and `name`.
for clf, name in models:
    print("=" * 80)
    print(f"Evaluasi: {name}")
    results.append(benchmark(clf, name))


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Model - Model Klasifikasi
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import ComplementNB

In [None]:
# Raw text and labels; vectorization happens inside each fold.
X = df_raw['combined_text']  
y = df_raw['mentee_status']  

# Initialise StratifiedKFold with 10 folds (shuffled, fixed seed)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Stores the evaluation scores of every fold
results = {
    "precision": [],
    "recall": [],
    "f1_score": []
}

In [None]:
# Reset the score lists so the averages printed at the end cover only this
# run's folds; the shared `results` dict otherwise keeps every score appended
# by previously executed cells (or by an earlier run of this cell).
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")


## Logistic Regression

In [None]:
# Reset the score lists so the averages printed at the end cover only this
# model's folds; the shared `results` dict otherwise keeps every score
# appended by previously executed cells.
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")


## Ridge Classifier

In [None]:
# Reset the score lists so the averages printed at the end cover only this
# model's folds; the shared `results` dict otherwise keeps every score
# appended by previously executed cells.
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model: this is the Ridge Classifier section, so train a
    # RidgeClassifier (the cell previously trained KNeighborsClassifier, a
    # duplicate of the k-NN section), configured like the file's other Ridge runs.
    clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")



## KNeighbors Classifier

In [None]:
# Reset the score lists so the averages printed at the end cover only this
# model's folds; the shared `results` dict otherwise keeps every score
# appended by previously executed cells.
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")


## RandomForestClassifier

In [None]:
# Reset the score lists so the averages printed at the end cover only this
# model's folds; the shared `results` dict otherwise keeps every score
# appended by previously executed cells.
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")


## LinearSVC

In [None]:
# Reset the score lists so the averages printed at the end cover only this
# model's folds; the shared `results` dict otherwise keeps every score
# appended by previously executed cells.
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model
    clf = LinearSVC(max_iter=1000)
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")


## SGDClassifier

In [None]:
# Reset the score lists so the averages printed at the end cover only this
# model's folds; the shared `results` dict otherwise keeps every score
# appended by previously executed cells.
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model
    clf = SGDClassifier(loss="log_loss", max_iter=1000, tol=1e-3, random_state=42)
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")


In [None]:
# Reset the score lists so the averages printed at the end cover only this
# model's folds; the shared `results` dict otherwise keeps every score
# appended by previously executed cells.
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model
    clf = NearestCentroid()
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")


In [None]:
# Reset the score lists so the averages printed at the end cover only this
# model's folds; the shared `results` dict otherwise keeps every score
# appended by previously executed cells.
results = {"precision": [], "recall": [], "f1_score": []}

for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}:")

    # Show the train and test indices of this fold
    print(f"  Train indices: {train_index}")
    print(f"  Test indices: {test_index}")

    # skf.split yields positional indices, so X and y are both selected with
    # .iloc (plain y[...] is label-based when the index holds integers).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit TF-IDF on the training fold only, then apply it to the test fold
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Classification model
    clf = ComplementNB()
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate with macro-averaged metrics
    y_pred = clf.predict(X_test_tfidf)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    # Store this fold's scores
    results["precision"].append(precision)
    results["recall"].append(recall)
    results["f1_score"].append(f1)

    # Show this fold's scores
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")

# Show the averages over all folds
print("="*80)
print("Rata-rata hasil evaluasi dari seluruh fold:")
print(f"Mean Precision:  {np.mean(results['precision']):.3f}")
print(f"Mean Recall:     {np.mean(results['recall']):.3f}")
print(f"Mean F1-Score:   {np.mean(results['f1_score']):.3f}")


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid

# Data: raw text and labels (TF-IDF is fitted per fold below)
X = df_raw['combined_text']
y = df_raw['mentee_status']

# Models to evaluate
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Ridge Classifier": RidgeClassifier(tol=1e-2, solver="sparse_cg"),
    "k-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Linear SVC": LinearSVC(max_iter=1000),
    "Log-loss SGD": SGDClassifier(loss="log_loss", max_iter=1000, tol=1e-3, random_state=42),
    "Nearest Centroid": NearestCentroid(),
    "Complement Naive Bayes": ComplementNB()
}

# Mean scores of every model, keyed by model name
all_results = {}

# 10-fold stratified CV
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Vectorize each fold once, fitting TF-IDF on that fold's training rows only
# so the vocabulary and IDF weights never see the held-out rows (fitting on
# the full dataset before splitting leaks test data into the features).
# The prepared folds are then reused for every model.
folds = []
for train_index, test_index in skf.split(X, y):
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")
    folds.append(
        (
            vectorizer.fit_transform(X.iloc[train_index]),
            vectorizer.transform(X.iloc[test_index]),
            y.iloc[train_index],
            y.iloc[test_index],
        )
    )

# Loop over all models
for model_name, clf in models.items():
    print(f"\nEvaluating: {model_name}")
    results = {
        "precision": [],
        "recall": [],
        "f1_score": []
    }

    for fold, (X_train, X_test, y_train, y_test) in enumerate(folds, 1):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        results["precision"].append(precision_score(y_test, y_pred, average='macro', zero_division=0))
        results["recall"].append(recall_score(y_test, y_pred, average='macro', zero_division=0))
        results["f1_score"].append(f1_score(y_test, y_pred, average='macro', zero_division=0))

        print(f"  Fold {fold}: f1={results['f1_score'][-1]:.3f}")

    # Store the mean scores
    all_results[model_name] = {
        "Mean Precision": np.mean(results["precision"]),
        "Mean Recall": np.mean(results["recall"]),
        "Mean F1-Score": np.mean(results["f1_score"])
    }

# Show the final results
print("\n" + "="*90)
print("Benchmarking Hasil Rata-Rata (10-Fold Cross-Validation):\n")

results_df = pd.DataFrame(all_results).T  # Transpose: one row per model
print(results_df.round(3))


In [None]:
import matplotlib.pyplot as plt

# Sort the models by mean F1-score, best first
results_df_sorted = results_df.sort_values(by="Mean F1-Score", ascending=False)

# Bar chart of the sorted score table
results_df_sorted.plot(kind='bar', figsize=(12, 6))
plt.title("Perbandingan Kinerja Model (10-Fold CV)")
plt.ylabel("Skor")
plt.ylim(0, 1)
plt.grid(True, linestyle="--", alpha=0.6)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# Plot accuracy, training and test time of each classifier

In [None]:
import time
from sklearn.metrics import accuracy_score

def benchmark(clf, custom_name):
    """
    Fit ``clf`` on the training split and time training and prediction.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Param:
        clf: estimator exposing ``fit`` and ``predict``.
        custom_name (str): display name returned for the model.

    Return:
        tuple: (custom_name, accuracy, train time in seconds,
        test time in seconds)
    """
    # Imported locally: elsewhere in this file `time` is bound both to the
    # function (`from time import time`) and to the module (`import time`), so
    # the meaning of the global name depends on which cell ran last.
    # perf_counter is also the clock intended for measuring short durations.
    from time import perf_counter

    start_train = perf_counter()
    clf.fit(X_train, y_train)
    train_time = perf_counter() - start_train

    start_test = perf_counter()
    pred = clf.predict(X_test)
    test_time = perf_counter() - start_test

    score = accuracy_score(y_test, pred)
    print(f"Train time: {train_time:.3f}s | Test time: {test_time:.3f}s")

    # Return a 4-element tuple
    return (custom_name, score, train_time, test_time)


In [None]:
# Benchmark every model defined earlier and collect the tuples benchmark() returns.
# `results` is rebuilt as a fresh list: the cross-validation cells leave it
# bound to a dict of per-fold scores, which has no append().
results = [
    benchmark(LogisticRegression(C=5, max_iter=1000), "Logistic Regression"),
    benchmark(RidgeClassifier(alpha=1.0, solver="sparse_cg"), "Ridge Classifier"),
    benchmark(KNeighborsClassifier(n_neighbors=5), "k-Nearest Neighbors"),
    benchmark(RandomForestClassifier(n_estimators=100, random_state=42), "Random Forest"),
    benchmark(LinearSVC(C=0.1, dual=False, max_iter=1000), "Linear SVC"),
    benchmark(SGDClassifier(loss="log_loss", alpha=1e-4, n_iter_no_change=3, early_stopping=True), "Log-loss SGD"),
    benchmark(NearestCentroid(), "Nearest Centroid"),
    benchmark(ComplementNB(alpha=0.1), "Complement Naive Bayes"),
]


In [None]:
# NOTE(review): these rows are hard-coded and replace whatever `results` held
# before this cell ran — confirm they match the measured benchmark output.
# Each row is unpacked below as (model, accuracy, train_time, test_time).
results = [
    ('Logistic Regression', 0.973, 0.013, 0.000),
    ('Ridge Classifier', 0.973, 0.008, 0.000),
    ('k-Nearest Neighbors', 0.907, 0.00, 0.004),
    ('Random Forest', 0.987, 0.314, 0.011),
    ('Linear SVC', 0.973, 0.007, 0.000),
    ('Log-loss SGD', 0.973, 0.009, 0.000),
    ('Nearest Centroid', 0.973, 0.14, 0.001),
    ('Complement Naive Bayes', 0.960, 0.003, 0.000)
]

# Print each row formatted as a tuple literal
for model, accuracy, train_time, test_time in results:
    print(f"('{model}', {accuracy:.4f}, {train_time:.2f}, {test_time:.2f}),")

In [None]:
# Dump the current contents of `results`.
print(results)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Prepare the benchmark data: transpose the result rows into four columns.
# The columns get their own name so `results` keeps its row layout and this
# cell can be run again (overwriting `results` with its transpose made a
# second run fail).
indices = np.arange(len(results))
columns = [[row[i] for row in results] for i in range(4)]
clf_names, score, training_time, test_time = columns

training_time = np.array(training_time)
test_time = np.array(test_time)

# Plot: trade-off between accuracy and training time
fig, ax1 = plt.subplots(figsize=(10, 8))
ax1.scatter(score, training_time, s=80, color='skyblue')
ax1.set(
    title="Trade-off Akurasi vs Waktu Pelatihan\n(Prediksi Status Mentee - Proyek WBL)",
    xlabel="Akurasi Pengujian",
    ylabel="Waktu Pelatihan (detik)",
)
# Some timings are exactly 0, which a plain log axis cannot place. symlog is
# linear below `linthresh` (1 ms here) and logarithmic above, so every model
# stays visible.
ax1.set_yscale("symlog", linthresh=1e-3)
ax1.grid(True)

# Label each scatter point with its model name
for i, txt in enumerate(clf_names):
    ax1.annotate(txt, (score[i], training_time[i]), fontsize=9)

# Plot: trade-off between accuracy and test time
fig, ax2 = plt.subplots(figsize=(10, 8))
ax2.scatter(score, test_time, s=80, color='salmon')
ax2.set(
    title="Trade-off Akurasi vs Waktu Pengujian\n(Prediksi Status Mentee - Proyek WBL)",
    xlabel="Akurasi Pengujian",
    ylabel="Waktu Pengujian (detik)",
)
# Same reasoning as above: zero test times would vanish on a plain log axis.
ax2.set_yscale("symlog", linthresh=1e-3)
ax2.grid(True)

# Label each scatter point with its model name
for i, txt in enumerate(clf_names):
    ax2.annotate(txt, (score[i], test_time[i]), fontsize=9)
