# **Text Classification dengan TF-IDF dan Word2Vec**

Dalam eksperimen ini, dua pendekatan representasi teks digunakan, yaitu **TF-IDF** dan **Word2Vec**, untuk memprediksi label `mentee_status` dari data teks gabungan (`combined_text`). Beragam model klasifikasi dievaluasi menggunakan **stratified 10-fold cross-validation** untuk menjaga keseimbangan distribusi label di setiap fold.

🔍 **Model yang diuji:**
- Logistic Regression  
- Ridge Classifier  
- k-Nearest Neighbors  
- Random Forest  
- Linear SVC  
- SGDClassifier (Log-loss)  
- Nearest Centroid  
- Complement Naive Bayes (hanya diuji pada representasi TF-IDF)  

📏 **Metrik evaluasi (macro average):**
- Precision  
- Recall  
- F1-Score (Macro Average)  

# 📘 Part 1: Text Classification using TF-IDF ✍️📊

###   Import Library

In [None]:
from time import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

def size_mb(docs):
    """Return the combined UTF-8 encoded size of the strings in *docs*, in megabytes (1 MB = 1e6 bytes)."""
    total_bytes = 0
    for doc in docs:
        total_bytes += len(doc.encode("utf-8"))
    return total_bytes / 1e6

### Load Dataset dan TF-IDF

In [None]:
def load_wbl_dataset(filepath, verbose=False):
    """
    Load and vectorize the WBL mentee-mentor matching dataset.

    Param:
        filepath (str): Path to the CSV file to load.
        verbose (bool): When True, print the document count, the feature
            count and the time spent on TF-IDF vectorization.

    Return:
        X: TF-IDF matrix fitted on the 'combined_text' column.
        y: the 'mentee_status' column (target labels).
        feature_names: vocabulary terms of the fitted vectorizer.
        target_names: unique values of y, in order of first appearance.
        df: the loaded DataFrame, with NaN replaced by '' and the extra
            'combined_text' column added.
    """
    # 1. Load the data from CSV
    df = pd.read_csv(filepath)

    # 2. Merge every relevant text column into a single 'combined_text' column
    text_cols = [
        'spv_mentee', 'job_position', 'required_tools', 'required_skills',
        'required_role_title', 'mentee_name', 'mentee_title', 'mentee_skill',
        'mentee_tools', 'mentee_position'
    ]
    df.fillna('', inplace=True)  # Avoid NaN
    # Cast to str before joining: a column that pandas parsed as numeric
    # would otherwise make ' '.join raise TypeError.
    df['combined_text'] = df[text_cols].astype(str).agg(' '.join, axis=1)

    # 3. Target labels
    y = df['mentee_status']
    target_names = y.unique()  # Distinct target categories

    # 4. TF-IDF vectorization (timed so verbose mode can report the duration)
    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(df['combined_text'])
    feature_names = vectorizer.get_feature_names_out()
    duration = time() - t0

    if verbose:
        print(f"Dokumen: {len(df)}, Fitur: {X.shape[1]}")
        print(f"TF-IDF selesai dalam {duration:.2f} detik")

    return X, y, feature_names, target_names, df

### Load & Split Dataset

In [None]:
# Load the CSV, build 'combined_text' and fit TF-IDF on the whole dataset
# (verbose=True prints the document/feature counts and the timing).
X, y, feature_names, target_names, df_raw = load_wbl_dataset("*/transformed_src/df_lowercased.csv", verbose=True)
# Hold out 20% of the rows as a test set, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Benchmarking Model Klasifikasi dengan Representasi TF-IDF dan 10-Fold Cross-Validation

Berikut adalah hasil benchmark tiap model:

| 🧠 Model               | 🎯 Precision | 🔁 Recall | 🏆 F1-Score |
| ---------------------- | ------------ | --------- | ----------- |
| **Ridge Classifier**   | 0.983        | 0.980     | **0.981**   |
| **Random Forest**      | 0.982        | 0.982     | **0.981**   |
| Log-loss SGD           | 0.976        | 0.972     | 0.973       |
| Logistic Regression    | 0.973        | 0.970     | 0.970       |
| Linear SVC             | 0.973        | 0.970     | 0.970       |
| Complement Naive Bayes | 0.960        | 0.956     | 0.957       |
| Nearest Centroid       | 0.935        | 0.934     | 0.933       |
| k-Nearest Neighbors    | 0.929        | 0.909     | 0.911       |


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid

# Data: raw text and labels. Vectorization happens inside each fold (below).
X = df_raw['combined_text']
y = df_raw['mentee_status']

# TF-IDF settings shared by every fold
tfidf_params = dict(sublinear_tf=True, max_df=0.8, min_df=2, stop_words="english")

# Models to benchmark
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Ridge Classifier": RidgeClassifier(tol=1e-2, solver="sparse_cg"),
    "k-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Linear SVC": LinearSVC(max_iter=1000),
    "Log-loss SGD": SGDClassifier(loss="log_loss", max_iter=1000, tol=1e-3, random_state=42),
    "Nearest Centroid": NearestCentroid(),
    "Complement Naive Bayes": ComplementNB()
}

# Collects the averaged scores of every model
all_results = {}

# 10-fold stratified CV
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Fit TF-IDF on the training part of each fold only, so the vocabulary and
# IDF weights never see the held-out documents (fitting on the full corpus
# before splitting leaks test-fold statistics into the features).
# The features depend on the fold, not on the classifier, so they are built
# once here and reused by every model.
fold_data = []
for train_index, test_index in skf.split(X, y):
    vectorizer = TfidfVectorizer(**tfidf_params)
    fold_data.append((
        vectorizer.fit_transform(X.iloc[train_index]),
        vectorizer.transform(X.iloc[test_index]),
        y.iloc[train_index],
        y.iloc[test_index],
    ))

# Evaluate every model on the same folds
for model_name, clf in models.items():
    print(f"\nEvaluating: {model_name}")
    results = {
        "precision": [],
        "recall": [],
        "f1_score": []
    }

    for fold, (X_train, X_test, y_train, y_test) in enumerate(fold_data, 1):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Macro averaging weights every class equally; zero_division=0 scores
        # a class with no predicted/true samples as 0 instead of warning.
        results["precision"].append(precision_score(y_test, y_pred, average='macro', zero_division=0))
        results["recall"].append(recall_score(y_test, y_pred, average='macro', zero_division=0))
        results["f1_score"].append(f1_score(y_test, y_pred, average='macro', zero_division=0))

        print(f"  Fold {fold}: f1={results['f1_score'][-1]:.3f}")

    # Store the per-model averages over the folds
    all_results[model_name] = {
        "Mean Precision": np.mean(results["precision"]),
        "Mean Recall": np.mean(results["recall"]),
        "Mean F1-Score": np.mean(results["f1_score"])
    }

# Show the final summary
print("\n" + "="*90)
print("Benchmarking Hasil Rata-Rata (10-Fold Cross-Validation):\n")

results_df = pd.DataFrame(all_results).T  # Transpose: one row per model
print(results_df.round(3))


### Visualisasi Perbandingan Kinerja Model (10-Fold CV)

##### 🔝 Performa Terbaik
- **Ridge Classifier** dan **Random Forest** punya F1-Score tertinggi (0.981), disusul **Log-loss SGD** (0.973) serta **Logistic Regression** dan **Linear SVC** (keduanya 0.970).
- Cocok untuk klasifikasi teks dengan performa stabil.

##### ⚖️ Performa Menengah
- **Complement Naive Bayes** (F1 0.957) masih cukup bagus, meski sedikit di bawah lima model teratas.

##### 🔻 Performa Terendah
- **K-Nearest Neighbors (KNN)** dan **Nearest Centroid** menunjukkan performa paling rendah.
- Kurang cocok untuk kasus teks ini.

##### ✅ Rekomendasi
Gunakan **Ridge Classifier** atau **Logistic Regression** untuk hasil optimal.


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Rank the models from highest to lowest mean F1-Score
results_df_sorted = results_df.sort_values(by="Mean F1-Score", ascending=False)

# Model names (x axis) and the metrics drawn for each of them
models = results_df_sorted.index.tolist()
metrics = ["Mean Precision", "Mean Recall", "Mean F1-Score"]

# One array of scores per metric, aligned with `models`
data = [results_df_sorted[metric].values for metric in metrics]

# Bar geometry: three bars of width 0.25 per model
bar_width = 0.25
index = np.arange(len(models))

fig, ax = plt.subplots(figsize=(12, 6))

# Draw one bar series per metric, each shifted by one bar width
for offset, (metric, scores) in enumerate(zip(metrics, data)):
    ax.bar(index + offset * bar_width, scores, bar_width, label=metric)

# Axis labels, title and ticks (ticks sit under the middle bar of each group)
ax.set_xlabel("Models")
ax.set_ylabel("Skor")
ax.set_title("Perbandingan Kinerja Model (10-Fold CV)")
ax.set_xticks(index + bar_width)
ax.set_xticklabels(models, rotation=45, ha="right")
ax.set_ylim(0, 1.05)  # leave head-room for the value labels

# Horizontal grid lines make the bar heights easier to read
ax.grid(True, linestyle="--", alpha=0.6, axis='y')

# Print each score just above its bar
for offset, scores in enumerate(data):
    for x_pos, score in zip(index + offset * bar_width, scores):
        ax.text(x_pos, score + 0.02, f"{score:.3f}",
                ha='center', va='bottom', fontsize=8)

ax.legend()
plt.tight_layout()
plt.show()


# 📗 Part 2: Text Classification using Word2Vec 🧠🗣️

### Data Preparation & Text Merging

In [None]:
import pandas as pd
import os

# Load the previously saved, lower-cased dataset
file_path = "*/transformed_src/df_lowercased.csv"
df_lowercased = pd.read_csv(file_path)

# Feature columns merged into the single 'text' column.
# The target 'mentee_status' is deliberately NOT part of this list: putting
# the label into the input text leaks the answer to the classifiers and
# inflates every score.
text_cols = [
    'spv_mentee', 'job_position', 'required_tools', 'required_skills',
    'required_role_title', 'mentee_name', 'mentee_title', 'mentee_skill',
    'mentee_tools', 'mentee_position'
]
# Fill NaN with '' before casting, otherwise astype(str) turns missing
# values into the literal token "nan".
df_lowercased['text'] = (
    df_lowercased[text_cols].fillna('').astype(str).agg(' '.join, axis=1)
)

# Final modelling frame: the merged text plus the target, renamed to 'labels'
df_combined = df_lowercased[['text', 'mentee_status']].rename(columns={'mentee_status': 'labels'})

### Label Encoding & Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the label values as integers
encoder = LabelEncoder()
df_combined['labels'] = encoder.fit_transform(df_combined['labels'])

X = df_combined['text']
y = df_combined['labels']

# 70/30 split with a fixed seed. Stratify on the label so both parts keep the
# class proportions, consistent with the stratified split used for TF-IDF.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42, stratify=y
)

# Notebook display: shapes of the resulting splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Setup NLTK Data Path

In [None]:
import nltk

# Replace every existing NLTK search path (in place) with the single
# project-local data directory.
nltk.data.path[:] = ['*/nltk_data']

# Show the search path that is now in effect
print(nltk.data.path)

### Text Preprocessing Function

In [None]:
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords

# Register the project-local NLTK data directory on the search path.
nltk.data.path.append('*/nltk_data')

# Download the resources into that same directory; preprocess() below relies
# on NLTK's word tokenizer and stop-word lists.
nltk.download('punkt', download_dir='*/nltk_data')
nltk.download('stopwords', download_dir='*/nltk_data')
nltk.download('punkt_tab', download_dir='*/nltk_data')

# Stop-word sets are loaded once per language and then reused; reloading them
# for every document dominated the cost of preprocessing.
_STOPWORD_CACHE = {}
_NON_LETTERS = re.compile(r"[^a-zA-Z]")


def _stopword_set(language):
    """Return the NLTK stop-word set for *language*, loading it on first use."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


# Preprocessing function without stemming
def preprocess(text):
    """Clean one document and return it as a space-joined string of tokens.

    Steps: replace every non-letter character with a space, lower-case,
    tokenize with NLTK, then drop stop words. The stop-word list is the
    Indonesian one when at least one token is an Indonesian stop word,
    otherwise the English one.

    Any non-string input (None, NaN, numbers, containers) yields "".
    """
    # NaN and None are not str either, so this single check covers missing
    # values without calling pd.isna (which is ambiguous for array-likes).
    if not isinstance(text, str):
        return ""

    # Keep letters only, then lower-case
    text = _NON_LETTERS.sub(" ", text).lower()

    words = nltk.word_tokenize(text)

    # Language heuristic: any Indonesian stop word marks the text as Indonesian
    stop_words_id = _stopword_set('indonesian')
    if any(word in stop_words_id for word in words):
        stop_words = stop_words_id
    else:
        stop_words = _stopword_set('english')

    # Drop the stop words of the detected language
    return ' '.join(word for word in words if word not in stop_words)


### Apply Preprocessing to Train & Test Data

In [None]:
# Run both splits through the same cleaning step. Missing values are not
# filled beforehand: preprocess() already maps non-string entries to "".
X_train, X_test = (
    pd.Series(part).apply(preprocess) for part in (X_train, X_test)
)

### Train Word2Vec Model

In [None]:
from gensim.models import Word2Vec

# Word2Vec hyper-parameters
VECTOR_SIZE = 100  # embedding dimensionality
WINDOW = 50        # context window size passed to Word2Vec
MIN_COUNT = 5      # words seen fewer than 5 times are ignored

# Train on the (preprocessed) training split only, one token list per document
sentences = [sentence.split() for sentence in X_train]

# seed + a single worker make training repeatable, in line with the fixed
# random_state=42 used everywhere else in this notebook. Multi-threaded
# training is not deterministic even with a seed.
# NOTE(review): full reproducibility across interpreter launches may also
# require a fixed PYTHONHASHSEED - confirm against the gensim documentation.
w2v_model = Word2Vec(
    sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    seed=42,
    workers=1,
)

### Text Vectorization

In [None]:
import numpy as np

def vectorize(sentence):
    """Embed *sentence* as the mean of the Word2Vec vectors of its known words.

    The sentence is split on whitespace; words missing from the model's
    vocabulary are skipped. When no word is known, a zero vector of length
    VECTOR_SIZE is returned.
    """
    known_vectors = [
        w2v_model.wv[token]
        for token in sentence.split()
        if token in w2v_model.wv
    ]
    return np.mean(known_vectors, axis=0) if len(known_vectors) != 0 else np.zeros(VECTOR_SIZE)

# Turn every document of each split into its averaged embedding
# (one row per document).
X_train_vec = np.array(list(map(vectorize, X_train)))
X_test_vec = np.array(list(map(vectorize, X_test)))


### Benchmarking Model Klasifikasi dengan Representasi Word2Vec dan 10-Fold Cross-Validation

| Model               | Mean Precision | Mean Recall | Mean F1-Score | Avg Train Time (s) | Avg Test Time (s) |
|---------------------|----------------|-------------|---------------|--------------------|-------------------|
| Logistic Regression  | 0.799          | 0.790       | 0.790         | 0.005              | 0.000             |
| Ridge Classifier     | 0.796          | 0.784       | 0.785         | 0.002              | 0.000             |
| k-Nearest Neighbors  | 0.932          | 0.930       | 0.930         | 0.001              | 0.002             |
| Random Forest       | **0.982**      | **0.981**   | **0.981**     | 0.226              | 0.008             |
| Linear SVC           | 0.778          | 0.706       | 0.691         | 0.002              | 0.000             |
| Log-loss SGD         | 0.498          | 0.541       | 0.418         | 0.004              | 0.000             |
| Nearest Centroid     | 0.756          | 0.751       | 0.751         | 0.001              | 0.001             |



> **Kesimpulan singkat:**
> - Random Forest tampil sebagai model terbaik dengan performa F1-Score tertinggi (0.981) meski waktu trainingnya paling lama.
> - k-Nearest Neighbors juga sangat baik dengan F1-Score 0.930 dan waktu training/testing yang sangat cepat.
> - Log-loss SGD memiliki performa paling rendah di antara model yang diuji.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
from time import time

# Models to benchmark
models = {
    "Logistic Regression": LogisticRegression(C=5, max_iter=1000),
    "Ridge Classifier": RidgeClassifier(alpha=1.0, solver="sparse_cg"),
    "k-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Linear SVC": LinearSVC(C=0.1, dual=False, max_iter=1000),
    # Seeded like the other stochastic models so repeated runs give the same
    # scores (SGD shuffles the data and, with early_stopping=True, holds out
    # a random validation part).
    "Log-loss SGD": SGDClassifier(
        loss="log_loss", alpha=1e-4, n_iter_no_change=3,
        early_stopping=True, random_state=42,
    ),
    "Nearest Centroid": NearestCentroid()
}

# Cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
all_results = {}

# Clean and embed the whole corpus once; the folds index into this matrix.
# NOTE(review): vectorize() uses the Word2Vec model trained on the earlier
# 70/30 training split, so the embeddings have already seen documents that
# end up in the held-out folds. Consider retraining the embeddings per fold.
X_clean = X.apply(preprocess)
X_w2v = np.array([vectorize(sentence) for sentence in X_clean])


# Evaluate every model
for model_name, clf in models.items():
    print(f"\nEvaluating: {model_name}")
    results = {
        "precision": [],
        "recall": [],
        "f1_score": [],
        "train_time": [],
        "test_time": []
    }

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_w2v, y), 1):
        X_train, X_test = X_w2v[train_idx], X_w2v[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Time fitting and prediction separately
        t0 = time()
        clf.fit(X_train, y_train)
        train_duration = time() - t0

        t0 = time()
        y_pred = clf.predict(X_test)
        test_duration = time() - t0

        # Macro averaging weights every class equally
        prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        results["precision"].append(prec)
        results["recall"].append(rec)
        results["f1_score"].append(f1)
        results["train_time"].append(train_duration)
        results["test_time"].append(test_duration)

        # Include the fold number, matching the TF-IDF benchmark output
        print(f"  Fold {fold}: f1={f1:.3f}")

    # Store the per-model averages over the folds
    all_results[model_name] = {
        "Mean Precision": np.mean(results["precision"]),
        "Mean Recall": np.mean(results["recall"]),
        "Mean F1-Score": np.mean(results["f1_score"]),
        "Avg Train Time": np.mean(results["train_time"]),
        "Avg Test Time": np.mean(results["test_time"]),
    }

# Show the final summary as a DataFrame
print("\n" + "="*90)
print("Benchmarking Hasil Rata-Rata (Word2Vec + 10-Fold Cross-Validation):\n")

results_df = pd.DataFrame(all_results).T
print(results_df.round(3))


### Visualisasi Perbandingan Kinerja Model (10-Fold CV)

##### 🔝 Performa Terbaik

* **Random Forest** dan **k-Nearest Neighbors** unggul dengan skor F1 di atas 0.90.
* Cocok untuk tugas klasifikasi dengan hasil paling stabil dan akurat.

##### ⚖️ Performa Menengah

* **Logistic Regression**, **Ridge Classifier**, dan **Nearest Centroid** memiliki performa cukup baik dengan skor F1 sekitar 0.75-0.79.

##### 🔻 Performa Terendah

* **Linear SVC** dan **Log-loss SGD** menunjukkan performa paling rendah, terutama Log-loss SGD dengan F1 sekitar 0.42.
* Kurang direkomendasikan untuk data ini.

##### ✅ Rekomendasi

Gunakan **Random Forest** untuk hasil terbaik, atau **k-Nearest Neighbors** sebagai alternatif kuat.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Rebuild the summary table (one row per model) from the benchmark results
results_df = pd.DataFrame(all_results).T

# Model names (x axis) and the metrics drawn for each of them
models = results_df.index.tolist()
metrics = ['Mean Precision', 'Mean Recall', 'Mean F1-Score']

# One array of scores per metric, aligned with `models`
data = [results_df[metric].values for metric in metrics]

# Bar geometry: three bars of width 0.25 per model
bar_width = 0.25
index = np.arange(len(models))

fig, ax = plt.subplots(figsize=(14, 6))

# Draw the metrics as side-by-side bar series, each shifted by one bar width
for offset, (metric, scores) in enumerate(zip(metrics, data)):
    ax.bar(index + offset * bar_width, scores, bar_width, label=metric)

# Axis labels, title and ticks (ticks sit under the middle bar of each group)
ax.set_xlabel('Models')
ax.set_ylabel('Score')
ax.set_title('Perbandingan Kinerja Model (10-Fold CV)')
ax.set_xticks(index + bar_width)
ax.set_xticklabels(models, rotation=45, ha='right')
ax.set_ylim(0, 1.05)  # leave head-room above the bars

ax.legend()

# Print each score just above its bar
for offset, scores in enumerate(data):
    for x_pos, score in zip(index + offset * bar_width, scores):
        ax.text(x_pos, score + 0.02, f"{score:.3f}", ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

# Visualize the average training and testing time per model (optional)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One colour per model, shared by both panels. `colors` is not defined
# anywhere else in the visible notebook, so it has to be created here;
# without it the barh calls raise NameError. tab10 has 10 entries, hence
# the modulo for longer model lists.
colors = plt.cm.tab10(np.arange(len(results_df)) % 10)

panels = [
    ('Avg Train Time', 'Average Training Time (seconds)'),
    ('Avg Test Time', 'Average Testing Time (seconds)'),
]
for axis, (column, title) in zip(axes, panels):
    durations = results_df[column]
    axis.barh(results_df.index, durations, color=colors)
    axis.set_title(title)
    axis.invert_yaxis()  # first model at the top
    # Place each value slightly to the right of its bar (1% of the longest bar)
    label_gap = max(durations) * 0.01
    for row, value in enumerate(durations):
        axis.text(value + label_gap, row, f"{value:.3f}", va='center')

plt.suptitle('Comparison of Model Computational Time', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # keep room for the suptitle
plt.show()
