# Text Classification with Word2Vec Embedding plus Classifier

In [None]:
import pandas as pd
import os

# Load the previously saved, lower-cased dataset.
file_path = "*/transformed_src/df_lowercased.csv"
df_lowercased = pd.read_csv(file_path)

# Columns concatenated into the free-text feature.
# 'mentee_status' is deliberately NOT in this list: it is the prediction
# target (renamed to 'labels' below), and including it in the text would leak
# the answer into the features.
text_columns = [
    'spv_mentee', 'job_position', 'required_tools',
    'required_skills', 'required_role_title', 'mentee_name',
    'mentee_title', 'mentee_skill', 'mentee_tools',
    'mentee_position',
]

# Join the non-missing cells of each row with single spaces. Missing cells are
# skipped; a bare astype(str) would inject the literal token "nan" instead.
df_lowercased['text'] = df_lowercased[text_columns].apply(
    lambda row: ' '.join(str(value) for value in row if pd.notna(value)),
    axis=1,
)

# Final two-column frame used for modelling: the text feature and the target.
df_combined = df_lowercased[['text', 'mentee_status']].rename(columns={'mentee_status': 'labels'})

In [None]:
# Notebook display of the modelling frame (columns 'text' and 'labels').
df_combined

Classification of the Dataset with Word2Vec

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Replace the class names in 'labels' with integer codes; the fitted encoder
# is kept so the codes can be mapped back to names later.
encoder = LabelEncoder()
df_combined['labels'] = encoder.fit_transform(df_combined['labels'])

# Feature text and encoded target.
X, y = df_combined['text'], df_combined['labels']

# Shuffled 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Notebook output: the shapes of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Notebook display of the modelling frame; 'labels' was overwritten with the
# LabelEncoder output in the previous cell.
df_combined

In [None]:
import nltk
# Show the directories NLTK currently searches for its data files.
print(nltk.data.path)

In [None]:
import nltk

# Replace the whole search list, in place, with the single intended directory
# (drops every previously registered path).
nltk.data.path[:] = ['*/nltk_data']

# Confirm the search path now in effect.
print(nltk.data.path)

In [None]:
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords

# Directory that holds the NLTK resources used by this notebook.
NLTK_DATA_DIR = '*/nltk_data'

# Register the directory only once: an unconditional append adds a duplicate
# entry every time this cell is re-run, or when the directory is already on
# the search path.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Resources needed by preprocess(): the tokenizer data used by
# nltk.word_tokenize and the stop-word corpus.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource, download_dir=NLTK_DATA_DIR)

# Preprocessing function (no stemming).
from functools import lru_cache

# Every character that is not an ASCII letter is replaced by a space.
_NON_ALPHA = re.compile(r"[^a-zA-Z]")


@lru_cache(maxsize=None)
def _stopword_set(language):
    """Return the NLTK stop-word list for *language* as a frozenset.

    Cached so each corpus file is read once per language rather than once per
    document.
    """
    return frozenset(stopwords.words(language))


def preprocess(text):
    """Clean one document: keep letters only, lower-case, drop stop words.

    Parameters
    ----------
    text : object
        The raw document. Anything that is not a ``str`` (``None``, ``NaN``,
        numbers, lists, ...) is treated as an empty document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``""`` for non-string
        input.

    Notes
    -----
    The stop-word list is picked per document: if any token is an Indonesian
    stop word, only Indonesian stop words are removed; otherwise only English
    ones are.
    NOTE(review): a document mixing both languages therefore keeps its
    English stop words; confirm this is the intended heuristic.
    """
    # A plain type check covers None/NaN and, unlike pd.isna(), cannot raise
    # "truth value of an array is ambiguous" on list-like input.
    if not isinstance(text, str):
        return ""

    # Strip non-letters first, then lower-case.
    text = _NON_ALPHA.sub(" ", text).lower()
    words = nltk.word_tokenize(text)

    # Per-document language guess based on Indonesian stop-word hits.
    stop_words_id = _stopword_set('indonesian')
    if any(word in stop_words_id for word in words):
        stop_words = stop_words_id
    else:
        stop_words = _stopword_set('english')

    return ' '.join(word for word in words if word not in stop_words)


In [None]:
# Clean both splits with the same routine. Missing values are not filled
# beforehand; preprocess() maps them to empty strings.
X_train = pd.Series(X_train).apply(preprocess)
X_test = pd.Series(X_test).apply(preprocess)

In [None]:
from gensim.models import Word2Vec

# Embedding hyper-parameters: vector dimensionality, context window size and
# minimum word frequency.
VECTOR_SIZE, WINDOW, MIN_COUNT = 100, 50, 5

# One whitespace-split token list per preprocessed training document.
sentences = list(map(str.split, X_train))

# Train the embeddings on the training split only.
w2v_model = Word2Vec(
    sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
)


In [None]:
import numpy as np

def vectorize(sentence):
    """Return the average Word2Vec vector of a whitespace-tokenised sentence.

    Only tokens present in ``w2v_model.wv`` contribute. If none of the tokens
    is known (or the sentence is empty), a zero vector of length
    ``VECTOR_SIZE`` is returned instead.
    """
    keyed_vectors = w2v_model.wv
    known = [keyed_vectors[token] for token in sentence.split() if token in keyed_vectors]
    if not known:
        return np.zeros(VECTOR_SIZE)
    return np.mean(known, axis=0)

# Document-level feature matrices: one averaged vector per document.
X_train_vec = np.array(list(map(vectorize, X_train)))
X_test_vec = np.array(list(map(vectorize, X_test)))


In [None]:
# Report whether, and how many, components of the document vectors are negative.
print("Cek nilai negatif di X_train_vec:")
train_negative_mask = X_train_vec < 0
print(f"Ada nilai negatif di X_train_vec: {train_negative_mask.any()}")
print(f"Jumlah nilai negatif di X_train_vec: {train_negative_mask.sum()}")

print("\nCek nilai negatif di X_test_vec:")
test_negative_mask = X_test_vec < 0
print(f"Ada nilai negatif di X_test_vec: {test_negative_mask.any()}")
print(f"Jumlah nilai negatif di X_test_vec: {test_negative_mask.sum()}")


In [None]:
import numpy as np

# Count the negative and the positive components of both feature matrices.

# --- training matrix ---
print("Cek nilai negatif dan positif di X_train_vec:")
negative_values_train, positive_values_train = X_train_vec < 0, X_train_vec > 0
print(f"Jumlah nilai negatif di X_train_vec: {negative_values_train.sum()}")
print(f"Jumlah nilai positif di X_train_vec: {positive_values_train.sum()}")

# --- test matrix ---
print("\nCek nilai negatif dan positif di X_test_vec:")
negative_values_test, positive_values_test = X_test_vec < 0, X_test_vec > 0
print(f"Jumlah nilai negatif di X_test_vec: {negative_values_test.sum()}")
print(f"Jumlah nilai positif di X_test_vec: {positive_values_test.sum()}")


In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from time import time

# Benchmarking helper.
def benchmark(clf, X_train, y_train, X_test, y_test, custom_name=False):
    """Fit *clf*, predict on the test data, print timings and scores.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and targets passed to ``clf.fit``
    X_test, y_test : test features for ``clf.predict`` and the true targets
    custom_name : display name for the classifier; when falsy the class name
        of *clf* is used instead.

    Returns
    -------
    tuple
        ``(description, accuracy, train_time, test_time)``. Macro precision,
        recall and F1 are printed but not returned.
    """
    print("_" * 80)
    print(f"Training: {clf}")

    # Time the fit.
    fit_started = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    print(f"train time: {train_time:.3f}s")

    # Time the prediction.
    predict_started = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_started
    print(f"test time:  {test_time:.3f}s")

    # Macro averaging; undefined ratios count as 0 instead of warning.
    macro = {"average": "macro", "zero_division": 0}
    score = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, **macro)
    recall = recall_score(y_test, pred, **macro)
    f1 = f1_score(y_test, pred, **macro)

    print(f"accuracy:   {score:.3f}")
    print(f"precision:  {precision:.3f}")
    print(f"recall:     {recall:.3f}")
    print(f"f1-score:   {f1:.3f}")
    print()

    if custom_name:
        clf_descr = str(custom_name)
    else:
        clf_descr = clf.__class__.__name__
    return clf_descr, score, train_time, test_time


# Candidate classifiers paired with the display name used in the report
# (Naive Bayes is left out; no confusion matrix is produced here).
models = [
    (LogisticRegression(C=5, max_iter=1000), "Logistic Regression"),
    (RidgeClassifier(alpha=1.0, solver="sparse_cg"), "Ridge Classifier"),
    (KNeighborsClassifier(n_neighbors=5), "k-Nearest Neighbors"),
    (RandomForestClassifier(n_estimators=100, random_state=42), "Random Forest"),
    (LinearSVC(C=0.1, dual=False, max_iter=1000), "Linear SVC"),
    # Seeded like the random forest: SGD shuffles the data and, with
    # early_stopping=True, holds out a random validation subset, so without a
    # random_state the reported scores change from run to run.
    (
        SGDClassifier(
            loss="log_loss",
            alpha=1e-4,
            n_iter_no_change=3,
            early_stopping=True,
            random_state=42,
        ),
        "Log-loss SGD",
    ),
    (NearestCentroid(), "Nearest Centroid"),
]

# Evaluate every candidate model.
print("=" * 80)
print("Benchmarking Model untuk Word2Vec\n")

# X_train_vec / X_test_vec are the Word2Vec representations of the training
# and test documents.
results = []
for estimator, label in models:
    print("=" * 80)
    print(f"Evaluasi: {label}")
    # benchmark() returns (description, accuracy, train time, test time).
    results.append(benchmark(estimator, X_train_vec, y_train, X_test_vec, y_test, label))

# Dump the collected tuples for a closer look.
for row in results:
    print(row)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

# Fit a logistic regression on the training vectors and predict the hold-out
# split.
clf = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred_lr = clf.predict(X_test_vec)

# Macro-averaged scores on the hold-out split.
precision, recall, f1 = (
    metric(y_test, y_pred_lr, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# Report.
print('Model Logistic Regression untuk Prediksi Status Mentee')
print(f'Precision: {round(precision, 3)}')
print(f'Recall: {round(recall, 3)}')
print(f'F1-Score: {round(f1, 3)}')


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Random forest with a fixed seed, fitted on the training vectors and applied
# to the hold-out vectors.
rf = RandomForestClassifier(random_state=42).fit(X_train_vec, y_train)
y_pred_rf = rf.predict(X_test_vec)


In [None]:
# Macro-averaged hold-out scores for the random forest.
precision = precision_score(y_test, y_pred_rf, average='macro')
recall = recall_score(y_test, y_pred_rf, average='macro')
f1 = f1_score(y_test, y_pred_rf, average='macro')

print(f"RandomForest / Precision: {round(precision,3)} / Recall: {round(recall,3)} / F1-Score: {round(f1,3)}")

# Confusion matrix: rows are the true classes, columns the predicted ones,
# both ordered as in rf.classes_.
cm = confusion_matrix(y_test, y_pred_rf, labels=rf.classes_)

# Map the encoded class ids back to their original names for the tick labels.
class_names = encoder.inverse_transform(rf.classes_)

ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer counts; the default '.2g' renders 120 as 1.2e+02
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title('Confusion Matrix (Word2Vec + RandomForest)', fontsize=16)
ax.set_xlabel("Predicted")
ax.set_ylabel("Target")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
from time import time

# Candidate classifiers keyed by the display name used in the report.
models = {
    "Logistic Regression": LogisticRegression(C=5, max_iter=1000),
    "Ridge Classifier": RidgeClassifier(alpha=1.0, solver="sparse_cg"),
    "k-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Linear SVC": LinearSVC(C=0.1, dual=False, max_iter=1000),
    # Seeded like the random forest: SGD shuffles the data and, with
    # early_stopping=True, holds out a random validation subset, so without a
    # random_state the cross-validation scores change from run to run.
    "Log-loss SGD": SGDClassifier(
        loss="log_loss",
        alpha=1e-4,
        n_iter_no_change=3,
        early_stopping=True,
        random_state=42,
    ),
    "Nearest Centroid": NearestCentroid(),
}

# Per-model summary rows are collected here.
all_results = {}

# 10-fold stratified cross-validation with a fixed shuffle seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Preprocess and embed the complete corpus with the existing vectorize().
# NOTE(review): vectorize() reads the already-trained w2v_model; the
# embeddings are not refitted inside each fold. Confirm this is intended.
X_clean = X.apply(preprocess)
X_w2v = np.array(list(map(vectorize, X_clean)))


# Cross-validate every candidate model.
# accuracy_score is needed for the per-fold accuracy printed below; this
# cell's own import line does not bring it in.
from sklearn.metrics import accuracy_score

for model_name, clf in models.items():
    print(f"\nEvaluating: {model_name}")

    # Per-fold measurements for the current model.
    results = {
        "accuracy": [],
        "precision": [],
        "recall": [],
        "f1_score": [],
        "train_time": [],
        "test_time": []
    }

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_w2v, y), 1):
        # Fold-local names: reusing X_train / X_test / y_train / y_test here
        # would overwrite the hold-out split used by the earlier cells.
        X_fold_train, X_fold_test = X_w2v[train_idx], X_w2v[test_idx]
        y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

        t0 = time()
        clf.fit(X_fold_train, y_fold_train)
        train_duration = time() - t0

        t0 = time()
        y_pred = clf.predict(X_fold_test)
        test_duration = time() - t0

        # 'acc' was printed below without ever being computed (NameError on
        # the first fold).
        acc = accuracy_score(y_fold_test, y_pred)
        prec = precision_score(y_fold_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_fold_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_fold_test, y_pred, average='macro', zero_division=0)

        results["accuracy"].append(acc)
        results["precision"].append(prec)
        results["recall"].append(rec)
        results["f1_score"].append(f1)
        results["train_time"].append(train_duration)
        results["test_time"].append(test_duration)

        print(f"  Fold {fold}: acc={acc:.3f}, f1={f1:.3f}")

    # Store the fold averages for this model.
    all_results[model_name] = {
        "Mean Accuracy": np.mean(results["accuracy"]),
        "Mean Precision": np.mean(results["precision"]),
        "Mean Recall": np.mean(results["recall"]),
        "Mean F1-Score": np.mean(results["f1_score"]),
        "Avg Train Time": np.mean(results["train_time"]),
        "Avg Test Time": np.mean(results["test_time"]),
    }

# Final report as a DataFrame: one row per model, one column per averaged
# measurement.
print("\n" + "="*90)
print("Benchmarking Hasil Rata-Rata (Word2Vec + 10-Fold Cross-Validation):\n")

summary_by_metric = pd.DataFrame(all_results)  # models as columns
results_df = summary_by_metric.transpose()     # models as rows
print(results_df.round(3))
