In [1]:
import os
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [4]:
# Load the labelled utterances from the Excel sheet.
file_path = "data/intent.xlsx"
data = pd.read_excel(file_path)

# Keep only the two columns used downstream. The explicit .copy() makes `data`
# an independent frame, so the later `data["hasil"] = ...` assignment cannot
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
data = data[["text", "intent"]].copy()
data.head()

Unnamed: 0,text,intent
0,halo mlibbot,salam
1,selamat pagi perpustakaan,salam
2,hai selamat siang,salam
3,makasih ya mlibbot,salam
4,terima kasih atas bantuannya,salam


In [5]:
# Confirm which columns were kept, then count the examples per intent label.
print(data.columns)
data.loc[:, "intent"].value_counts()

Index(['text', 'intent'], dtype='object')


intent
salam                        28
tanya_fungsi_mlibbot         25
donasi_buku                  25
layanan_turnitin             25
layanan_ejournal_ebook       25
layanan_ruang_diskusi        25
tata_tertib                  25
info_denda                   25
panduan_perpanjangan         25
panduan_pengembalian         25
panduan_peminjaman           25
lokasi_perpustakaan          25
jam_buka                     25
lokasi_buku_rak              25
cek_ketersediaan_buku        25
cari_buku_isbn_callnumber    25
cari_buku_topik              25
cari_buku_penulis            25
cari_buku_judul              25
akses_repository             25
Name: count, dtype: int64

In [6]:
def preprocess(text: str) -> str:
    """Normalize one utterance before it is vectorized.

    Steps, in order: lower-case, remove URLs, replace every character that is
    not an ASCII digit, an ASCII/Latin-1 letter or whitespace with a space,
    then collapse whitespace runs and trim both ends.

    Args:
        text: Raw utterance. Missing values (``None`` or a float NaN) yield an
            empty string; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned, lower-cased, single-spaced string (possibly empty).
    """
    # A missing cell must not become the literal token "none" / "nan".
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Drop links: anything starting with "http" or "www." up to the next space.
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # A plain À-ÿ range would also admit × (U+00D7) and ÷ (U+00F7), which are
    # symbols rather than letters, so the range is split around them.
    text = re.sub(r"[^0-9a-zA-ZÀ-ÖØ-öø-ÿ\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Store the cleaned version of every utterance in a new `hasil` column and
# preview raw vs. cleaned text side by side.
data["hasil"] = data["text"].map(preprocess)
data.loc[:, ["text", "hasil"]].head(20)

Unnamed: 0,text,hasil
0,halo mlibbot,halo mlibbot
1,selamat pagi perpustakaan,selamat pagi perpustakaan
2,hai selamat siang,hai selamat siang
3,makasih ya mlibbot,makasih ya mlibbot
4,terima kasih atas bantuannya,terima kasih atas bantuannya
5,halo,halo
6,hai,hai
7,hey,hey
8,kamu itu bisa bantu apa aja sih,kamu itu bisa bantu apa aja sih
9,mlibbot fungsinya apa,mlibbot fungsinya apa


In [7]:
# Count missing values per column.
data.isna().sum()

text      0
intent    0
hasil     0
dtype: int64

In [8]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")

In [9]:
# Re-count missing values per column after the drop.
data.isna().sum()

text      0
intent    0
hasil     0
dtype: int64

In [10]:
# Show the frame: raw `text`, its `intent` label and the cleaned `hasil` column.
data

Unnamed: 0,text,intent,hasil
0,halo mlibbot,salam,halo mlibbot
1,selamat pagi perpustakaan,salam,selamat pagi perpustakaan
2,hai selamat siang,salam,hai selamat siang
3,makasih ya mlibbot,salam,makasih ya mlibbot
4,terima kasih atas bantuannya,salam,terima kasih atas bantuannya
...,...,...,...
498,"maaf, maaf, repository menyediakan sitasi otom...",akses_repository,maaf maaf repository menyediakan sitasi otomat...
499,mau tanya tolong jelaskan apakah ada pembatasa...,akses_repository,mau tanya tolong jelaskan apakah ada pembatasa...
500,mohon info bisa ga download file pdf skripsi l...,akses_repository,mohon info bisa ga download file pdf skripsi l...
501,"maaf, maaf, mau tanya kalau mau cari judul ber...",akses_repository,maaf maaf mau tanya kalau mau cari judul berda...


In [11]:
# From here on only the cleaned text and its label are needed.
data = data.loc[:, ["hasil", "intent"]]

In [12]:
# Show the reduced frame: cleaned text (`hasil`) and its label (`intent`).
data

Unnamed: 0,hasil,intent
0,halo mlibbot,salam
1,selamat pagi perpustakaan,salam
2,hai selamat siang,salam
3,makasih ya mlibbot,salam
4,terima kasih atas bantuannya,salam
...,...,...
498,maaf maaf repository menyediakan sitasi otomat...,akses_repository
499,mau tanya tolong jelaskan apakah ada pembatasa...,akses_repository
500,mohon info bisa ga download file pdf skripsi l...,akses_repository
501,maaf maaf mau tanya kalau mau cari judul berda...,akses_repository


In [13]:
# Features are the cleaned sentences, targets the intent labels; both are
# handed to scikit-learn as plain lists of strings.
X, y = (data[column].astype(str).tolist() for column in ("hasil", "intent"))

# Hold out 20% for testing; stratifying on y keeps the label proportions the
# same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

(len(X_train), len(X_test))

(402, 101)

In [14]:
# TF-IDF features feeding a logistic-regression classifier. The vectorizer's
# own lower-casing is switched off and no custom preprocessor is set.
pipe_logreg = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(preprocessor=None, lowercase=False)),
    ("clf", LogisticRegression(max_iter=500, n_jobs=-1)),
])

# Candidates: 2 n-gram ranges x 2 min_df values x 3 values of C.
param_grid_logreg = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__min_df=[1, 2],
    clf__C=[0.1, 1.0, 5.0],
)

# 5-fold cross-validated search over the whole pipeline.
grid_logreg = GridSearchCV(
    estimator=pipe_logreg,
    param_grid=param_grid_logreg,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_logreg.fit(X_train, y_train)

print("Best params (LogReg):", grid_logreg.best_params_)
print("Best CV score (LogReg):", grid_logreg.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params (LogReg): {'clf__C': 1.0, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
Best CV score (LogReg): 0.840925925925926


In [15]:
# Take the best pipeline found by the search and score it on the held-out split.
best_logreg = grid_logreg.best_estimator_
y_pred_logreg = best_logreg.predict(X_test)

acc_logreg = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)
print(f"Test Accuracy (LogReg): {acc_logreg:.3f}\n")

# Per-intent precision / recall / F1 on the same split.
print(
    "Classification Report (LogReg):",
    classification_report(y_true=y_test, y_pred=y_pred_logreg),
    sep="\n",
)

Test Accuracy (LogReg): 0.891

Classification Report (LogReg):
                           precision    recall  f1-score   support

         akses_repository       1.00      0.80      0.89         5
cari_buku_isbn_callnumber       1.00      1.00      1.00         5
          cari_buku_judul       0.60      0.60      0.60         5
        cari_buku_penulis       0.80      0.80      0.80         5
          cari_buku_topik       1.00      0.80      0.89         5
    cek_ketersediaan_buku       0.57      0.80      0.67         5
              donasi_buku       1.00      0.80      0.89         5
               info_denda       1.00      1.00      1.00         5
                 jam_buka       0.71      1.00      0.83         5
   layanan_ejournal_ebook       1.00      1.00      1.00         5
    layanan_ruang_diskusi       1.00      1.00      1.00         5
         layanan_turnitin       1.00      1.00      1.00         5
          lokasi_buku_rak       0.80      0.80      0.80         

In [16]:
# Same TF-IDF front end as the logistic-regression pipeline, with a
# multinomial naive Bayes classifier on top.
pipe_nb = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(preprocessor=None, lowercase=False)),
    ("clf", MultinomialNB()),
])

# Candidates: 2 n-gram ranges x 2 min_df values x 3 smoothing strengths.
param_grid_nb = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__min_df=[1, 2],
    clf__alpha=[0.1, 0.5, 1.0],
)

# 5-fold cross-validated search over the whole pipeline.
grid_nb = GridSearchCV(
    estimator=pipe_nb,
    param_grid=param_grid_nb,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_nb.fit(X_train, y_train)

print("Best params (NB):", grid_nb.best_params_)
print("Best CV score (NB):", grid_nb.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params (NB): {'clf__alpha': 1.0, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
Best CV score (NB): 0.8259876543209878


In [17]:
# Take the best pipeline found by the search and score it on the held-out split.
best_nb = grid_nb.best_estimator_
y_pred_nb = best_nb.predict(X_test)

acc_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print(f"Test Accuracy (Naive Bayes): {acc_nb:.3f}\n")

# Per-intent precision / recall / F1 on the same split.
print(
    "Classification Report (Naive Bayes):",
    classification_report(y_true=y_test, y_pred=y_pred_nb),
    sep="\n",
)

Test Accuracy (Naive Bayes): 0.891

Classification Report (Naive Bayes):
                           precision    recall  f1-score   support

         akses_repository       1.00      0.80      0.89         5
cari_buku_isbn_callnumber       1.00      1.00      1.00         5
          cari_buku_judul       0.75      0.60      0.67         5
        cari_buku_penulis       0.80      0.80      0.80         5
          cari_buku_topik       1.00      0.80      0.89         5
    cek_ketersediaan_buku       0.57      0.80      0.67         5
              donasi_buku       1.00      0.80      0.89         5
               info_denda       1.00      1.00      1.00         5
                 jam_buka       0.71      1.00      0.83         5
   layanan_ejournal_ebook       1.00      1.00      1.00         5
    layanan_ruang_diskusi       1.00      1.00      1.00         5
         layanan_turnitin       1.00      1.00      1.00         5
          lokasi_buku_rak       0.80      0.80      0.8

In [18]:
# Compare the two tuned pipelines on held-out accuracy and persist the winner.
print(f"LogReg Test Accuracy : {acc_logreg:.3f}")
print(f"Naive Bayes Test Accuracy : {acc_nb:.3f}")

# NOTE(review): the winner is picked on test-set accuracy, so that number is no
# longer an unbiased estimate for the chosen model; consider comparing the
# cross-validation scores instead.
# Ties go to logistic regression because of the `>=`.
if acc_logreg >= acc_nb:
    best_model_name = "logreg"
    final_model = best_logreg
else:
    best_model_name = "naive_bayes"
    final_model = best_nb

print("Chosen model:", best_model_name)

model_path = f"vectorstore/intent_model_{best_model_name}.pkl"
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(final_model, model_path)

print("Saved to:", model_path)

LogReg Test Accuracy : 0.891
Naive Bayes Test Accuracy : 0.891
Chosen model: logreg
Saved to: vectorstore/intent_model_logreg.pkl


In [19]:
def predict_intent_sentence(s):
    """Return the intent label predicted for a single raw sentence.

    The sentence is cleaned with `preprocess` and passed as a one-element
    batch to the module-level `final_model`; the first (only) prediction is
    returned.
    """
    cleaned_batch = [preprocess(s)]
    predictions = final_model.predict(cleaned_batch)
    return predictions[0]

# A few hand-written sentences to eyeball the chosen model's predictions.
tests = [
    "jam buka perpustakaan hari sabtu",
    "ada buku basis data fathansyah gak",
    "cara booking ruang diskusi gimana",
    "kalau telat balikin buku dendanya berapa",
    "cara akses e journal dari luar kampus",
    "halo mlibbot",
]

# Print each sentence (as a quoted repr) next to its predicted intent.
for t in tests:
    predicted = predict_intent_sentence(t)
    print(f"{t!r} -> {predicted}")

'jam buka perpustakaan hari sabtu' -> jam_buka
'ada buku basis data fathansyah gak' -> cari_buku_judul
'cara booking ruang diskusi gimana' -> layanan_ruang_diskusi
'kalau telat balikin buku dendanya berapa' -> info_denda
'cara akses e journal dari luar kampus' -> layanan_ejournal_ebook
'halo mlibbot' -> salam
