In [2]:
import os

# CSV file the notebook reads its labelled utterances from.
DATA_FILE: str = 'dataset.csv'
# File the fitted pipeline is pickled to at the end of the notebook.
MODEL_FILE: str = 'intent_model.pkl'

In [11]:
import pandas as pd

# Read the labelled utterances; the `intent` column is summarised below.
df = pd.read_csv(DATA_FILE)

# Preview of the first rows.
print(df.head())

# Row total, then the per-intent frequency table, one item per line.
print(
    f"\nTotal Baris: {len(df)}",
    "Distribusi Intent:",
    df['intent'].value_counts(),
    sep="\n",
)

                         text           intent
0                   cancel it   cancel_booking
1     coworking space booking  booking_request
2      Communal 3 on tomorrow  booking_request
3              get communal 5  booking_request
4  stop the booking for Dapur   cancel_booking

Total Baris: 360
Distribusi Intent:
intent
cancel_booking        120
booking_request       120
check_availability    120
Name: count, dtype: int64


In [12]:
import re

def clean_text(text):
    """Normalise one raw utterance before it is vectorised.

    The value is lower-cased, every character other than ``a-z``, ``0-9``,
    ``.`` and whitespace is removed, and the remaining whitespace runs
    (spaces, tabs, newlines) are collapsed to single spaces with the ends
    trimmed.

    Args:
        text: Raw cell value. Non-string values are converted with ``str``.

    Returns:
        The cleaned string, or ``''`` when ``text`` is ``None`` or a float NaN.
    """
    # Missing cells arrive as None or float NaN; str() would otherwise turn
    # them into the literal words "none" / "nan" and feed those to the model.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Keep only (a-z)(0-9)(.) and whitespace.
    text = re.sub(r'[^a-z0-9\.\s]', '', text)
    # Collapse tabs/newlines/repeated blanks so only single spaces remain.
    return ' '.join(text.split())

# Run every raw utterance through clean_text and store the result next to it.
df['cleaned_text'] = df['text'].map(clean_text)

# Spot-check: raw vs. cleaned text for the first three rows.
print(df.loc[:, ['text', 'cleaned_text']].head(3))

                      text             cleaned_text
0                cancel it                cancel it
1  coworking space booking  coworking space booking
2   Communal 3 on tomorrow   communal 3 on tomorrow


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# intent proportions the same in both parts; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'], df['intent'],
    stratify=df['intent'],
    random_state=42,
    test_size=0.2,
)

print(f"Training: {len(X_train)} baris, Testing: {len(X_test)} baris.")

Training: 288 baris, Testing: 72 baris.


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Build the pipeline:
# 1. Turn the text into numeric features (TF-IDF)
# 2. Classify with a linear SVM

pipeline = Pipeline([
    # Use single words and pairs of consecutive words as features.
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    # probability=True so a confidence score can be read from the model later.
    # Those probabilities are fitted with an internally shuffled
    # cross-validation, so the seed is pinned (same value as the train/test
    # split) to keep the scores identical across re-runs.
    ('clf', SVC(kernel='linear', probability=True, random_state=42)),
])

pipeline.fit(X_train, y_train)

In [15]:
from sklearn.metrics import classification_report

# Predict the intent of every held-out utterance.
predictions = pipeline.predict(X_test)

# Per-intent precision / recall / F1; zero_division=0 reports 0 instead of
# warning when a metric would divide by zero.
print(classification_report(y_true=y_test, y_pred=predictions, zero_division=0))

                    precision    recall  f1-score   support

   booking_request       1.00      0.96      0.98        24
    cancel_booking       1.00      1.00      1.00        24
check_availability       0.96      1.00      0.98        24

          accuracy                           0.99        72
         macro avg       0.99      0.99      0.99        72
      weighted avg       0.99      0.99      0.99        72



In [16]:
import pickle

# Serialise the fitted pipeline (vectoriser + classifier) to MODEL_FILE.
print(f"Menyimpan model ke '{MODEL_FILE}'...")
with open(MODEL_FILE, mode='wb') as model_fh:
    pickle.Pickler(model_fh).dump(pipeline)

Menyimpan model ke 'intent_model.pkl'...
