In [1]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/209.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [2]:
import requests
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
# Step 1: Muat dataset JSON
url = "https://raw.githubusercontent.com/Nocturnailed-Community/Pamolah-Intelegent/refs/heads/main/Datasets/NLP/augmented_realistic_dataset.json"
response = requests.get(url)

if response.status_code == 200:
    dataset = response.json()
    print("Dataset berhasil diambil dari URL!")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")
    dataset = None

if dataset:
    with open("augmented_realistic_dataset.json", "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    print("Dataset berhasil disimpan ke file lokal.")

Dataset berhasil diambil dari URL!
Dataset berhasil disimpan ke file lokal.


In [4]:
# Step 2: Ambil data dan label
texts = []
labels = []

for entry in dataset:
    # Gabungkan semua fitur teks menjadi satu input
    text = f"{entry['keluhan_umum']} {entry['lokasi_nyeri']} {entry['durasi_masalah']} {entry['gejala_tambahan']}"
    texts.append(text)
    labels.append(entry['kelas'])

# Step 3: Encoding label menjadi format numerik
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Step 4: Preprocess dengan menghilangkan stop words menggunakan Sastrawi dan Stemming
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

def preprocess_text(text):
    # Menghapus stop words
    text_no_stopwords = stopword_remover.remove(text)
    # Stemming
    text_stemmed = stemmer.stem(text_no_stopwords)
    return text_stemmed

# Menggunakan preprocessing (stop words removal dan stemming)
preprocessed_texts = [preprocess_text(text) for text in texts]

In [5]:
# Step 5: TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(
    max_features=15000,  # Gunakan maksimal 15.000 fitur
    ngram_range=(1, 2),  # Gunakan unigram dan bigram
    stop_words='english'  # Tambahkan stopwords tambahan (optional)
)
tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_texts)

# Step 6: Split dataset menjadi train dan test
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, encoded_labels, test_size=0.2, random_state=42)

# Step 7: Grid Search untuk mencari nilai k terbaik dan metrik jarak terbaik
param_grid = {
    'n_neighbors': [3, 5, 7, 9],  # Mencoba berbagai nilai k
    'metric': ['cosine', 'euclidean', 'manhattan']  # Mencoba beberapa metrik
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')  # 5-fold cross-validation
grid_search.fit(X_train, y_train)

# Step 8: Menampilkan parameter terbaik dari Grid Search
print("Best Parameters from Grid Search:", grid_search.best_params_)

Best Parameters from Grid Search: {'metric': 'cosine', 'n_neighbors': 9}


In [6]:
# Step 9: Menggunakan model terbaik dari Grid Search
best_knn_model = grid_search.best_estimator_

# Step 10: Evaluasi model
y_pred = best_knn_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Classification Report:
              precision    recall  f1-score   support

     Abscess       1.00      0.99      0.99       617
      Caries       0.98      1.00      0.99      1400
  Gingivitis       0.99      1.00      0.99      1166
      Normal       0.99      0.86      0.92       420
      Plaque       0.95      0.98      0.96       797

    accuracy                           0.98      4400
   macro avg       0.98      0.97      0.97      4400
weighted avg       0.98      0.98      0.98      4400

Accuracy: 98.07%


In [8]:
import joblib

# Simpan model KNN
joblib.dump(best_knn_model, 'knn_model.pkl')

# Simpan tfidf vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Simpan label encoder
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [9]:
# Memuat kembali model, tfidf vectorizer, dan label encoder
knn_model = joblib.load('knn_model.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Verifikasi jika berhasil memuat model
print("Model, TF-IDF Vectorizer, dan Label Encoder berhasil dimuat!")

Model, TF-IDF Vectorizer, dan Label Encoder berhasil dimuat!


In [10]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Preprocessing text
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

def preprocess_text(text):
    text_no_stopwords = stopword_remover.remove(text)
    text_stemmed = stemmer.stem(text_no_stopwords)
    return text_stemmed

# Input text untuk diuji
text = "saya merasa nyeri di bagian belakang gigi"

# Preprocess input text
preprocessed_text = preprocess_text(text)

# Transformasi teks menjadi fitur tf-idf
input_tfidf = tfidf_vectorizer.transform([preprocessed_text])

# Prediksi label
predicted_label = knn_model.predict(input_tfidf)
predicted_class = label_encoder.inverse_transform(predicted_label)

# Output hasil prediksi
print(f"Hasil prediksi untuk teks: '{text}' adalah kelas '{predicted_class[0]}'")

Hasil prediksi untuk teks: 'saya merasa nyeri di bagian belakang gigi' adalah kelas 'Gingivitis'
