In [2]:
# Mount Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [9]:
pip install gensim Sastrawi



In [14]:
# `nltk` must be imported in this cell: no other visible cell imports it, so a
# fresh kernel would otherwise fail here with NameError.
import nltk

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

Klasifikasi dengan algoritma machine learning dan Word2Vec

In [17]:
import joblib
import numpy as np
import pandas as pd
import re
import string
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary
import ipywidgets as widgets
from IPython.display import display

# Load the previously saved Word2Vec model and classifiers.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
word2vec_model_path = '/content/drive/MyDrive/Word2vec/models/word2vec_model.joblib'
model_word2vec = joblib.load(word2vec_model_path)
print(f"Word2Vec model loaded with dimension: {model_word2vec.vector_size}")

models_path = '/content/drive/MyDrive/Word2vec/models/'
classifier_models = {}
classifiers_names = ["Naive Bayes", "SVM", "KNN", "Logistic Regression", "Decision Tree", "Random Forest"]

# Load each saved classifier; the file name is "<display name>_classifier_model.joblib"
for model_name in classifiers_names:
    classifier_path = f"{models_path}{model_name}_classifier_model.joblib"
    classifier = joblib.load(classifier_path)
    classifier_models[model_name] = classifier
    print(f"{model_name} classifier loaded.")

# Load the saved LabelEncoder (used to map predicted class ids back to labels)
label_encoder_path = '/content/drive/MyDrive/Word2vec/models/label_encoder.joblib'
label_encoder = joblib.load(label_encoder_path)
print(f"LabelEncoder loaded from: {label_encoder_path}")

# Build kata_normalisasi_dict from the dictionary file: the first whitespace-separated
# field of each line maps to the second; lines with fewer than two fields are skipped.
kata_normalisasi_dict = {}
with open('/content/drive/MyDrive/kamus.txt') as kamus:
    word_lines = kamus.readlines()

for line in word_lines:
    parts = line.split()  # split() already ignores surrounding whitespace and blank lines
    if len(parts) < 2:
        continue
    kata_normalisasi_dict[parts[0]] = parts[1]

# Tokenization helper based on str.split (whitespace tokenization)
def word_tokenize_wrapper(text):
    """Split ``text`` on runs of whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens

# Preprocessing functions (same as in the referenced repo)
def filtering_text(text):
    """Lowercase ``text`` and reduce it to space-separated ASCII letters.

    Steps, in order: lowercase; drop http(s) URLs; replace @mentions, #hashtags
    and any remaining ``scheme://`` links with a space; drop ``b'`` sequences;
    replace every non-letter with a space; collapse whitespace.

    Notes:
        * The mention/hashtag pattern only matches letters and digits, so a
          handle such as ``@user_name`` is removed only up to the underscore.
        * After the letters-only substitution the digit and punctuation steps
          have nothing left to remove; they are kept so the pipeline stays
          identical to the one this code was copied from.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    lowered = text.lower()

    # URLs are deleted outright (no space is inserted in their place)
    without_urls = re.sub(r'https?:\/\/\S+', '', lowered)

    # Mentions, hashtags and other scheme:// links become a single space
    without_handles = re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", without_urls)
    cleaned = ' '.join(without_handles.split())

    # Remaining regex clean-ups, applied in the original order
    ordered_substitutions = (
        (r'(b\'{1,2})', ""),   # "b'" sequences
        ('[^a-zA-Z]', ' '),    # anything that is not an ASCII letter
        (r'\d+', ''),          # digits
    )
    for pattern, replacement in ordered_substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Punctuation
    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    # Collapse whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', cleaned).strip()

def preprocess_text(text):
    """Clean ``text`` with ``filtering_text`` and pass the result through ``stop_stem``."""
    return stop_stem(filtering_text(text))

# Cache of Sastrawi tools keyed by stopword-file path, so the file is read and
# the stemmer is built once per path instead of on every call.
_STOP_STEM_TOOLS = {}


def _load_stop_stem_tools(stopword_path):
    """Return the cached ``(stopword_remover, stemmer)`` pair for ``stopword_path``, building it on first use."""
    if stopword_path not in _STOP_STEM_TOOLS:
        # Each line of the file (newline removed) becomes one stopword entry
        with open(stopword_path) as kamus:
            list_stopword = [line.replace('\n', "") for line in kamus]

        stopword = StopWordRemover(ArrayDictionary(list_stopword))
        stemmer = StemmerFactory().create_stemmer()
        _STOP_STEM_TOOLS[stopword_path] = (stopword, stemmer)
    return _STOP_STEM_TOOLS[stopword_path]


def stop_stem(text, stopword_path='/content/drive/MyDrive/kamus.txt'):
    """Remove stopwords from ``text`` and stem the result with Sastrawi.

    Args:
        text: The string to process.
        stopword_path: Path of the stopword list, one entry per line. Defaults
            to the path that was previously hard-coded in this function.

    Returns:
        The text after stopword removal followed by stemming.

    Notes:
        The stopword file is read once per path and then cached for the life
        of the kernel; re-run this cell to pick up changes to the file.

        NOTE(review): the default path is the same ``kamus.txt`` that this
        notebook also parses as a two-column normalization dictionary, and
        every whole line is used as a single stopword entry here — confirm
        this is the intended stopword list.
    """
    stopword, stemmer = _load_stop_stem_tools(stopword_path)

    # Remove stopwords first, then stem what is left
    text = stopword.remove(text)
    return stemmer.stem(text)

def normalisasi_kata(document, kata_normalisasi_dict):
    """Replace each term of ``document`` with its dictionary entry, keeping unknown terms as they are.

    Args:
        document: Iterable of tokens.
        kata_normalisasi_dict: Mapping from a token to its normalized form.

    Returns:
        A new list with one (possibly replaced) token per input token.
    """
    normalized = []
    for term in document:
        normalized.append(kata_normalisasi_dict.get(term, term))
    return normalized

# Function for vectorizing text
def vectorize_text(text, model_word2vec):
    """Average the Word2Vec vectors of the whitespace-separated words in ``text``.

    Words that are not keys of ``model_word2vec.wv.key_to_index`` are ignored.

    Returns:
        The element-wise mean of the known word vectors, or a zero vector of
        length ``model_word2vec.vector_size`` when no word is known.
    """
    known_vectors = []
    for token in text.split():
        if token in model_word2vec.wv.key_to_index:
            known_vectors.append(model_word2vec.wv[token])

    if not known_vectors:
        return np.zeros(model_word2vec.vector_size)
    return np.mean(known_vectors, axis=0)

def preprocess_and_normalize(text, kata_normalisasi_dict):
    """Preprocess ``text``, tokenize it, normalize each token and join the tokens with single spaces.

    Args:
        text: Raw input string.
        kata_normalisasi_dict: Mapping passed on to ``normalisasi_kata``.

    Returns:
        The normalized tokens joined into one string.
    """
    tokens = word_tokenize_wrapper(preprocess_text(text))
    return ' '.join(normalisasi_kata(tokens, kata_normalisasi_dict))

# Function for prediction
def predict_class(text, classifier_models, model_word2vec, kata_normalisasi_dict, label_encoder):
    """Classify ``text`` with every model in ``classifier_models``.

    The text is preprocessed and normalized, averaged into a single Word2Vec
    vector, and passed to each classifier as a one-row feature matrix.

    Args:
        text: Raw input string.
        classifier_models: Mapping of display name -> fitted classifier.
        model_word2vec: Word2Vec model used by ``vectorize_text``.
        kata_normalisasi_dict: Normalization dictionary for preprocessing.
        label_encoder: Encoder whose ``inverse_transform`` maps a predicted
            class id back to its original label.

    Returns:
        A dict keyed by model name. Each value is a dict with 'prediction'
        (the decoded label, or "Unknown Label" when decoding raises
        ``ValueError``) and 'probability' (the per-class probabilities, or
        'N/A' for models that have no ``predict_proba``).
    """
    cleaned_text = preprocess_and_normalize(text, kata_normalisasi_dict)
    features = vectorize_text(cleaned_text, model_word2vec).reshape(1, -1)

    results = {}
    for model_name, model in classifier_models.items():
        # Probabilities are only available on models that expose predict_proba
        if hasattr(model, "predict_proba"):
            probability = model.predict_proba(features)[0]
        else:
            probability = 'N/A'

        predicted_id = model.predict(features)[0]
        try:
            decoded_label = label_encoder.inverse_transform([predicted_id])[0]
        except ValueError as e:
            print(f"Error during inverse_transform: {e}")
            decoded_label = "Unknown Label"

        results[model_name] = {'prediction': decoded_label, 'probability': probability}

    return results

# Interactive input and button to trigger classification
input_text = widgets.Textarea(value='', placeholder='Enter a sentence to classify...', description='Input Text:', layout=widgets.Layout(width='80%', height='100px'))
classify_button = widgets.Button(description="Classify Text", layout=widgets.Layout(width='20%', height='40px'))
output = widgets.Output()
preprocessed_output = widgets.Output()  # Output for preprocessed text

def on_button_click(
    b,
    _input_text=input_text,
    _output=output,
    _preprocessed_output=preprocessed_output,
    _predict_class=predict_class,
):
    """Classify the text in the textarea with every Word2Vec-based classifier.

    Args:
        b: The button instance passed by ipywidgets (unused).
        _input_text, _output, _preprocessed_output, _predict_class: Bound as
            defaults when this cell runs. The IndoBERT cell of this notebook
            rebinds the global names ``input_text``, ``output`` and
            ``predict_class``; binding them here keeps this button wired to
            the widgets and predictor of its own cell whichever cell ran last.
    """
    text = _input_text.value

    # Clear both panels on every click so results never pile up
    _output.clear_output()
    _preprocessed_output.clear_output()

    # Nothing to classify for empty or whitespace-only input
    if not text.strip():
        return

    with _preprocessed_output:
        preprocessed_text = preprocess_and_normalize(text, kata_normalisasi_dict)
        print(f"\n----- Original Text -----\n{text}")
        print(f"\n----- Preprocessed Text -----\n{preprocessed_text}")

    with _output:
        print(f"Classifying: {text}")
        results = _predict_class(text, classifier_models, model_word2vec, kata_normalisasi_dict, label_encoder)
        for model_name, result in results.items():
            print(f"\n{model_name} Prediction: {result['prediction']}")
            print(f"Probability: {result['probability']}")

classify_button.on_click(on_button_click)

# Display widgets
display(input_text, classify_button, preprocessed_output, output)


Word2Vec model loaded with dimension: 100
Naive Bayes classifier loaded.
SVM classifier loaded.
KNN classifier loaded.
Logistic Regression classifier loaded.
Decision Tree classifier loaded.
Random Forest classifier loaded.
LabelEncoder loaded from: /content/drive/MyDrive/Word2vec/models/label_encoder.joblib


Textarea(value='', description='Input Text:', layout=Layout(height='100px', width='80%'), placeholder='Enter a…

Button(description='Classify Text', layout=Layout(height='40px', width='20%'), style=ButtonStyle())

Output()

Output()

Klasifikasi dengan IndoBERT + Augmentasi

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import ipywidgets as widgets
from IPython.display import display

# 1. Load the saved fine-tuned IndoBERT model
fine_tuned_model_path = '/content/drive/MyDrive/IndoBERT/fine_tuned_indobert1'
# NOTE: this rebinds `model_name`, which the Word2Vec cell also uses as its loop variable.
model_name = "indobenchmark/indobert-base-p1"
# The tokenizer comes from the base checkpoint name, while the weights are read
# from the fine-tuned directory.
# NOTE(review): presumably the vocabulary was not changed during fine-tuning — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_path)

print(f"Fine-tuned IndoBERT model loaded from {fine_tuned_model_path}")

# 2. Tokenize the input text
def tokenize_text(text, tokenizer):
    """Call ``tokenizer`` on ``text`` with truncation, padding, a 512 max length and PyTorch tensors.

    Returns:
        Whatever ``tokenizer`` returns for those options.
    """
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": 512,
        "return_tensors": 'pt',
    }
    return tokenizer(text, **encode_options)

# 3. Function to make predictions and show results with probabilities
def predict_class(text, model, tokenizer):
    """Classify ``text`` with the sequence-classification ``model``.

    The model and the tokenized inputs are moved to CUDA when it is available,
    otherwise to the CPU, and the forward pass runs without gradient tracking.

    Args:
        text: Input string to classify.
        model: Model whose output exposes ``.logits``.
        tokenizer: Tokenizer passed on to ``tokenize_text``.

    Returns:
        A tuple ``(prediction, pred_prob, probs)``: the label name
        ("Non-Radikal" for class 0, "Radikal" for class 1), the softmax
        probability of the predicted class, and the list of probabilities for
        all classes of the first (only) row.
    """
    label_map = {0: "Non-Radikal", 1: "Radikal"}

    # Pick the device, then move both the model and every input tensor onto it
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoded = tokenize_text(text, tokenizer)
    model.to(device)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass; softmax over the last dimension turns logits into probabilities
    with torch.no_grad():
        probs = softmax(model(**inputs).logits, dim=-1)

    pred_id = torch.argmax(probs, dim=-1).item()
    confidence = probs[0, pred_id].item()

    return label_map[pred_id], confidence, probs[0].tolist()

# Input text widget
input_text = widgets.Textarea(
    value='',
    placeholder='Enter a sentence to classify...',
    description='Input Text:',
    layout=widgets.Layout(width='80%', height='100px')
)

# Button to trigger prediction
classify_button = widgets.Button(description="Classify Text", layout=widgets.Layout(width='20%', height='40px'))

# Output display for results
output = widgets.Output()

# 4. Function for interactive input and displaying the result
def on_button_click(
    b,
    _input_text=input_text,
    _output=output,
    _predict_class=predict_class,
    _model=model,
    _tokenizer=tokenizer,
):
    """Classify the text in the textarea with the fine-tuned IndoBERT model.

    Args:
        b: The button instance passed by ipywidgets (unused).
        _input_text, _output, _predict_class, _model, _tokenizer: Bound as
            defaults when this cell runs. The Word2Vec cell of this notebook
            defines its own globals named ``input_text``, ``output`` and
            ``predict_class`` (with a different signature); binding them here
            keeps this button wired to the widgets and predictor of its own
            cell whichever cell ran last.
    """
    with _output:
        _output.clear_output()  # Clear previous output
        text = _input_text.value  # Get the new input text
        if text:
            print(f"Classifying: {text}")
            prediction, pred_prob, probs = _predict_class(text, _model, _tokenizer)
            print(f"Predicted Label: {prediction}")
            print(f"Prediction Confidence: {pred_prob:.4f}")
            print(f"Probabilities: Non-Radikal: {probs[0]:.4f}, Radikal: {probs[1]:.4f}")

# Link button click to handler function
classify_button.on_click(on_button_click)

# Display widgets
display(input_text, classify_button, output)


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Fine-tuned IndoBERT model loaded from /content/drive/MyDrive/IndoBERT/fine_tuned_indobert1


Textarea(value='', description='Input Text:', layout=Layout(height='100px', width='80%'), placeholder='Enter a…

Button(description='Classify Text', layout=Layout(height='40px', width='20%'), style=ButtonStyle())

Output()