In [1]:
import joblib
from gensim.models import Word2Vec
import numpy as np
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# --- 1. BUAT ULANG FUNGSI PREPROCESSING ---
# Sangat penting agar fungsi-fungsi ini identik dengan yang digunakan saat pelatihan.

def cleaningText(text):
    """Strip social-media noise, digits and punctuation from *text*.

    The steps run in a fixed order:

    1. Delete ``@mentions`` and ``#hashtags`` (ASCII letters/digits only).
    2. Delete ``RT`` followed by one whitespace character.
    3. Delete anything starting with ``http`` up to the next whitespace.
    4. Delete digit runs, then every character that is neither a word
       character nor whitespace.
    5. Turn newlines into spaces, delete ASCII punctuation, and strip
       leading/trailing space characters (spaces only, not tabs).

    Returns the cleaned string. Deleted spans are replaced with nothing,
    so neighbouring words separated only by punctuation are fused.
    """
    # Order matters: mentions/hashtags/URLs must go before the generic
    # punctuation pass, otherwise their marker characters would be lost.
    # NOTE(review): the `RT[\s]` pattern is not anchored, so it also matches
    # inside words ending in "RT"; kept as-is to stay aligned with training.
    removal_patterns = (
        r'@[A-Za-z0-9]+',
        r'#[A-Za-z0-9]+',
        r'RT[\s]',
        r"http\S+",
        r'[0-9]+',
        r'[^\w\s]',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # `\w` keeps underscores; the punctuation table removes them afterwards.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.replace('\n', ' ').translate(punctuation_table).strip(' ')

def casefoldingText(text):
    """Return *text* with every character converted to lowercase."""
    lowered = text.lower()
    return lowered

# Lookup table mapping slang / abbreviated tokens to their replacement word.
slangwords = {
    "@": "di",
    "abis": "habis",
    "wtb": "beli",
    "masi": "masih",
    "wts": "jual",
    "wtt": "tukar",
    "bgt": "banget",
    "maks": "maksimal",
    "jg": "juga",
    "dgn": "dengan",
    "ga": "tidak",
}


def fix_slangwords(text):
    """Replace known slang tokens in *text* using ``slangwords``.

    The text is split on whitespace; each token is looked up by its
    lowercase form. Tokens without an entry are kept exactly as written.
    The tokens are re-joined with single spaces, so runs of whitespace in
    the input collapse.
    """
    normalized = []
    for token in text.split():
        normalized.append(slangwords.get(token.lower(), token))
    return ' '.join(normalized)

def tokenizingText(text):
    """Tokenize *text* with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

# Combined stopword set: NLTK Indonesian + NLTK English + extra filler words.
listStopwords = set(stopwords.words('indonesian'))
listStopwords1 = set(stopwords.words('english'))
listStopwords |= listStopwords1
listStopwords |= {
    'iya', 'yaa', 'gak', 'nya', 'na', 'sih', 'ku', "di",
    "ga", "ya", "gaa", "loh", "kah", "woi", "woii", "woy",
}


def filteringText(text):
    """Return the tokens of *text* that are not in ``listStopwords``.

    *text* is iterated item by item (the pipeline passes a token list);
    the original order of the surviving tokens is preserved.
    """
    kept = []
    for token in text:
        if token in listStopwords:
            continue
        kept.append(token)
    return kept

def toSentence(list_words):
    """Join the strings in *list_words* into one space-separated string."""
    separator = ' '
    return separator.join(list_words)

def review_to_vector(review_tokens, model):
    """Average the embedding vectors of the tokens known to *model*.

    Args:
        review_tokens: Iterable of tokens to look up.
        model: Object exposing ``wv`` (supports ``in`` and indexing by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors for every token present in
        ``model.wv``, or a zero vector of length ``model.vector_size``
        when none of the tokens are present.
    """
    found = []
    for token in review_tokens:
        if token in model.wv:
            found.append(model.wv[token])

    # No known token: fall back to an all-zero vector of the right length.
    if len(found) == 0:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

# --- 2. MUAT MODEL DAN VECTORIZER ---
# Load every persisted estimator / vectorizer needed by predict_sentiments.
# NOTE(security): joblib.load (and gensim's load) unpickle their input, which
# can execute arbitrary code -- only load model files from a trusted source.
try:
    # Scheme 1: SVM on TF-IDF features (80/20 artifacts)
    svm_model = joblib.load('models/svm_model_tfidf_8020.pkl')
    tfidf_vectorizer_8020 = joblib.load('models/tfidf_vectorizer_8020.pkl')

    # Scheme 2: Random Forest on Word2Vec features (80/20 artifacts)
    rf_model_w2v = joblib.load('models/rf_model_w2v_8020.pkl')
    w2v_model = Word2Vec.load('models/w2v_model_8020.word2vec')

    # Scheme 3: Random Forest on TF-IDF features (70/30 artifacts)
    rf_model_tfidf = joblib.load('models/rf_model_tfidf_7030.pkl')
    tfidf_vectorizer_7030 = joblib.load('models/tfidf_vectorizer_7030.pkl')

    print("Semua model dan vectorizer berhasil dimuat.")
except FileNotFoundError as e:
    print(f"Gagal memuat model: {e}")
    print("Pastikan Anda telah menjalankan skrip penyimpanan di notebook pelatihan.")
    # Re-raise instead of continuing: without the artifacts the names above
    # stay unbound and inference would fail later with a misleading NameError.
    raise

# --- 3. PIPELINE INFERENSI ---
def predict_sentiments(review_text):
    """Preprocess a raw review and print the prediction of each model.

    The review goes through cleaning, lowercasing, slang replacement,
    tokenizing and stopword filtering. The resulting text is then scored
    by the three loaded models (SVM + TF-IDF, RF + Word2Vec, RF + TF-IDF).
    Each predicted label is printed in upper case; nothing is returned.
    """
    # Preprocessing chain, applied innermost-first.
    filtered_tokens = filteringText(
        tokenizingText(
            fix_slangwords(casefoldingText(cleaningText(review_text)))
        )
    )
    final_text_string = toSentence(filtered_tokens)

    divider = "-" * 50
    print(divider)
    print(f"Ulasan Asli: '{review_text}'")
    print(f"Teks Diproses: '{final_text_string}'")
    print(divider)

    # Scheme 1: SVM on TF-IDF features (80/20 artifacts)
    svm_features = tfidf_vectorizer_8020.transform([final_text_string])
    svm_label = svm_model.predict(svm_features)[0]
    print(f"1. Prediksi SVM (TF-IDF 80/20)    : {svm_label.upper()}")

    # Scheme 2: Random Forest on the averaged Word2Vec vector, reshaped to
    # a single-row 2-D array before being passed to predict().
    w2v_features = review_to_vector(filtered_tokens, w2v_model).reshape(1, -1)
    rf_w2v_label = rf_model_w2v.predict(w2v_features)[0]
    print(f"2. Prediksi RF (Word2Vec 80/20)   : {rf_w2v_label.upper()}")

    # Scheme 3: Random Forest on TF-IDF features (70/30 artifacts)
    rf_features = tfidf_vectorizer_7030.transform([final_text_string])
    rf_tfidf_label = rf_model_tfidf.predict(rf_features)[0]
    print(f"3. Prediksi RF (TF-IDF 70/30)     : {rf_tfidf_label.upper()}")
    print("\n")


# --- 4. JALANKAN INFERENSI PADA DATA BARU ---
# Sample raw reviews (Indonesian) used to exercise the inference pipeline.
new_reviews = [
    "Aplikasi ini sangat membantu dan mudah digunakan. Saya suka sekali!",
    "Sering error dan lambat, tolong diperbaiki secepatnya. Sangat mengecewakan.",
    "Aplikasinya biasa saja, tidak ada yang spesial tapi berfungsi."
]

# Run the full pipeline on each sample; predict_sentiments prints its results.
for review in new_reviews:
    predict_sentiments(review)

Semua model dan vectorizer berhasil dimuat.
--------------------------------------------------
Ulasan Asli: 'Aplikasi ini sangat membantu dan mudah digunakan. Saya suka sekali!'
Teks Diproses: 'aplikasi membantu mudah suka'
--------------------------------------------------
1. Prediksi SVM (TF-IDF 80/20)    : POSITIVE
2. Prediksi RF (Word2Vec 80/20)   : POSITIVE
3. Prediksi RF (TF-IDF 70/30)     : POSITIVE


--------------------------------------------------
Ulasan Asli: 'Sering error dan lambat, tolong diperbaiki secepatnya. Sangat mengecewakan.'
Teks Diproses: 'error lambat tolong diperbaiki secepatnya mengecewakan'
--------------------------------------------------
1. Prediksi SVM (TF-IDF 80/20)    : POSITIVE
2. Prediksi RF (Word2Vec 80/20)   : POSITIVE
3. Prediksi RF (TF-IDF 70/30)     : POSITIVE


--------------------------------------------------
Ulasan Asli: 'Aplikasinya biasa saja, tidak ada yang spesial tapi berfungsi.'
Teks Diproses: 'aplikasinya spesial berfungsi'
----------