In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

In [3]:
# Preprocessing function (IMDB - English)
def preprocess_text_english(text):
    # Lowercase text
    text = text.lower()
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('indonesian'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

In [7]:
indonesia_data = pd.read_csv('reviews.csv')
# Preprocessing Indonesian dataset
indonesia_data['cleaned_review'] = indonesia_data['reviews'].apply(preprocess_text_english)

In [47]:
indonesia_data

Unnamed: 0,reviews,label,cleaned_review
0,kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisss...,1.0,kemeja nya bagusss bgttttaaaa mauuu nngisssssk...
1,"Jahitannya sih rapi,cuman ada benang yang ikut...",0.0,jahitannya sih rapicuman benang jahit jelek
2,Sesuai harga. Agak tipis tapi masih oke kok. W...,0.0,sesuai harga tipi oke warnanya abu kalo difoto...
3,"Wah gila sihhh sebagus itu, se worth it, se l...",1.0,gila sihhh sebagu worth it lembut bajunya kira...
4,Kain nya bagus halus \nTapi kok di bukak koto...,0.0,kain nya bagu halu bukak kotor ya warna putih
...,...,...,...
826,Terima kasih barang sudah sampai sesuai ukuran...,1.0,terima kasih barang sesuai ukuran seesuai gamb...
827,Mantapp realpicttt bangttt tapi pengemasan nya...,1.0,mantapp realpicttt bangttt pengemasan nya cuma...
828,"Suka bgt sama tasnya, ga kayak tas local. Kere...",1.0,suka bgt tasnya ga kayak ta local keren parah ...
829,kualitas produk sangat baik. produk original. ...,1.0,kualita produk produk origin harga produk


In [9]:
# Menampilkan beberapa baris pertama dari cleaned_review pada dataset Indonesia
print("\nIndonesian Cleaned Reviews:")
print(indonesia_data[['reviews', 'cleaned_review']].head())


Indonesian Cleaned Reviews:
                                             reviews  \
0  kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisss...   
1  Jahitannya sih rapi,cuman ada benang yang ikut...   
2  Sesuai harga. Agak tipis tapi masih oke kok. W...   
3  Wah gila sihhh sebagus itu, se worth it, se  l...   
4  Kain nya bagus halus  \nTapi kok di bukak koto...   

                                      cleaned_review  
0  kemeja nya bagusss bgttttaaaa mauuu nngisssssk...  
1        jahitannya sih rapicuman benang jahit jelek  
2  sesuai harga tipi oke warnanya abu kalo difoto...  
3  gila sihhh sebagu worth it lembut bajunya kira...  
4      kain nya bagu halu bukak kotor ya warna putih  


In [19]:
# Vectorization: TF-IDF Vectorizer for both datasets
tfidf_vectorizer = TfidfVectorizer()

# Indonesian dataset vectorization
X = tfidf_vectorizer.fit_transform(indonesia_data['cleaned_review'])
y = indonesia_data['label']  # Assume sentiment is labeled as 'positive' or 'negative'

# Menampilkan hasil vektorisasi dari dataset Indonesian
print("\nTF-IDF Vectorization (Indonesian Reviews):")
print(X.toarray())  # Mengubah sparse matrix menjadi array agar bisa dilihat hasilnya
print("\nShape of Indonesian TF-IDF matrix:", X.shape)  # Menampilkan ukuran matrix (jumlah dokumen, jumlah fitur)


TF-IDF Vectorization (Indonesian Reviews):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Shape of Indonesian TF-IDF matrix: (831, 2690)


In [21]:
from sklearn.model_selection import train_test_split

# Memisahkan fitur dan label
y = indonesia_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(f'Train shape: {X_train.shape}, Test shape: {X_test.shape}')

Train shape: (623, 2690), Test shape: (208, 2690)


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

logreg_model = LogisticRegression()  # Inisialisasi model Logistic Regression
logreg_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_logreg = logreg_model.predict(X_test)  # Prediksi data uji

# Create a mapping for labels
label_mapping = {0.0: 'negative', 1.0: 'positive'}

# Update y_test and y_pred to use string labels
y_test_mapped = [label_mapping[label] for label in y_test]
y_pred_mapped = [label_mapping[label] for label in y_pred_logreg]

# Tampilkan hasil evaluasi
print("Akurasi Logistic Regression :", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_logreg, target_names=["negative", "positive"]))

Akurasi Logistic Regression : 0.9038461538461539
              precision    recall  f1-score   support

    negative       0.82      0.99      0.90       108
    positive       0.99      0.77      0.87       100

    accuracy                           0.88       208
   macro avg       0.91      0.88      0.88       208
weighted avg       0.90      0.88      0.88       208



In [37]:
from sklearn.naive_bayes import MultinomialNB

# Inisialisasi model Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_nb = nb_model.predict(X_test)  # Prediksi data uji

# Create a mapping for labels
label_mapping = {0.0: 'negative', 1.0: 'positive'}

# Update y_test and y_pred to use string labels
y_test_mapped_nb = [label_mapping[label] for label in y_test]
y_pred_mapped_nb = [label_mapping[label] for label in y_pred_nb]

# Tampilkan hasil evaluasi
print("Akurasi Naive Bayes:", accuracy_score(y_test_mapped_nb, y_pred_mapped_nb))
print(classification_report(y_test_mapped_nb, y_pred_mapped_nb, target_names=["negative", "positive"]))

Akurasi Naive Bayes: 0.9038461538461539
              precision    recall  f1-score   support

    negative       0.87      0.96      0.91       108
    positive       0.95      0.84      0.89       100

    accuracy                           0.90       208
   macro avg       0.91      0.90      0.90       208
weighted avg       0.91      0.90      0.90       208



In [39]:
from sklearn.tree import DecisionTreeClassifier

# Inisialisasi model Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_dt = dt_model.predict(X_test)  # Prediksi data uji

# Create a mapping for labels
label_mapping = {0.0: 'negative', 1.0: 'positive'}

# Update y_test and y_pred to use string labels
y_test_mapped_dt = [label_mapping[label] for label in y_test]
y_pred_mapped_dt = [label_mapping[label] for label in y_pred_dt]

# Tampilkan hasil evaluasi
print("Akurasi Decision Tree:", accuracy_score(y_test_mapped_dt, y_pred_mapped_dt))
print(classification_report(y_test_mapped_dt, y_pred_mapped_dt, target_names=["negative", "positive"]))


Akurasi Decision Tree: 0.7788461538461539
              precision    recall  f1-score   support

    negative       0.76      0.84      0.80       108
    positive       0.81      0.71      0.76       100

    accuracy                           0.78       208
   macro avg       0.78      0.78      0.78       208
weighted avg       0.78      0.78      0.78       208



In [41]:
from sklearn.ensemble import RandomForestClassifier

# Inisialisasi model Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_rf = rf_model.predict(X_test)  # Prediksi data uji

# Create a mapping for labels
label_mapping = {0.0: 'negative', 1.0: 'positive'}

# Update y_test and y_pred to use string labels
y_test_mapped_rf = [label_mapping[label] for label in y_test]
y_pred_mapped_rf = [label_mapping[label] for label in y_pred_rf]

# Tampilkan hasil evaluasi
print("Akurasi Random Forest:", accuracy_score(y_test_mapped_rf, y_pred_mapped_rf))
print(classification_report(y_test_mapped_rf, y_pred_mapped_rf, target_names=["negative", "positive"]))

Akurasi Random Forest: 0.8557692307692307
              precision    recall  f1-score   support

    negative       0.80      0.95      0.87       108
    positive       0.94      0.75      0.83       100

    accuracy                           0.86       208
   macro avg       0.87      0.85      0.85       208
weighted avg       0.87      0.86      0.85       208



In [43]:
from sklearn.svm import SVC

# Inisialisasi model SVM
svm_model = SVC()
svm_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_svm = svm_model.predict(X_test)  # Prediksi data uji

# Create a mapping for labels
label_mapping = {0.0: 'negative', 1.0: 'positive'}

# Update y_test and y_pred to use string labels
y_test_mapped_svm = [label_mapping[label] for label in y_test]
y_pred_mapped_svm = [label_mapping[label] for label in y_pred_svm]

# Tampilkan hasil evaluasi
print("Akurasi SVM:", accuracy_score(y_test_mapped_svm, y_pred_mapped_svm))
print(classification_report(y_test_mapped_svm, y_pred_mapped_svm, target_names=["negative", "positive"]))

Akurasi SVM: 0.8798076923076923
              precision    recall  f1-score   support

    negative       0.82      0.99      0.90       108
    positive       0.99      0.76      0.86       100

    accuracy                           0.88       208
   macro avg       0.90      0.88      0.88       208
weighted avg       0.90      0.88      0.88       208

