In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re

In [14]:
# Preprocessing function (IMDB - English)
def preprocess_text_english(text):
    # Lowercase text
    text = text.lower()
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return ' '.join(tokens)

In [16]:
import nltk

# Contoh dataset IMDB dan ulasan produk Indonesia
imdb_data = pd.read_csv('IMDB_Dataset.csv')

# Preprocessing IMDB
imdb_data['cleaned_review'] = imdb_data['review'].apply(preprocess_text_english)

In [17]:
# Menampilkan beberapa baris pertama dari cleaned_review pada dataset IMDB
print("IMDB Cleaned Reviews:")
print(imdb_data[['review', 'cleaned_review']].head())

IMDB Cleaned Reviews:
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one review mention watch oz episod youll hook ...  
1  wonder littl product br br film techniqu unass...  
2  thought wonder way spend time hot summer weeke...  
3  basic there famili littl boy jake think there ...  
4  petter mattei love time money visual stun film...  


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# IMDB dataset vectorization
X = tfidf_vectorizer.fit_transform(imdb_data['cleaned_review'])
y = imdb_data['sentiment']  # Assume sentiment is labeled as 'positive' or 'negative'

# Menampilkan hasil vektorisasi dari dataset IMDB
print("TF-IDF Vectorization (IMDB):")
print(X[:100].toarray())  # Mengubah sparse matrix menjadi array agar bisa dilihat hasilnya
print("Shape of IMDB TF-IDF matrix:", X.shape)

TF-IDF Vectorization (IMDB):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Shape of IMDB TF-IDF matrix: (50000, 137558)


In [22]:
from sklearn.model_selection import train_test_split

# Memisahkan fitur dan label
y = imdb_data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(f'Train shape: {X_train.shape}, Test shape: {X_test.shape}')

Train shape: (37500, 137558), Test shape: (12500, 137558)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

logreg_model = LogisticRegression()  # Inisialisasi model Logistic Regression
logreg_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_logreg = logreg_model.predict(X_test)  # Prediksi data uji

# Tampilkan hasil evaluasi
print("Akurasi Logistic Regression:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

Akurasi Logistic Regression: 0.88968
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      6157
    positive       0.88      0.90      0.89      6343

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500



In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

nb_model = MultinomialNB()  # Inisialisasi model Naive Bayes
nb_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_nb = nb_model.predict(X_test)  # Prediksi data uji

# Tampilkan hasil evaluasi
print("Akurasi Naive Bayes :", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Akurasi Naive Bayes : 0.86096
              precision    recall  f1-score   support

    negative       0.84      0.88      0.86      6157
    positive       0.88      0.84      0.86      6343

    accuracy                           0.86     12500
   macro avg       0.86      0.86      0.86     12500
weighted avg       0.86      0.86      0.86     12500



In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

dt_model = DecisionTreeClassifier()  # Inisialisasi model Decision Tree
dt_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_dt = dt_model.predict(X_test)  # Prediksi data uji

# Tampilkan hasil evaluasi
print("Akurasi Decision Tree:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


Akurasi Decision Tree: 0.72032
              precision    recall  f1-score   support

    negative       0.71      0.72      0.72      6157
    positive       0.73      0.72      0.72      6343

    accuracy                           0.72     12500
   macro avg       0.72      0.72      0.72     12500
weighted avg       0.72      0.72      0.72     12500



In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf_model = RandomForestClassifier()  # Inisialisasi model Random Forest
rf_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_rf = rf_model.predict(X_test)  # Prediksi data uji

# Tampilkan hasil evaluasi
print("Akurasi Random Forest:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Akurasi Random Forest: 0.85056
              precision    recall  f1-score   support

    negative       0.84      0.86      0.85      6157
    positive       0.86      0.84      0.85      6343

    accuracy                           0.85     12500
   macro avg       0.85      0.85      0.85     12500
weighted avg       0.85      0.85      0.85     12500



In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

svm_model = SVC()  # Inisialisasi model SVM
svm_model.fit(X_train, y_train)  # Latih model dengan data pelatihan

y_pred_svm = svm_model.predict(X_test)  # Prediksi data uji

# Tampilkan hasil evaluasi
print("Akurasi SVM:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))