# **Logistic Regression - TFIDF**

In [110]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

df = pd.read_csv("data manual.csv")
X = df["Ulasan_Bersih"]
y = df["Labeling"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF
tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Model
model = LogisticRegression(max_iter=200)
model.fit(X_train_vec, y_train)

# Prediksi
y_pred = model.predict(X_test_vec)

# Evaluasi
print("Akurasi :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall   :", recall_score(y_test, y_pred, average='macro'))
print("F1-Score :", f1_score(y_test, y_pred, average='macro'))

Akurasi : 0.7142857142857143
Precision: 0.5005973715651134
Recall   : 0.6296296296296297
F1-Score : 0.5496296296296296


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Logistic Regression - CountVectorizer**

In [111]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

count = CountVectorizer()
X_train_vec = count.fit_transform(X_train)
X_test_vec = count.transform(X_test)

model = LogisticRegression(max_iter=200)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall   :", recall_score(y_test, y_pred, average='macro'))
print("F1-Score :", f1_score(y_test, y_pred, average='macro'))

Akurasi : 0.7551020408163265
Precision: 0.8260869565217391
Recall   : 0.6973684210526315
F1-Score : 0.6799845141308555


# **Logistic Regression - Word2Vec**

In [112]:
from gensim.models import Word2Vec
import re
import numpy as np

# Preprocessing dikit
def clean(t):
    t = t.lower()
    t = re.sub(r'[^a-z0-9\s]', ' ', t)
    return t.split()
tokens = X.map(clean)

w2v = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1)

def avg_vec(words):
    vecs = [w2v.wv[w] for w in words if w in w2v.wv]
    if len(vecs) == 0:
        return np.zeros(100)
    return np.mean(vecs, axis=0)

X_train_vec = np.vstack([avg_vec(t) for t in X_train])
X_test_vec = np.vstack([avg_vec(t) for t in X_test])

model = LogisticRegression(max_iter=300)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall   :", recall_score(y_test, y_pred, average='macro'))
print("F1-Score :", f1_score(y_test, y_pred, average='macro'))



Akurasi : 0.6122448979591837
Precision: 0.4169590643274854
Recall   : 0.5389863547758285
F1-Score : 0.465526751241037


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Support Vector Machine (SVM) - TFIDF**

In [113]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

df = pd.read_csv("data manual.csv")
X = df["Ulasan_Bersih"]
y = df["Labeling"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

model = SVC(kernel="linear")
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.7142857142857143
Precision: 0.6698595146871008
Recall   : 0.6398635477582846
F1-Score : 0.5939153439153438


# **Support Vector Machine (SVM) - CountVectorizer**

In [114]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
X_train_vec = count.fit_transform(X_train)
X_test_vec = count.transform(X_test)

model = SVC(kernel="linear")
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.6938775510204082
Precision: 0.6525925925925926
Recall   : 0.6437621832358674
F1-Score : 0.6269292548362316


# **Support Vector Machine (SVM) - Word2Vec (Average Embedding)**

In [115]:
from gensim.models import Word2Vec
import numpy as np
import re

def clean(t):
    t = re.sub(r'[^a-z0-9\s]', ' ', t.lower()).split()
    return t

tokens = X.map(clean)
X_train_tok, X_test_tok, y_train, y_test = train_test_split(tokens, y, test_size=0.2, random_state=42)

w2v = Word2Vec(sentences=X_train_tok, vector_size=100, min_count=1)

def avg_vec(words):
    vecs = [w2v.wv[w] for w in words if w in w2v.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(100)

X_train_vec = np.vstack([avg_vec(t) for t in X_train_tok])
X_test_vec = np.vstack([avg_vec(t) for t in X_test_tok])

model = SVC(kernel="linear")
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.6326530612244898
Precision: 0.5045045045045046
Recall   : 0.5555555555555555
F1-Score : 0.4928571428571429


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Nive Bayes - CountVectorizer**

In [116]:
from sklearn.naive_bayes import MultinomialNB

count = CountVectorizer()
X_train_vec = count.fit_transform(X_train)
X_test_vec = count.transform(X_test)

model = MultinomialNB()
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.7346938775510204
Precision: 0.7584033613445378
Recall   : 0.6769005847953217
F1-Score : 0.6660334346504558


# **Nive Bayes - TFIDF**

In [117]:
tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

model = MultinomialNB()
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.6938775510204082
Precision: 0.49203431372549017
Recall   : 0.6111111111111112
F1-Score : 0.5340802987861811


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **GaussianNB - Word2Vec**

In [118]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import numpy as np
from gensim.models import Word2Vec
import re

def clean(t):
    t = str(t) # mastiin teks dalam bentuk string
    t = re.sub(r'[^a-z0-9\s]', ' ', t.lower()).split()
    return t

X_train_tok_local = X_train.apply(clean)
X_test_tok_local = X_test.apply(clean)

w2v_local = Word2Vec(sentences=X_train_tok_local, vector_size=100, min_count=1)

def avg_vec_local(words):
    vecs = [w2v_local.wv[w] for w in words if w in w2v_local.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(100)

X_train_w2v_local = np.vstack([avg_vec_local(t) for t in X_train_tok_local])
X_test_w2v_local = np.vstack([avg_vec_local(t) for t in X_test_tok_local])

gnb = GaussianNB()
gnb.fit(X_train_w2v_local, y_train)

pred = gnb.predict(X_test_w2v_local)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.6122448979591837
Precision: 0.5833333333333334
Recall   : 0.5594541910331384
F1-Score : 0.5386762360446571


# **Random Forest - TFIDF**

In [119]:
from sklearn.ensemble import RandomForestClassifier

tfidf = TfidfVectorizer()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

model = RandomForestClassifier(n_estimators=200)
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.673469387755102
Precision: 0.4568965517241379
Recall   : 0.5964912280701754
F1-Score : 0.5117294053464266


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Random Forest - CountVectorizer**

In [120]:
count = CountVectorizer()
X_train_vec = count.fit_transform(X_train)
X_test_vec = count.transform(X_test)

model = RandomForestClassifier(n_estimators=200)
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.673469387755102
Precision: 0.4568965517241379
Recall   : 0.5964912280701754
F1-Score : 0.5117294053464266


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Random Forest - Word2Vec**

In [121]:
from sklearn.ensemble import RandomForestClassifier

# pake embedding dari model SVM-W2V tadi
model = RandomForestClassifier(n_estimators=200)
model.fit(X_train_vec, y_train)

pred = model.predict(X_test_vec)
print("Akurasi :", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, average='macro'))
print("Recall   :", recall_score(y_test, pred, average='macro'))
print("F1-Score :", f1_score(y_test, pred, average='macro'))

Akurasi : 0.6530612244897959
Precision: 0.45251058681185724
Recall   : 0.5789473684210527
F1-Score : 0.5009331840238894
