# *Import Library*

In [1]:
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Memuat Data Ulasan

In [2]:
# Load the labelled review spreadsheet into the notebook-wide DataFrame `df`.
input_file = "ulasan akhir.xlsx"
df = pd.read_excel(input_file)

# Force the review column to text so the string-based preprocessing below can run on every row.
# NOTE(review): astype(str) turns missing cells into the literal string 'nan' —
# confirm the sheet contains no empty reviews, or drop them before this line.
df['Ulasan'] = df['Ulasan'].astype(str)

# *Preprocessing*
### *Cleaning* & *Casefolding*

In [3]:
def cleaning_casefolding(ulasan):
    """Strip noise characters from a review and lower-case it.

    Steps, in order: trim leading/trailing spaces; replace the listed
    punctuation, underscores and ASCII digits with a space; replace any
    remaining character that is neither a word character nor whitespace with
    a space; delete single-character words; collapse runs of whitespace into
    one space; lower-case the result.

    Args:
        ulasan: Raw review text.

    Returns:
        The cleaned, lower-cased review.
    """
    # (pattern, replacement) pairs applied one after another.
    substitutions = (
        (r'[?|#@$!_:\"\'.,\(\)\[\]\{\}\+\-\/\*\^\%\=\<\>\&\~\`\;0-9]', ' '),
        (r'[^\w\s]', ' '),
        (r"\b\w\b", ""),
    )
    text = ulasan.strip(" ")
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Collapse whitespace first, then case-fold (same order as before).
    return " ".join(text.split()).lower()
# Clean and case-fold every raw review; the result is kept in its own column.
df['cleaning_casefolding'] = df['Ulasan'].apply(cleaning_casefolding)

### Normalisasi

In [4]:
# Load the slang lexicon once for the whole notebook; normalisasi() reads its
# 'slang' and 'formal' columns.
file_slangwords = "kamus alay.csv"
slangwords = pd.read_csv(file_slangwords, encoding='utf-8')

def normalisasi(ulasan, kamus_slang=None):
    """Replace slang words in a review with their formal equivalents.

    Lexicon rows are applied in order, each as a whole-word substitution, so
    the output of an earlier replacement can itself be rewritten by a later
    entry.

    Args:
        ulasan: Review text to normalise.
        kamus_slang: Optional mapping/table exposing ``'slang'`` and
            ``'formal'`` sequences of equal length. Defaults to the
            notebook-level ``slangwords`` lexicon.

    Returns:
        The review text with every matching slang word replaced.
    """
    kamus = slangwords if kamus_slang is None else kamus_slang
    for slang, formal in zip(kamus['slang'], kamus['formal']):
        # Lexicon entries are literal words, not regular expressions: escape
        # them so characters such as "." or "(" cannot act as regex syntax or
        # make re.sub raise re.error.
        pola = rf"\b{re.escape(str(slang))}\b"
        pengganti = str(formal)
        # A callable replacement inserts the formal word verbatim; a plain
        # string would be parsed as a template (backslashes, group references).
        ulasan = re.sub(pola, lambda _m, teks=pengganti: teks, ulasan)
    return ulasan
# Normalise slang in the cleaned reviews; the result is kept in its own column.
df['normalisasi'] = df['cleaning_casefolding'].apply(normalisasi)

### *Convert Negation*

In [5]:
def convert_negation(ulasan):
    """Mark the word that directly follows a negation word with ``_neg``.

    The negation words are processed one at a time, in a fixed order, each
    with its own pass over the text. A negation word and the next word must be
    separated by exactly one space for the pair to match.

    Args:
        ulasan: Review text (space-separated words).

    Returns:
        The text with ``_neg`` appended to each word that follows a negation.
    """
    # Order matters: each pass sees the output of the previous one.
    negation_words = ("tidak", "tiada", "belum", "jangan", "tanpa", "bukan")
    for negasi in negation_words:
        matcher = re.compile(rf"\b({negasi})\b (\w+)")
        ulasan = matcher.sub(rf"\1 \2_neg", ulasan)
    return ulasan
# Tag negated words in the normalised reviews; the result is kept in its own column.
df['convert_negation'] = df['normalisasi'].apply(convert_negation)

### *Tokenizing*

In [6]:
def tokenizing(ulasan):
    """Split a review into tokens on single spaces, discarding empty tokens.

    Only the space character is a separator; tabs and newlines stay inside
    their token.

    Args:
        ulasan: Review text.

    Returns:
        List of non-empty tokens in their original order.
    """
    # filter(None, ...) drops the empty strings produced by consecutive spaces.
    return list(filter(None, ulasan.split(' ')))
# Tokenise the negation-tagged reviews; each cell of the new column holds a list of tokens.
df['tokenizing'] = df['convert_negation'].apply(tokenizing)

### *Stopword Removal*

In [7]:
# Stopword set loaded on first use; re-running this cell resets it so an
# edited stopword file is picked up again.
_STOPWORDS_CACHE = None


def _load_stopwords():
    """Return the stopword set, reading the Excel file only on the first call."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        daftar = pd.read_excel('stopwords-indonesia.xlsx')['stopwords'].tolist()
        # A frozenset gives constant-time membership tests.
        _STOPWORDS_CACHE = frozenset(daftar)
    return _STOPWORDS_CACHE


def stopword_removal(tokens):
    """Drop every token that appears in the custom Indonesian stopword list.

    The list comes from the ``stopwords`` column of
    ``stopwords-indonesia.xlsx``. It is read once and reused, instead of being
    re-read from disk for every review.

    Args:
        tokens: List of tokens for one review.

    Returns:
        A new list with the stopwords removed, order preserved.
    """
    custom_stopwords = _load_stopwords()
    return [token for token in tokens if token not in custom_stopwords]
# Remove stopwords from every token list; the result is kept in its own column.
df['stopword_removal'] = df['tokenizing'].apply(stopword_removal)

### *Stemming*

In [8]:
# Sastrawi stemmer shared by all calls; created on first use.
_DEFAULT_STEMMER = None


def _get_default_stemmer():
    """Return the shared Sastrawi stemmer, building it on the first call only."""
    global _DEFAULT_STEMMER
    if _DEFAULT_STEMMER is None:
        _DEFAULT_STEMMER = StemmerFactory().create_stemmer()
    return _DEFAULT_STEMMER


def stemming_nazief_andriani(tokens, stemmer=None):
    """Stem every token of a review and join the result into one string.

    Tokens are split on ``_`` so that each part is stemmed separately and then
    re-joined with ``_`` (e.g. a word carrying the ``_neg`` tag keeps its tag
    as a separate part).

    Args:
        tokens: List of tokens for one review.
        stemmer: Optional object with a ``stem(str) -> str`` method. Defaults
            to a single Sastrawi stemmer that is reused across calls rather
            than being rebuilt for every review.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    if stemmer is None:
        stemmer = _get_default_stemmer()
    stemmed_tokens = [
        '_'.join(stemmer.stem(part) for part in token.split('_'))
        for token in tokens
    ]
    return ' '.join(stemmed_tokens)
# Stem every token list; each cell of the new column is a single space-joined string.
df['stemming'] = df['stopword_removal'].apply(stemming_nazief_andriani)

# *Transformation*
### Pembagian Data 80:20

In [9]:
# Features are the stemmed review strings; targets are the sentiment labels.
X = df['stemming'].values
y = df['Sentimen'].values

# 80/20 split, stratified on the label so both parts keep the same class proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22, stratify=y)

# Keep text and label side by side for the later training, export and evaluation cells.
train_df = pd.DataFrame({'stemming': X_train, 'Sentimen': y_train})
test_df = pd.DataFrame({'stemming': X_test, 'Sentimen': y_test})

### Pembobotan *Term* (TF-IDF)

In [10]:
# Learn the vocabulary and IDF weights from the training reviews only.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train)

# Scale each row to unit L2 norm.
# NOTE(review): TfidfVectorizer is documented to use norm='l2' by default, so this
# step is probably redundant — confirm against the installed scikit-learn version.
normalized_tfidf_matrix_train = normalize(tfidf_matrix_train, norm='l2')

# Reuse the fitted vectorizer on the test reviews (transform only, no re-fit).
tfidf_matrix_test = tfidf_vectorizer.transform(X_test)

normalized_tfidf_matrix_test = normalize(tfidf_matrix_test, norm='l2')

# Convert the matrices to dense DataFrames (columns are positional integers).
df_tfidf_train = pd.DataFrame(normalized_tfidf_matrix_train.toarray())
df_tfidf_test = pd.DataFrame(normalized_tfidf_matrix_test.toarray())

# *Data Mining*
### Naive Bayes Tanpa SMOTE

In [11]:
# Baseline: fit Multinomial Naive Bayes on the training TF-IDF features as they are
# (no resampling), then predict the held-out test set.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(df_tfidf_train, train_df['Sentimen'])
y_pred = naive_bayes_classifier.predict(df_tfidf_test)

In [15]:
# Export the baseline predictions next to the true labels.
# Note: the 'Ulasan' column holds the stemmed text, not the raw review.
output_naive_bayes = pd.DataFrame({
    'Ulasan': test_df['stemming'],  
    'Label_Sebenarnya': test_df['Sentimen'],  
    'Prediksi': y_pred
})
output_naive_bayes.to_excel("output_naive_bayes.xlsx", index=False)

### Naive Bayes Menggunakan SMOTE

In [12]:
# Resample the training features and labels with SMOTE; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(df_tfidf_train, train_df['Sentimen'])
# Fit a second Multinomial Naive Bayes model on the resampled data and predict the same test set.
naive_bayes_classifier_smote = MultinomialNB()
naive_bayes_classifier_smote.fit(X_train_resampled, y_train_resampled)
y_pred_smote = naive_bayes_classifier_smote.predict(df_tfidf_test)

In [17]:
# Export the SMOTE-model predictions next to the true labels.
# Note: the 'Ulasan' column holds the stemmed text, not the raw review.
output_naive_bayes_smote = pd.DataFrame({
    'Ulasan': test_df['stemming'],  
    'Label_Sebenarnya': test_df['Sentimen'],  
    'Prediksi': y_pred_smote 
})
output_naive_bayes_smote.to_excel("output_naive_bayes_smote.xlsx", index=False)

# *Interpretation/Evaluation*

In [13]:
# Class order shared by every report below (distribution, precision/recall, confusion matrix).
kelas = ['negatif', 'netral', 'positif']


def cetak_distribusi(judul, sentimen):
    """Print how many samples of each sentiment class a label column holds.

    Args:
        judul: Section title shown between the ``=====`` markers.
        sentimen: Label column to count (compared element-wise with each class name).
    """
    print(f"===== {judul} =====")
    for label in kelas:
        print(f"{label}: {sum(sentimen == label)}")
    print()


def cetak_evaluasi(y_true, y_hat, keterangan, akhiran_kelas=""):
    """Print accuracy, per-class precision/recall and the confusion matrix.

    Args:
        y_true: True labels of the test set.
        y_hat: Predicted labels for the same rows.
        keterangan: Text appended to the accuracy and confusion-matrix headings.
        akhiran_kelas: Optional text inserted after each class name in the
            per-class lines.

    Returns:
        Tuple ``(presisi, recall)`` of per-class arrays ordered like ``kelas``.
    """
    print(f"Akurasi {keterangan}: {accuracy_score(y_true, y_hat):.2f}")

    presisi = precision_score(y_true, y_hat, labels=kelas, average=None, zero_division=0)
    recall = recall_score(y_true, y_hat, labels=kelas, average=None, zero_division=0)
    for i, label in enumerate(kelas):
        print(f"Kelas {label}{akhiran_kelas}: Presisi = {presisi[i]:.2f}, Recall = {recall[i]:.2f}")

    # labels=kelas pins the row/column order to the same order used for precision and recall.
    matriks = confusion_matrix(y_true, y_hat, labels=kelas)
    print(f"\nConfusion Matrix {keterangan}:\n{matriks}")
    return presisi, recall


cetak_distribusi("Data Training", train_df['Sentimen'])
cetak_distribusi("Data Testing", test_df['Sentimen'])

# Baseline model (no resampling).
presisi, recall = cetak_evaluasi(test_df['Sentimen'], y_pred, "tanpa menggunakan SMOTE")
print()

# Model trained on SMOTE-resampled data.
presisi_smote, recall_smote = cetak_evaluasi(
    test_df['Sentimen'], y_pred_smote, "menggunakan SMOTE", akhiran_kelas=" setelah SMOTE"
)


===== Data Training =====
negatif: 244
netral: 115
positif: 1063

===== Data Testing =====
negatif: 61
netral: 29
positif: 266

Akurasi tanpa menggunakan SMOTE: 0.83
Kelas negatif: Presisi = 0.91, Recall = 0.52
Kelas netral: Presisi = 0.00, Recall = 0.00
Kelas positif: Presisi = 0.83, Recall = 1.00

Confusion Matrix tanpa menggunakan SMOTE:
[[ 32   0  29]
 [  2   0  27]
 [  1   0 265]]

Akurasi menggunakan SMOTE: 0.84
Kelas negatif setelah SMOTE: Presisi = 0.79, Recall = 0.75
Kelas netral setelah SMOTE: Presisi = 0.29, Recall = 0.52
Kelas positif setelah SMOTE: Presisi = 0.97, Recall = 0.89

Confusion Matrix menggunakan SMOTE:
[[ 46  14   1]
 [  7  15   7]
 [  5  23 238]]
