In [15]:
import os
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


# A notebook has no `__file__`, so the dataset is located relative to the
# kernel's working directory: one level up, file `dataset-50.csv`.
# (`__name__` is a module name, not a path, so it must not be fed to dirname.)
dataset_path = os.path.realpath(os.path.join(os.getcwd(), '..', 'dataset-50.csv'))

In [None]:
df_comments = pd.read_csv(dataset_path)

# Fail fast with a readable message if the CSV lacks the columns that the
# later cells index into ('comment' for the text, 'label' for the target).
missing_columns = {'comment', 'label'} - set(df_comments.columns)
assert not missing_columns, f"dataset is missing required columns: {sorted(missing_columns)}"

df_comments.tail()

In [None]:
# Number of rows per distinct value of the 'label' column.
label_counts = df_comments['label'].value_counts()
label_counts

## Lakukan Preprocessing

In [18]:
# One shared Sastrawi stemmer instance, reused by the preprocessing functions.
stemmer = StemmerFactory().create_stemmer()
stopword_factory = StopWordRemoverFactory()
# Stop-word set: Sastrawi's list merged with NLTK's English list.
combined_stopwords = {
    *stopword_factory.get_stop_words(),
    *stopwords.words('english'),
}

def clean_text(text):
    """Strip everything except ASCII letters and normalise whitespace.

    Punctuation, digits and any other non-letter character (emoji, accented
    letters, ...) are deleted outright rather than replaced by a space, so
    ``"a,b"`` becomes ``"ab"``. Whitespace is collapsed *after* that deletion,
    so the result never contains leading, trailing or doubled spaces.

    Args:
        text: Raw comment. ``None`` and float NaN (e.g. a missing cell in a
            DataFrame column) are treated as an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned string. Letter case is left unchanged.
    """
    if text is None or (isinstance(text, float) and text != text):
        # `text != text` is only true for NaN.
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # A single pass covers punctuation, digits and non-ASCII symbols alike.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse last: deleting a symbol that sat between two spaces would
    # otherwise leave a double space behind.
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_text(text):
    """Normalise one raw comment into a space-separated string of stems.

    Steps, in order: `clean_text`, lower-casing, NLTK `word_tokenize`,
    removal of tokens found in `combined_stopwords`, and stemming of each
    remaining token with the module-level `stemmer`.

    Args:
        text: Raw comment text.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    lowered = clean_text(text).lower()
    kept_tokens = (
        token for token in word_tokenize(lowered)
        if token not in combined_stopwords
    )
    return ' '.join(map(stemmer.stem, kept_tokens))


# Run every raw comment through the preprocessing pipeline and store the
# result next to the original text in a new 'preprocess' column.
preprocessed_comments = df_comments['comment'].apply(preprocess_text)
df_comments['preprocess'] = preprocessed_comments

In [19]:
vectorizer = TfidfVectorizer()

# Hold out 20% of the rows for evaluation; the split is stratified on the
# label column and seeded so it is repeatable.
train_text, test_text, y_train, y_test = train_test_split(
    df_comments['preprocess'],
    df_comments['label'],
    test_size=0.2,
    stratify=df_comments['label'],
    random_state=0,
)

# The vectorizer is fitted on the training text only; the held-out text is
# transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

In [20]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier, fitted on the TF-IDF training matrix.
model = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out rows; later cells read `predict`.
predict = model.predict(X_test)
print(predict)


['positif' 'negatif' 'negatif' 'negatif' 'positif' 'positif' 'negatif'
 'negatif' 'positif' 'negatif']


In [21]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out rows.
y_predict = model.predict(X_test)
report = classification_report(y_test, y_predict)
print(report)

              precision    recall  f1-score   support

     negatif       0.83      1.00      0.91         5
     positif       1.00      0.80      0.89         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg       0.92      0.90      0.90        10



In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

# The two class labels, named once so the scores and the confusion matrix
# are guaranteed to agree on which class counts as "positive".
NEGATIVE_LABEL, POSITIVE_LABEL = 'negatif', 'positif'

# F1-score of the positive class
print('F1-Score : ', f1_score(y_test, predict, pos_label=POSITIVE_LABEL))
# Overall accuracy
print('Accuracy : ', accuracy_score(y_test, predict))
# Precision of the positive class
print('Precision : ', precision_score(y_test, predict, pos_label=POSITIVE_LABEL))
# Recall of the positive class
print('Recall : ', recall_score(y_test, predict, pos_label=POSITIVE_LABEL))


# Confusion matrix. Passing `labels` explicitly fixes the row/column order to
# [negative, positive], so the matrix is always 2x2 and the flattened values
# unpack as tn, fp, fn, tp even if one class is absent from the test split.
tn, fp, fn, tp = confusion_matrix(
    y_test, predict, labels=[NEGATIVE_LABEL, POSITIVE_LABEL]
).ravel()
tn, fp, fn, tp


F1-Score :  0.8888888888888888
Accuracy :  0.9
Precision :  1.0
Recall :  0.8


(np.int64(5), np.int64(0), np.int64(1), np.int64(4))

In [23]:
def cleansing(data):
    """Lower-case a string, blank out punctuation and drop non-ASCII characters.

    Every character in ``string.punctuation`` is replaced by a space so that
    words separated only by punctuation stay separate. Characters outside
    ASCII are removed. Finally every run of whitespace, including newlines, is
    collapsed to one space and the ends are trimmed, so words on different
    lines are not glued together.

    Args:
        data: Raw text (``str``).

    Returns:
        The cleaned, lower-cased, single-spaced ASCII string.
    """
    data = data.lower()

    # Map each punctuation character to a space.
    punctuation = string.punctuation
    translator = str.maketrans(punctuation, ' ' * len(punctuation))
    data = data.translate(translator)

    # Drop everything that cannot be encoded as ASCII.
    data = data.encode('ascii', 'ignore').decode('ascii')

    # Newlines become word separators rather than being deleted.
    return re.sub(r'\s+', ' ', data).strip()

def preprocess_data(data):
    """Turn one raw comment into TF-IDF features ready for ``model.predict``.

    The text goes through `preprocess_text`, the same function that produced
    the 'preprocess' column the vectorizer was fitted on, so inference sees
    tokens built by the same cleaning, stop-word list and stemming as
    training. This also reuses the module-level stemmer instead of building
    new Sastrawi factories on every call.

    Args:
        data: Raw comment text (``str``).

    Returns:
        The result of ``vectorizer.transform`` applied to a one-element list
        holding the preprocessed comment.
    """
    processed = preprocess_text(data)
    # `transform` expects an iterable of documents, hence the list wrapper.
    return vectorizer.transform([processed])

In [24]:
# Classify a single hand-written comment with the trained model.
sample_comment = 'jelek banget kulit kering dan beruntusan, pakek skincare ada rasa gatal dikit'
sample_features = preprocess_data(sample_comment)
model.predict(sample_features)

array(['negatif'], dtype=object)