In [None]:
import pandas as pd

# Load the raw reviews from CSV.
# NOTE(review): absolute path — looks like a Colab upload location; adjust when running elsewhere.
df_review = pd.read_csv("/content/review.csv")

In [None]:
# Preview the last rows of the raw data.
df_review.tail()

In [None]:
# Number of reviews per distinct "Star" value.
df_review["Star"].value_counts()

In [None]:
# Binary target: 1 for 5-star reviews, 0 for every other rating.
# A vectorised comparison replaces the row-by-row iterrows() loop and gives
# the same 0/1 values (a missing "Star" still maps to 0).
label = (df_review["Star"] == 5).astype(int)

df_review["label"] = label
# "Star" is now encoded in "label", so the original column is dropped.
df_review = df_review.drop(columns=['Star'])
df_review.tail()


In [None]:
# Class balance of the binary label before any resampling.
df_review['label'].value_counts()

In [None]:
# Work on an independent copy of the labelled data, without the 'Date' and
# 'Name' columns, so df_review itself stays untouched by later steps.
df_preprocessed = df_review.copy().drop(columns=['Date', 'Name'])
df_preprocessed.head()

In [None]:
# Number of rows drawn per class, and the seed that makes the draw repeatable.
SAMPLES_PER_CLASS = 25000
SAMPLING_SEED = 30


def _sample_class(frame, label_value, n=SAMPLES_PER_CLASS, seed=SAMPLING_SEED):
    """Draw `n` rows of `frame` whose 'label' column equals `label_value`.

    Rows are drawn without replacement whenever the class holds at least `n`
    rows, so no artificial duplicates are created. Only a class smaller than
    `n` is oversampled with replacement.

    NOTE(review): when oversampling does happen, the duplicated rows can later
    fall on both sides of a train/test split and inflate the test metrics;
    splitting before resampling would avoid that.
    """
    group = frame[frame['label'] == label_value]
    return group.sample(n, replace=len(group) < n, random_state=seed)


s_1 = _sample_class(df_preprocessed, 0)
s_2 = _sample_class(df_preprocessed, 1)
df_preprocessed = pd.concat([s_1, s_2])

# Sanity check: total shape and the resulting class proportions.
print(df_preprocessed.shape)
print(df_preprocessed['label'].value_counts(normalize=True))

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def _show_wordcloud(text, colormap):
    """Render `text` as a 1000x1000 word cloud in `colormap`, show it, and return the WordCloud."""
    cloud = WordCloud(width=1000, height=1000, colormap=colormap,
                      background_color='white', mode='RGBA').generate(text)
    plt.figure(figsize=(20, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
    return cloud


# label == 0: negative reviews, drawn in red
train_s0 = df_preprocessed[df_preprocessed["label"] == 0]
all_text_s0 = ' '.join(train_s0["Comment"])
wordcloud = _show_wordcloud(all_text_s0, 'Reds')

# label == 1: positive reviews, drawn in blue
train_s1 = df_preprocessed[df_preprocessed["label"] == 1]
all_text_s1 = ' '.join(train_s1["Comment"])
# `wordcloud` ends the cell bound to the positive-class cloud, exactly as before.
wordcloud = _show_wordcloud(all_text_s1, 'Blues')

In [None]:
# Keep the `words_` attribute of the most recently generated cloud (the label == 1 one).
# NOTE(review): presumably a word -> relative-frequency mapping — confirm against the wordcloud docs.
wf=wordcloud.words_

In [None]:
import string, re

# Translation table mapping every ASCII punctuation character to a space;
# built once here instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def cleansing(data):
    """Normalise one raw review text before stop-word removal and stemming.

    Steps, in order: lowercase, replace ASCII punctuation with spaces, drop
    every non-ASCII character (emoji, accented letters, ...), and turn
    newlines into spaces. Runs of spaces are left as they are.

    Args:
        data: The review text. ``None`` and float NaN (what pandas yields for
            an empty CSV cell) are treated as an empty review; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned, lowercase, ASCII-only string.
    """
    # Missing values used to crash on `.lower()`; treat them as empty text.
    if data is None or (isinstance(data, float) and data != data):
        return ""

    # Lowercase first: some non-ASCII characters lowercase to ASCII ones.
    data = str(data).lower()

    # Punctuation becomes a space so adjacent words are not glued together.
    data = data.translate(_PUNCT_TO_SPACE)

    # 'ignore' discards everything outside ASCII, so no further non-ASCII
    # filtering is needed afterwards.
    data = data.encode('ascii', 'ignore').decode('ascii')

    # Flatten multi-line reviews onto a single line.
    return data.replace('\n', ' ')

In [None]:
# Run the cleansing step over every comment.
# A list comprehension over the column replaces the slow iterrows() loop; the
# list is assigned back positionally, so index labels play no role.
review = [cleansing(text) for text in df_preprocessed["Comment"]]

df_preprocessed["Comment"] = review
df_preprocessed.head()

In [None]:
!pip install Sastrawi

In [None]:
# import library
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
 
# Build the Sastrawi stop-word remover; `stopword` is reused on the dataset in a later cell.
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# Example: remove stop words from a single sentence and print the result.
kalimat = 'Dengan Menggunakan Python dan Library Sastrawi saya dapat melakukan proses Stopword Removal'
stop = stopword.remove(kalimat)
print(stop)

In [None]:
# Apply stop-word removal to every comment in our data.
# A list comprehension over the column replaces the slow iterrows() loop; the
# list is assigned back positionally, so index labels play no role.
review = [stopword.remove(text) for text in df_preprocessed["Comment"]]

df_preprocessed["Comment"] = review
df_preprocessed.head()

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build the Sastrawi stemmer; `stemmer` is reused on the dataset in a later cell.
# Note that `factory` is rebound here from the stop-word factory to the stemmer factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Example: stem a single sentence and print the result.
kalimat = 'Liverpool adalah klub hebat tidak seperti si itu WkwkWK'
katadasar = stemmer.stem(kalimat)

print(katadasar)

In [None]:
# Apply stemming to every comment in our data.
# A list comprehension over the column replaces the slow iterrows() loop; the
# list is assigned back positionally, so index labels play no role.
review = [stemmer.stem(text) for text in df_preprocessed["Comment"]]

df_preprocessed["Comment"] = review
df_preprocessed.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; stratifying on the label keeps the class
# proportions equal in both splits, and the fixed random_state makes the split repeatable.
# NOTE(review): df_preprocessed is resampled per class before this split; any duplicated
# rows can end up in both train and test, which would inflate the test metrics —
# consider splitting before resampling.
X_train, X_test, y_train, y_test = train_test_split(df_preprocessed['Comment'], df_preprocessed['label'], 
                                                    test_size=0.1, stratify=df_preprocessed['label'], random_state=30)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus used only to illustrate what the TF-IDF vectorizer produces.
corpus = [
    'Pusing satu kali',
    'Pusing dua kali',
    'Pusing tiga kali',
    'Pusing lagi']

# This same vectorizer instance is fitted again on the real training text in a later cell.
vectorizer = TfidfVectorizer()

# Example: fit on the toy corpus and display the resulting matrix as a dense array.
X = vectorizer.fit_transform(corpus)
X.toarray()

In [None]:
# Apply TF-IDF to our documents: the vectorizer is fitted on the training text only,
# and the test text is transformed with that fitted state.
# NOTE: X_train / X_test are overwritten with their vectorised form, so re-running this
# cell would feed the already-vectorised matrices back into the vectorizer; re-run the
# train/test split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(X_train.shape)
print(X_test.shape)

In [None]:
def preprocess_data(data):
    """Turn one raw review string into TF-IDF features for prediction.

    Applies the same steps, in the same order, as the training data:
    `cleansing`, stop-word removal, stemming, then the TF-IDF `vectorizer`.
    The module-level `stopword` and `stemmer` objects are reused, so the
    Sastrawi factories are not rebuilt on every call and inference uses the
    very instances that prepared the training text.

    Args:
        data: Raw review text.

    Returns:
        The result of ``vectorizer.transform`` on a one-element list holding
        the processed text.
    """
    # Cleansing, stop-word removal and stemming, as applied to the dataset.
    text = cleansing(data)
    text = stopword.remove(text)
    text = stemmer.stem(text)

    # TF-IDF features; `vectorizer` must already be fitted on the training text.
    return vectorizer.transform([text])

In [None]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

# Support vector classifier with a linear kernel; it is fitted in a later cell.
clf = svm.SVC(kernel="linear")

# Optional 10-fold cross-validation on the training split (left disabled):
#cross_val_score(clf, X_train, y_train, cv=10)

In [None]:
# Fit the classifier on the training features, then predict labels for the test features.
clf.fit(X_train,y_train)
predict = clf.predict(X_test)

In [None]:
# import library evaluation
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, accuracy_score

In [None]:
# Report each metric on the test split: heading on one line, value on the next.
metric_reports = (
    ("f1 score hasil prediksi adalah: ", f1_score),
    ("accuracy score hasil prediksi adalah: ", accuracy_score),
    ("precision score hasil prediksi adalah: ", precision_score),
    ("recall score hasil prediksi adalah: ", recall_score),
)

for heading, metric in metric_reports:
    print(heading)
    print(metric(y_test, predict))

In [None]:
# Confusion matrix on the test split, flattened and unpacked into four counts.
# NOTE(review): the tn, fp, fn, tp order assumes a 2x2 matrix with labels sorted as [0, 1] — confirm.
tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
tn, fp, fn, tp

In [None]:
# Two hand-written reviews (one intended as positive, one as negative) for a manual check.
review_positif = "kualitas brand milik toko liverpool memang mantap banget pokoknya"
review_negatif = "respon toko tim sebelah kok jelek banget sih"

In [None]:
# Predict the label of the hand-written positive review (1 = 5-star class).
clf.predict(preprocess_data(review_positif))

In [None]:
# Predict the label of the hand-written negative review (0 = not 5-star).
clf.predict(preprocess_data(review_negatif))