In [1]:
import pandas as pd
import requests
import json
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [2]:
def formalizer(string):
    req = requests.post("http://127.0.0.1:9000/formalizer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print(response)
        return None
    
def stemmer(string):
    req = requests.post("http://127.0.0.1:9000/stemmer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print(response)
        return None
    
def tokenizer(string):
    req = requests.post("http://127.0.0.1:9000/sentence/tokenizer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print(response)
        return None

In [3]:
data1 = pd.read_csv('label/review1.csv')
data2 = pd.read_csv('label/review5.csv')
data3 = pd.read_csv('label/review1_2.csv')
data4 = pd.read_csv('label/review2_2.csv')
data5 = pd.read_csv('label/review3_2.csv')
data6 = pd.read_csv('label/review5_2.csv')

datal = [data1, data2, data3, data4, data5, data6]
data = pd.concat(datal)
data = data[['review', 'produk', 'packaging', 'pengiriman', 'general']].values.tolist()

In [4]:
preprocessed_data = []
for d in data:
    formalized_data = formalizer(d[0])
    stemmed_data = stemmer(formalized_data)
    preprocessed_data.append(stemmed_data)

In [5]:
y = [d[1:] for d in data]

train_x, test_x, train_y, test_y = train_test_split(preprocessed_data, y, test_size= 0.2)

In [12]:
vectorizer = CountVectorizer(min_df= 1)
X = vectorizer.fit_transform(preprocessed_data).toarray()
analyzer = vectorizer.build_analyzer()

tokenized = [list(map(lambda x: vectorizer.vocabulary_.get(x), analyzer(line))) for line in preprocessed_data]

y = [d[1:] for d in data]

train_x, test_x, train_y, test_y = train_test_split(tokenized, y, test_size= 0.2)

In [17]:
len(vectorizer.vocabulary_)

1145

In [15]:
from keras.preprocessing import sequence

max_words = 40
train_x = sequence.pad_sequences(train_x, maxlen= max_words)
test_x = sequence.pad_sequences(test_x, maxlen= max_words)

In [34]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils.np_utils import to_categorical

embedding_size = 32
model = Sequential()
model.add(Embedding(len(vectorizer.vocabulary_), embedding_size, input_length= max_words))
model.add(LSTM(64))
model.add(Dense(3, activation= 'sigmoid'))

model.compile(loss= 'categorical_crossentropy', optimizer= 'adam', metrics= ['accuracy'])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    model.fit(train_x, to_categorical([t[i] for t in train_y], num_classes= 3), epochs= 16, verbose= 0)
    scores = model.evaluate(test_x, to_categorical([t[i] for t in test_y], num_classes= 3))
    print("Accuracy report for {}: {}".format(categories, scores[1]))

Accuracy report for produk: 0.609375
Accuracy report for packaging: 0.625
Accuracy report for pengiriman: 0.6875
Accuracy report for general: 0.953125


In [7]:
pipeline = Pipeline([
                ('vectorize', CountVectorizer(ngram_range= (1,2))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter= 300), n_jobs=1)),
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)

Classification report for produk
              precision    recall  f1-score   support

          -1       0.69      0.69      0.69        13
           0       0.62      0.67      0.64        24
           1       0.72      0.67      0.69        27

   micro avg       0.67      0.67      0.67        64
   macro avg       0.68      0.68      0.67        64
weighted avg       0.68      0.67      0.67        64

Classification report for packaging
              precision    recall  f1-score   support

          -1       1.00      0.25      0.40        12
           0       0.71      1.00      0.83        40
           1       1.00      0.42      0.59        12

   micro avg       0.75      0.75      0.75        64
   macro avg       0.90      0.56      0.61        64
weighted avg       0.82      0.75      0.71        64

Classification report for pengiriman
              precision    recall  f1-score   support

          -1       1.00      0.31      0.47        13
           0       0.69

  'precision', 'predicted', average, warn_for)


In [8]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter= 300), n_jobs=1)),
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)

Classification report for produk
              precision    recall  f1-score   support

          -1       0.69      0.69      0.69        13
           0       0.72      0.54      0.62        24
           1       0.67      0.81      0.73        27

   micro avg       0.69      0.69      0.69        64
   macro avg       0.69      0.68      0.68        64
weighted avg       0.69      0.69      0.68        64

Classification report for packaging
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        12
           0       0.62      1.00      0.77        40
           1       0.00      0.00      0.00        12

   micro avg       0.62      0.62      0.62        64
   macro avg       0.21      0.33      0.26        64
weighted avg       0.39      0.62      0.48        64

Classification report for pengiriman
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        13
           0       0.54

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [9]:
pipeline = Pipeline([
                ('vectorize', CountVectorizer(ngram_range= (1,2))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)

Classification report for produk
              precision    recall  f1-score   support

          -1       0.75      0.69      0.72        13
           0       0.69      0.75      0.72        24
           1       0.73      0.70      0.72        27

   micro avg       0.72      0.72      0.72        64
   macro avg       0.72      0.72      0.72        64
weighted avg       0.72      0.72      0.72        64

Classification report for packaging
              precision    recall  f1-score   support

          -1       1.00      0.25      0.40        12
           0       0.73      1.00      0.84        40
           1       1.00      0.50      0.67        12

   micro avg       0.77      0.77      0.77        64
   macro avg       0.91      0.58      0.64        64
weighted avg       0.83      0.77      0.73        64

Classification report for pengiriman
              precision    recall  f1-score   support

          -1       1.00      0.54      0.70        13
           0       0.75

  'precision', 'predicted', average, warn_for)


In [10]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)

Classification report for produk
              precision    recall  f1-score   support

          -1       0.63      0.92      0.75        13
           0       0.71      0.62      0.67        24
           1       0.75      0.67      0.71        27

   micro avg       0.70      0.70      0.70        64
   macro avg       0.70      0.74      0.71        64
weighted avg       0.71      0.70      0.70        64

Classification report for packaging
              precision    recall  f1-score   support

          -1       1.00      0.25      0.40        12
           0       0.73      1.00      0.84        40
           1       1.00      0.50      0.67        12

   micro avg       0.77      0.77      0.77        64
   macro avg       0.91      0.58      0.64        64
weighted avg       0.83      0.77      0.73        64

Classification report for pengiriman
              precision    recall  f1-score   support

          -1       0.83      0.38      0.53        13
           0       0.71

  'precision', 'predicted', average, warn_for)


In [11]:
pipeline = Pipeline([
                ('vectorize', CountVectorizer(ngram_range= (1,2))),
                ('clf', OneVsRestClassifier(MultinomialNB(), n_jobs=1)),
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)

Classification report for produk
              precision    recall  f1-score   support

          -1       0.61      0.85      0.71        13
           0       0.71      0.62      0.67        24
           1       0.80      0.74      0.77        27

   micro avg       0.72      0.72      0.72        64
   macro avg       0.71      0.74      0.72        64
weighted avg       0.73      0.72      0.72        64

Classification report for packaging
              precision    recall  f1-score   support

          -1       1.00      0.33      0.50        12
           0       0.74      1.00      0.85        40
           1       1.00      0.50      0.67        12

   micro avg       0.78      0.78      0.78        64
   macro avg       0.91      0.61      0.67        64
weighted avg       0.84      0.78      0.75        64

Classification report for pengiriman
              precision    recall  f1-score   support

          -1       0.80      0.31      0.44        13
           0       0.65

  'precision', 'predicted', average, warn_for)


In [12]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)

Classification report for produk
              precision    recall  f1-score   support

          -1       0.63      0.92      0.75        13
           0       0.71      0.62      0.67        24
           1       0.75      0.67      0.71        27

   micro avg       0.70      0.70      0.70        64
   macro avg       0.70      0.74      0.71        64
weighted avg       0.71      0.70      0.70        64

Classification report for packaging
              precision    recall  f1-score   support

          -1       1.00      0.25      0.40        12
           0       0.73      1.00      0.84        40
           1       1.00      0.50      0.67        12

   micro avg       0.77      0.77      0.77        64
   macro avg       0.91      0.58      0.64        64
weighted avg       0.83      0.77      0.73        64

Classification report for pengiriman
              precision    recall  f1-score   support

          -1       0.83      0.38      0.53        13
           0       0.71

  'precision', 'predicted', average, warn_for)
