In [1]:
import pandas as pd
import requests
import json
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [6]:
def formalizer(string):
    req = requests.post("http://127.0.0.1:9000/formalizer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print(response)
        return None
    
def stemmer(string):
    req = requests.post("http://127.0.0.1:9000/stemmer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print(response)
        return None
    
def tokenizer(string):
    req = requests.post("http://127.0.0.1:9000/sentence/tokenizer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print(response)
        return None

In [2]:
data1 = pd.read_csv('label/review1.csv')
data2 = pd.read_csv('label/review5.csv')
data3 = pd.read_csv('label/review1_2.csv')
data4 = pd.read_csv('label/review2_2.csv')
data5 = pd.read_csv('label/review3_2.csv')
data6 = pd.read_csv('label/review5_2.csv')

datal = [data1, data2, data3, data4, data5, data6]
data = pd.concat(datal)
data = data[['review', 'produk', 'packaging', 'pengiriman', 'general']].values.tolist()

## Getting aspect

In [5]:
data_aspect = []
for d in data:
    d_aspect = [d[0]] + [1 if aspect != 0 else 0 for aspect in d[1:]]
    data_aspect.append(d_aspect)

In [7]:
preprocessed_data = []
for d in data:
    formalized_data = formalizer(d[0])
    stemmed_data = stemmer(formalized_data)
    preprocessed_data.append(stemmed_data)

In [8]:
y = [d[1:] for d in data_aspect]

train_x, test_x, train_y, test_y = train_test_split(preprocessed_data, y, test_size= 0.2)

In [9]:
pipeline = Pipeline([
                ('vectorize', CountVectorizer(ngram_range= (1,2))),
                ('clf', LinearSVC())
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)

Classification report for produk
              precision    recall  f1-score   support

           0       0.74      0.67      0.70        21
           1       0.84      0.88      0.86        43

   micro avg       0.81      0.81      0.81        64
   macro avg       0.79      0.78      0.78        64
weighted avg       0.81      0.81      0.81        64

Classification report for packaging
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        46
           1       0.78      0.78      0.78        18

   micro avg       0.88      0.88      0.88        64
   macro avg       0.85      0.85      0.85        64
weighted avg       0.88      0.88      0.88        64

Classification report for pengiriman
              precision    recall  f1-score   support

           0       0.82      0.86      0.84        37
           1       0.80      0.74      0.77        27

   micro avg       0.81      0.81      0.81        64
   macro avg       0.8

In [10]:
vectorizer = CountVectorizer(min_df= 1)
X = vectorizer.fit_transform(preprocessed_data).toarray()
analyzer = vectorizer.build_analyzer()

tokenized = [list(map(lambda x: vectorizer.vocabulary_.get(x), analyzer(line))) for line in preprocessed_data]

y = [d[1:] for d in data_aspect]

train_x, test_x, train_y, test_y = train_test_split(tokenized, y, test_size= 0.2)

In [11]:
from keras.preprocessing import sequence

max_words = 40
train_x = sequence.pad_sequences(train_x, maxlen= max_words)
test_x = sequence.pad_sequences(test_x, maxlen= max_words)

Using TensorFlow backend.


In [17]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils.np_utils import to_categorical

embedding_size = 64
model = Sequential()
model.add(Embedding(len(vectorizer.vocabulary_), embedding_size, input_length= max_words))
model.add(LSTM(128))
model.add(Dense(1, activation= 'sigmoid'))

model.compile(loss= 'binary_crossentropy', optimizer= 'adam', metrics= ['accuracy'])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    model.fit(train_x, [t[i] for t in train_y], epochs= 16, verbose= 0)
    scores = model.evaluate(test_x, [t[i] for t in test_y])
    print("Accuracy report for {}: {}".format(categories, scores[1]))

Accuracy report for produk: 0.78125
Accuracy report for packaging: 0.6875
Accuracy report for pengiriman: 0.453125
Accuracy report for general: 0.984375


## Polarization aspect for 'produk'

In [42]:
data_x = []
data_y = []
i = 1
for d, s in zip(preprocessed_data, data):
    if (s[i] != 0):
        data_x.append(d)
        data_y.append(1 if s[i] == 1 else 0)
        
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size= 0.2)

In [43]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', LinearSVC())
            ])

pipeline.fit(train_x, train_y)
prediction = pipeline.predict(test_x)
print("Classification report for {}".format(categories))
report = classification_report(test_y, prediction)
print(report)

Classification report for general
              precision    recall  f1-score   support

           0       1.00      0.67      0.80        15
           1       0.84      1.00      0.92        27

   micro avg       0.88      0.88      0.88        42
   macro avg       0.92      0.83      0.86        42
weighted avg       0.90      0.88      0.87        42



## Polarization aspect for 'packaging'

In [44]:
data_x = []
data_y = []
i = 2
for d, s in zip(preprocessed_data, data):
    if (s[i] != 0):
        data_x.append(d)
        data_y.append(1 if s[i] == 1 else 0)
        
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size= 0.2)

In [46]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', LinearSVC())
            ])

pipeline.fit(train_x, train_y)
prediction = pipeline.predict(test_x)
print("Classification report for {}".format(categories))
report = classification_report(test_y, prediction)
print(report)

Classification report for general
              precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.86      0.92      0.89        13

   micro avg       0.86      0.86      0.86        21
   macro avg       0.86      0.84      0.84        21
weighted avg       0.86      0.86      0.86        21



## Polarization aspect for 'pengiriman'

In [47]:
data_x = []
data_y = []
i = 3
for d, s in zip(preprocessed_data, data):
    if (s[i] != 0):
        data_x.append(d)
        data_y.append(1 if s[i] == 1 else 0)
        
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size= 0.2)

In [48]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', LinearSVC())
            ])

pipeline.fit(train_x, train_y)
prediction = pipeline.predict(test_x)
print("Classification report for {}".format(categories))
report = classification_report(test_y, prediction)
print(report)

Classification report for general
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.88      1.00      0.94        15

   micro avg       0.93      0.93      0.93        28
   macro avg       0.94      0.92      0.93        28
weighted avg       0.94      0.93      0.93        28

