In [1]:
import pandas as pd
import requests
import json
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.externals import joblib

In [48]:
def formalizer(string):
    req = requests.post("http://127.0.0.1:9000/formalizer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print('formalizer ' + str(response) + str(string))
        return None
    
def stemmer(string):
    req = requests.post("http://127.0.0.1:9000/stemmer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print('stemmer ' + str(response))
        return None
    
def stopwords_removal(string):
    req = requests.post("http://127.0.0.1:9000/stopwords", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print('stopwords ' + str(response))
        return None
    
def tokenizer(string):
    req = requests.post("http://127.0.0.1:9000/sentence/tokenizer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print(response)
        return None

In [122]:
data1 = pd.read_csv('label/review1.csv')
data2 = pd.read_csv('label/review5.csv')
data3 = pd.read_csv('label/review1_2.csv')
data4 = pd.read_csv('label/review2_2.csv')
data5 = pd.read_csv('label/review3_2.csv')
data6 = pd.read_csv('label/review5_2.csv')
data7 = pd.read_csv('label/review1_3.csv')
data8 = pd.read_csv('label/review2_3.csv')
data9 = pd.read_csv('label/review3_3.csv')
data10 = pd.read_csv('label/review5_3.csv')

datal = [data1, data2, data3, data4, data5, data6, data7, data8, data9, data10]
data = pd.concat(datal)
data = data[['review', 'produk', 'packaging', 'pengiriman', 'general']].values.tolist()

## Getting aspect

In [123]:
data_aspect = []
for d in data:
    d_aspect = [d[0]] + [1 if aspect != 0 else 0 for aspect in d[1:]]
    data_aspect.append(d_aspect)

In [124]:
preprocessed_data = []
for d in data:
    formalized_data = formalizer(d[0])
    removed_data = stopwords_removal(formalized_data)
    stemmed_data = stemmer(removed_data)
    preprocessed_data.append(stemmed_data)

### Using SVC

In [133]:
y = [d[1:] for d in data_aspect]

train_x, test_x, train_y, test_y = train_test_split(preprocessed_data, y, test_size= 0.1)

In [134]:
pipeline = Pipeline([
                ('vectorize', CountVectorizer(ngram_range= (1,2))),
                ('clf', LinearSVC())
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)
    joblib.dump(pipeline, categories + "_aspect.sav")

Classification report for produk
              precision    recall  f1-score   support

           0       0.83      0.56      0.67        18
           1       0.83      0.95      0.88        40

   micro avg       0.83      0.83      0.83        58
   macro avg       0.83      0.75      0.78        58
weighted avg       0.83      0.83      0.82        58

Classification report for packaging
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        43
           1       0.83      0.67      0.74        15

   micro avg       0.88      0.88      0.88        58
   macro avg       0.86      0.81      0.83        58
weighted avg       0.88      0.88      0.87        58

Classification report for pengiriman
              precision    recall  f1-score   support

           0       0.88      0.91      0.89        32
           1       0.88      0.85      0.86        26

   micro avg       0.88      0.88      0.88        58
   macro avg       0.8

  'precision', 'predicted', average, warn_for)


### Using LSTM

In [80]:
vectorizer = CountVectorizer(min_df= 1)
X = vectorizer.fit_transform(preprocessed_data).toarray()
analyzer = vectorizer.build_analyzer()

tokenized = [list(map(lambda x: vectorizer.vocabulary_.get(x), analyzer(line))) for line in preprocessed_data]

y = [d[1:] for d in data_aspect]

train_x, test_x, train_y, test_y = train_test_split(tokenized, y, test_size= 0.2)

In [81]:
from keras.preprocessing import sequence

max_words = 40
train_x = sequence.pad_sequences(train_x, maxlen= max_words)
test_x = sequence.pad_sequences(test_x, maxlen= max_words)

In [82]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils.np_utils import to_categorical

embedding_size = 64
model = Sequential()
model.add(Embedding(len(vectorizer.vocabulary_), embedding_size, input_length= max_words))
model.add(LSTM(128))
model.add(Dense(1, activation= 'sigmoid'))

model.compile(loss= 'binary_crossentropy', optimizer= 'adam', metrics= ['accuracy'])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    model.fit(train_x, [t[i] for t in train_y], epochs= 16, verbose= 0)
    scores = model.evaluate(test_x, [t[i] for t in test_y])
    print("Accuracy report for {}: {}".format(categories, scores[1]))

Accuracy report for produk: 0.6608695610709813
Accuracy report for packaging: 0.643478260351264
Accuracy report for pengiriman: 0.5739130429599596
Accuracy report for general: 0.756521737575531


### Using Doc2Vec + SVC 

In [18]:
y = [d[1:] for d in data_aspect]

train_x, test_x, train_y, test_y = train_test_split(preprocessed_data, y, test_size= 0.1)

In [19]:
pipeline = Pipeline([
                ('clf', LogisticRegression())
            ])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    pipeline.fit(train_x, [t[i] for t in train_y])
    prediction = pipeline.predict(test_x)
    print("Classification report for {}".format(categories))
    report = classification_report([t[i] for t in test_y], prediction)
    print(report)

Classification report for produk
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.56      1.00      0.72        18

   micro avg       0.56      0.56      0.56        32
   macro avg       0.28      0.50      0.36        32
weighted avg       0.32      0.56      0.40        32

Classification report for packaging
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        23
           1       0.00      0.00      0.00         9

   micro avg       0.72      0.72      0.72        32
   macro avg       0.36      0.50      0.42        32
weighted avg       0.52      0.72      0.60        32

Classification report for pengiriman
              precision    recall  f1-score   support

           0       0.55      0.94      0.70        17
           1       0.67      0.13      0.22        15

   micro avg       0.56      0.56      0.56        32
   macro avg       0.6

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils.np_utils import to_categorical

vocab_size, embedding_size = docvec_weight.shape
model = Sequential()
model.add(Embedding(input_dim= vocab_size, output_dim= embedding_size, weights= [docvec_weight]))
model.add(LSTM(128))
model.add(Dense(1, activation= 'sigmoid'))

model.compile(loss= 'binary_crossentropy', optimizer= 'adam', metrics= ['accuracy'])

for i, categories in enumerate(['produk', 'packaging', 'pengiriman', 'general']):
    model.fit(train_x, [t[i] for t in train_y], epochs= 16, verbose= 0)
    scores = model.evaluate(test_x, [t[i] for t in test_y])
    print("Accuracy report for {}: {}".format(categories, scores[1]))

## Polarization aspect for 'produk'

In [135]:
data_x = []
data_y = []
i = 1
for d, s in zip(preprocessed_data, data):
    if (s[i] != 0):
        data_x.append(d)
        data_y.append(1 if s[i] == 1 else 0)
        
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size= 0.1)

In [136]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', LinearSVC())
            ])

pipeline.fit(train_x, train_y)
prediction = pipeline.predict(test_x)
print("Classification report for {}".format(categories))
report = classification_report(test_y, prediction)
print(report)
joblib.dump(pipeline, "produk_sentiment.sav")

Classification report for general
              precision    recall  f1-score   support

           0       0.92      0.85      0.88        13
           1       0.93      0.96      0.95        27

   micro avg       0.93      0.93      0.93        40
   macro avg       0.92      0.90      0.91        40
weighted avg       0.92      0.93      0.92        40



['produk_sentiment.sav']

## Polarization aspect for 'packaging'

In [137]:
data_x = []
data_y = []
i = 2
for d, s in zip(preprocessed_data, data):
    if (s[i] != 0):
        data_x.append(d)
        data_y.append(1 if s[i] == 1 else 0)
        
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size= 0.1)

In [138]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', LinearSVC())
            ])

pipeline.fit(train_x, train_y)
prediction = pipeline.predict(test_x)
print("Classification report for {}".format(categories))
report = classification_report(test_y, prediction)
print(report)
joblib.dump(pipeline, "packaging_sentiment.sav")

Classification report for general
              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.90      0.90      0.90        10

   micro avg       0.88      0.88      0.88        17
   macro avg       0.88      0.88      0.88        17
weighted avg       0.88      0.88      0.88        17



['packaging_sentiment.sav']

## Polarization aspect for 'pengiriman'

In [139]:
data_x = []
data_y = []
i = 3
for d, s in zip(preprocessed_data, data):
    if (s[i] != 0):
        data_x.append(d)
        data_y.append(1 if s[i] == 1 else 0)
        
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size= 0.1)

In [140]:
pipeline = Pipeline([
                ('vectorize', TfidfVectorizer(ngram_range= (1,2))),
                ('clf', LinearSVC())
            ])

pipeline.fit(train_x, train_y)
prediction = pipeline.predict(test_x)
print("Classification report for {}".format(categories))
report = classification_report(test_y, prediction)
print(report)
joblib.dump(pipeline, "pengiriman_sentiment.sav")

Classification report for general
              precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.88      0.93      0.90        15

   micro avg       0.87      0.87      0.87        23
   macro avg       0.87      0.84      0.85        23
weighted avg       0.87      0.87      0.87        23



['pengiriman_sentiment.sav']