In [2]:
import os
import cv2
import imutils
import nltk 
import string
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imutils import paths
from keras.models import Sequential
from keras.layers import Conv2D, Activation, MaxPooling2D, Dense
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split, KFold, cross_val_score,cross_val_predict
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from contractions import CONTRACTIONS_DICT 
from Convert_Negation import CONVERT_NEGATION
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold


### Preprocessing


In [3]:
def NER(review):
    for i in range(len(review)):
        text = review.Review.iloc[i]
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label') and chunk.label:
                    if chunk.label() == 'ORGANIZATION' or  chunk.label() == 'PERSON' or  chunk.label() == 'DATE' or  chunk.label() == 'LOCATION':
                        name_value = ' '.join(child[0] for child in chunk.leaves())
                        text = text.replace(name_value, "")
                        review.Review.iloc[i] = text
    return review

def case_folding(review):
    for i in range(len(review)):
        text = review.Review[i].lower()
        review.Review.iloc[i] = text 
    return review

def expand_contraction(review):
    contractions_re = re.compile('(%s)' % '|'.join(CONTRACTIONS_DICT.keys()))
    d = {}
    index=0
    for i in review.Review:
        text = i
        def replace(match):
            return CONTRACTIONS_DICT[match.group(0)]
        text = contractions_re.sub(replace, text)
        #review = review.replace(text,"")
        review.Review[index] = text
        index+=1
    return review

def convert_negation(review):
    convertNegation_re = re.compile('(%s)' % '|'.join(CONVERT_NEGATION.keys()))
    d = {}
    index=0
    for i in review.Review:
        text = i
        def replace(match):
            return CONVERT_NEGATION[match.group(0)]
        text = convertNegation_re.sub(replace, text)
        review.Review[index] = text
        index+=1
    return review

def remove_punctuation(review):
    remove = string.punctuation
    for i in range(len(review)):
        for kata in remove:
            text = review.Review[i].replace(kata,"")
            review.Review.iloc[i] = text 
    return review

def stop_removal(review):
    from nltk.tokenize import sent_tokenize, word_tokenize
    cachedStopWords = set(stopwords.words("english"))
    for i in range(len(review)):
        text = review.Review.iloc[i]
        teks =" ".join([word for word in text.split() if word not in cachedStopWords])
        review.Review.iloc[i] = teks
    return review

def stemming(review):
    ps = PorterStemmer()
    for i in range(len(review)):
        text = review.Review.iloc[i]
        text = ps.stem(text)
        review.Review.iloc[i] = text
    return review

def lemmatization (review):
    lm = WordNetLemmatizer()
    for i in range(len(review)):
        text = review.Review.iloc[i]
        text = lm.lemmatize(text)
        review.Review.iloc[i] = text
    return review

def preprocessing_data(review):
    hasil_ner = NER(review)
    hasil_case_folding = case_folding(hasil_ner)
    hasil_convert_negation = convert_negation(hasil_case_folding)
    hasil_expand = expand_contraction(hasil_convert_negation)
    hasil_remove_punctuation = remove_punctuation(hasil_expand)
    hasil_stop_removal = stop_removal(hasil_remove_punctuation)
    hasil_stemming = stemming( hasil_stop_removal)
    hasil_lemmatization = lemmatization(hasil_stemming)
    return hasil_lemmatization

In [3]:
#### load SVM model for klasifikasi
filename1= 'model_sentimen 4.pkl'
loaded_model = pickle.load(open(filename1, 'rb'))
# ##########################################################################################################
#### load SVM feature for klasifikasi
filename1='feature.pkl'
loaded_feature = pickle.load(open(filename1, 'rb'))

In [4]:
data_review_buku = pd.read_excel('E:/_KEPERLUAN FINAL TA/TASI-14_REVISI_FIKS/data_review/Data_review_buku.xlsx')

In [5]:
data_review_buku.head(10)

Unnamed: 0,Buku,Review,Rating,Kategori
0,1,I was disgusted by the author’s shameless atte...,2.0,Best Seller
1,1,I am familiar with this tragedy. I found this ...,1.0,Best Seller
2,1,It is very sad to see the authors of this book...,1.0,Best Seller
3,2,Who would write this garbage,1.0,Best Seller
4,2,I HIGHLY doubt they would have slit anyone's ...,1.0,Best Seller
5,2,Very sad they just wanted to make a quick buck.,1.0,Best Seller
6,2,Do not buy or read this trash. My father was a...,1.0,Best Seller
7,2,This is just disrespectful to our soldiers.,1.0,Best Seller
8,3,"If you only read one book this year, this is t...",5.0,Best Seller
9,3,I beg you...get your hands on a copy of this book,5.0,Best Seller


In [6]:
best  = len([x for x in data_review_buku['Kategori'] if x == 'Best Seller'])
non_best = len([x for x in data_review_buku['Kategori'] if x == 'Non Best Seller'])

best, non_best

(1386, 1068)

In [11]:
data_clean_review_buku = preprocessing_data(data_review_buku)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
import re

pattern=r'[^a-zA-Z]'

for i in range(len(data_clean_review_buku)):
    data_clean_review_buku['Review'].iloc[i] = re.sub(pattern,' ', data_clean_review_buku['Review'].iloc[i], flags=re.MULTILINE)
data_clean_review_buku.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Buku,Review,Rating,Kategori
0,1,disgusted author s shameless attempt use trage...,2.0,Best Seller
1,1,familiar tragedy found writing tragedy superfici,1.0,Best Seller
2,1,sad see authors book taken personal stories su...,1.0,Best Seller
3,2,would write garbag,1.0,Best Seller
4,2,highly doubt would slit anyones throat drink b...,1.0,Best Seller
5,2,sad wanted make quick buck,1.0,Best Seller
6,2,bad read trash father survivor tragedy publish...,1.0,Best Seller
7,2,disrespectful soldi,1.0,Best Seller
8,3,read one book year on,5.0,Best Seller
9,3,beg youget hands copy book,5.0,Best Seller


In [14]:
hasil_preprocessing = data_clean_review_buku.to_excel('datareviewbuku_after_preproced.xlsx', encoding='utf-8')

In [7]:
data_clean_review = pd.read_excel('E:/_KEPERLUAN FINAL TA/TASI-14_REVISI_FIKS/datareviewbuku_after_preproced.xlsx')

In [8]:
data_clean_review

Unnamed: 0,Buku,Review,Rating,Kategori
0,1,disgusted author s shameless attempt use trage...,2.0,Best Seller
1,1,familiar tragedy found writing tragedy superfici,1.0,Best Seller
2,1,sad see authors book taken personal stories su...,1.0,Best Seller
3,2,would write garbag,1.0,Best Seller
4,2,highly doubt would slit anyones throat drink b...,1.0,Best Seller
5,2,sad wanted make quick buck,1.0,Best Seller
6,2,bad read trash father survivor tragedy publish...,1.0,Best Seller
7,2,disrespectful soldi,1.0,Best Seller
8,3,read one book year on,5.0,Best Seller
9,3,beg youget hands copy book,5.0,Best Seller


In [10]:
arr_data = []
for index, row in data_clean_review.iterrows():
    arr_data.append([row['Buku'],row['Kategori']])

In [12]:
feature_extraction = loaded_feature.transform(data_clean_review.Review.values.astype('U'))
predicted = loaded_model.predict_proba(feature_extraction)       
score =[]
for i in predicted:
    score.append(i)

sentimen_score = []
for i in range(len(score)):
    sentimen_score.append(score [i][1])
     

In [13]:
for i in range(len(sentimen_score)):
    data = arr_data[i]
    data.append(sentimen_score[i])
    arr_data[i] = data

In [14]:
temp_book = ''
i = 0
for j in range(len(arr_data)):
    temp_score = 0.0
    if(temp_book == arr_data[j][0]):
        i+=1
    else:
        for k in range(j-i, j):
            temp_score +=arr_data[k][2]
        if(temp_score > 0.0):
            data = arr_data[j-1]
            data.append(temp_score/i)
            arr_data[j-1] = data
        i = 1
    if(j == len(arr_data)-1):
        for k in range(j-i+1, j+1):
            temp_score +=arr_data[k][2]
        if(temp_score > 0.0):
            data = arr_data[j]
            data.append(temp_score/i)
            arr_data[j] = data
    temp_book = arr_data[j][0]
      

In [15]:
data_fitur_sentimen = [x for x in arr_data if(len(x)>3)]
for i in range(len(data_fitur_sentimen)):
    data = data_fitur_sentimen[i][:2]
    data.append(data_fitur_sentimen[i][3])
    data_fitur_sentimen[i] = data

In [16]:
fitur_sentimen = pd.DataFrame(data_fitur_sentimen)
label = fitur_sentimen[1]
score = fitur_sentimen[2]

In [17]:
fitur_sentimen

Unnamed: 0,0,1,2
0,1,Best Seller,0.538153
1,2,Best Seller,0.163193
2,3,Best Seller,0.901985
3,4,Best Seller,0.901863
4,5,Best Seller,0.586741
5,6,Best Seller,0.291871
6,7,Best Seller,0.572761
7,8,Best Seller,0.313185
8,9,Best Seller,0.580751
9,10,Best Seller,0.586365


In [18]:
category_encoder = LabelEncoder()
y = category_encoder.fit_transform(label)

In [19]:
fitur_sentimen = np.array(score)
fitur_sentimen = fitur_sentimen.reshape(-1, 1)
C = 1.0  # SVM regularization parameter
model_sentimen = SVC(kernel='linear', C=C)

p = cross_val_score(model_sentimen, fitur_sentimen, y, cv=10, scoring= 'precision')
precision_sentimen_feature = (sum(p)/len(p))
r = cross_val_score(model_sentimen, fitur_sentimen, y, cv=10, scoring='recall')
recall_sentimen_feature = (sum(r)/len(r)) 
f = cross_val_score(model_sentimen, fitur_sentimen, y, cv=10, scoring='f1')
f_measure_sentimen_feature = (sum(f)/len(f)) 


In [20]:
precision_sentimen_feature,recall_sentimen_feature,f_measure_sentimen_feature

(0.48135641065086487, 0.39714285714285719, 0.42237611147877291)