In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree
import numpy as np
import warnings
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag
from nltk import word_tokenize

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('stopwords')

#Spacy
import spacy
nlp = spacy.load('en_core_web_sm')

# Other
import re
import json
import string
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

#Keras
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense, Activation

[nltk_data] Downloading package stopwords to /home/sirius/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [3]:
def _join_opinion_attribute(opinions, name):
    """Concatenate attribute `name` of every opinion, each value preceded by one space.

    An opinion that lacks the attribute contributes an empty value instead of
    raising ``TypeError`` on ``" " + None``.
    """
    return "".join(" " + opinion.get(name, "") for opinion in opinions)


def parse_sentence_level(path):
    """Parse an ABSA review XML file into parallel, sentence-level lists.

    Every ``Review/sentences/sentence`` element produces exactly one entry in
    each returned list. The attributes of all ``Opinions/Opinion`` children of
    a sentence are concatenated into a single string per attribute, each value
    preceded by one space; a sentence without opinions yields ``""``.

    Args:
        path: File name or file object accepted by ``ET.parse``.

    Returns:
        Tuple ``(sent_id, sent_text, opinion_target, opinion_category,
        opinion_polarity)`` of equally long lists. ``sent_text`` holds ``None``
        for a sentence whose ``<text>`` child is missing or empty.
    """
    root = ET.parse(path).getroot()
    print(root)
    sent_id = []
    sent_text = []
    opinion_target = []
    opinion_category = []
    opinion_polarity = []
    for review in root.findall('Review'):
        for sent in review.findall('./sentences/sentence'):
            sent_id.append(sent.get('id'))
            # Guard against a missing <text> child instead of failing on None.text.
            text_node = sent.find('text')
            sent_text.append(text_node.text if text_node is not None else None)
            opinions = sent.findall('./Opinions/Opinion')
            opinion_target.append(_join_opinion_attribute(opinions, 'target'))
            opinion_category.append(_join_opinion_attribute(opinions, 'category'))
            opinion_polarity.append(_join_opinion_attribute(opinions, 'polarity'))
    return sent_id, sent_text, opinion_target, opinion_category, opinion_polarity

In [2]:
#xml parser
def get_list(path):
    """Parse a review-level ABSA XML file.

    Args:
        path: File name or file object accepted by ``ET.parse``.

    Returns:
        Tuple ``(text_list, opinion_list)`` with one entry per ``Review``:
        the texts of all its sentences concatenated (each preceded by one
        space), and a list of single-entry dicts mapping the opinion category
        (with ``#`` replaced by ``_``) to its polarity, built from the
        review's own ``Opinions/Opinion`` children.
    """
    reviews = ET.parse(path).getroot().findall('Review')

    # One concatenated string per review; every sentence is prefixed by a space.
    text_list = [
        "".join(
            " " + sentence.find('text').text
            for sentence in review.findall('./sentences/sentence')
        )
        for review in reviews
    ]

    # One list of {category: polarity} dicts per review.
    opinion_list = [
        [
            {opinion.get('category').replace('#', '_'): opinion.get('polarity')}
            for opinion in review.findall('./Opinions/Opinion')
        ]
        for review in reviews
    ]
    return text_list, opinion_list

In [39]:
def _split_on_connector(text, num_parts):
    """Split `text` into exactly `num_parts` clauses, or return None.

    The connectors "and", "but" and "," are tried in that order and the first
    one that yields exactly `num_parts` pieces wins. "and"/"but" are matched
    as whole words so that words such as "sandwich" or "butter" are not cut
    in half (a plain ``str.split("and")`` would do that).
    """
    for pattern in (r"\band\b", r"\bbut\b", ","):
        pieces = re.split(pattern, text)
        if len(pieces) == num_parts:
            return pieces
    return None


def parse_sentence_level2(path):
    """Parse an ABSA review XML file into one row per (sentence, opinion).

    * A sentence with several ``Opinion`` elements is expanded into one row
      per opinion. The row id is the sentence id plus a letter suffix
      (``a``, ``b``, ...). If the sentence text can be split into as many
      clauses as there are opinions, the i-th clause becomes the text of the
      i-th row; otherwise every row keeps the full sentence text.
    * A sentence with exactly one opinion yields one row with that opinion's
      attributes.
    * A sentence without opinions yields one row whose target, category and
      polarity are empty strings.

    A sentence that cannot be processed (for example a missing ``<text>``
    child) is reported on stdout and skipped without adding partial rows.

    Args:
        path: File name or file object accepted by ``ET.parse``.

    Returns:
        Tuple ``(sent_id, sent_text, opinion_target, opinion_category,
        opinion_polarity)`` of equally long lists.
    """
    root = ET.parse(path).getroot()
    print(root)
    sent_id = []
    sent_text = []
    opinion_target = []
    opinion_category = []
    opinion_polarity = []
    for review in root.findall('Review'):
        for sent in review.findall('./sentences/sentence'):
            try:
                sid = sent.get('id')
                text = sent.find('text').text
                opinions = sent.findall('./Opinions/Opinion')
                if len(opinions) > 1:
                    # The split depends only on the sentence, so compute it once.
                    clauses = _split_on_connector(text, len(opinions))
                    rows = [
                        (
                            sid + chr(97 + i),  # 97 == ord('a')
                            clauses[i] if clauses else text,
                            opinion.get('target'),
                            opinion.get('category'),
                            opinion.get('polarity'),
                        )
                        for i, opinion in enumerate(opinions)
                    ]
                elif opinions:
                    only = opinions[0]
                    rows = [(sid, text, only.get('target'),
                             only.get('category'), only.get('polarity'))]
                else:
                    rows = [(sid, text, "", "", "")]
            except Exception as exc:
                # Best effort: report the offending sentence and the reason, then go on.
                print("skipped sentence", sent.get('id'), repr(exc))
                continue
            # Append only after the whole sentence was processed successfully,
            # so the five lists can never get out of step.
            for row_id, row_text, target, category, polarity in rows:
                sent_id.append(row_id)
                sent_text.append(row_text)
                opinion_target.append(target)
                opinion_category.append(category)
                opinion_polarity.append(polarity)
    return sent_id, sent_text, opinion_target, opinion_category, opinion_polarity

In [40]:
# Parse the restaurant training XML into five parallel lists
# (one entry per sentence/opinion row).
(
    train_sent_id,
    train_text,
    train_opinion_target,
    train_opinion_category,
    train_opinion_polarity,
) = parse_sentence_level2("ABSA16_Restaurants_Train_SB1_v2.xml")

<Element 'Reviews' at 0x7f858093e4d0>


In [41]:
# Assemble the parallel lists into one frame, one column per list.
reviews_train = pd.DataFrame({
    'sentence_id': train_sent_id,
    'text': train_text,
    'aspect_target': train_opinion_target,
    'aspect_category': train_opinion_category,
    'polarity': train_opinion_polarity,
})

In [42]:
# Inspect the assembled training frame (one row per sentence/opinion pair).
reviews_train

Unnamed: 0,sentence_id,text,aspect_target,aspect_category,polarity
0,1004293:0,Judging from previous posts this used to be a ...,place,RESTAURANT#GENERAL,negative
1,1004293:1,"We, there were four of us, arrived at noon - t...",staff,SERVICE#GENERAL,negative
2,1004293:2,"They never brought us complimentary noodles, i...",,SERVICE#GENERAL,negative
3,1004293:3a,The food was lousy - too sweet or too salty,food,FOOD#QUALITY,negative
4,1004293:3b,the portions tiny.,portions,FOOD#STYLE_OPTIONS,negative
...,...,...,...,...,...
2794,FF#10:8,The waitress came to check in on us every few ...,waitress,SERVICE#GENERAL,negative
2795,FF#10:9,I couldn't ignore the fact that she reach over...,,SERVICE#GENERAL,negative
2796,FF#10:10,She then put the check down without asking if ...,,SERVICE#GENERAL,negative
2797,FF#10:11a,"I wish I could like this place more,",place,RESTAURANT#GENERAL,negative


In [43]:
# Rows per aspect category, most frequent first. The blank entry in the output
# is the '' category assigned to sentences that have no <Opinion> element.
print(reviews_train.groupby('aspect_category').size().sort_values(ascending=False))

# Number of distinct categories; the empty '' category is counted as well.
print("number of categories",reviews_train.aspect_category.nunique())

aspect_category
FOOD#QUALITY                849
SERVICE#GENERAL             449
RESTAURANT#GENERAL          422
                            292
AMBIENCE#GENERAL            255
FOOD#STYLE_OPTIONS          137
RESTAURANT#MISCELLANEOUS     98
FOOD#PRICES                  90
RESTAURANT#PRICES            80
DRINKS#QUALITY               47
DRINKS#STYLE_OPTIONS         32
LOCATION#GENERAL             28
DRINKS#PRICES                20
dtype: int64
number of categories 13


In [44]:
# Sanity check: (rows, columns) of the training frame.
reviews_train.shape

(2799, 5)

In [46]:
# Class balance of the polarity labels; '' marks sentences without an opinion.
reviews_train.polarity.value_counts()

positive    1657
negative     749
             292
neutral      101
Name: polarity, dtype: int64

In [47]:
# Export the cleaned frame to Excel; index=False leaves out the row index column.
reviews_train.to_excel("20200204_Cleaned_df_v2.xlsx", index=False)

In [48]:
# Aspect-category classifier: a dense feed-forward network.
# The input width (6000) equals the tokenizer vocabulary size used in this
# notebook, and the 13 softmax outputs equal the number of distinct
# aspect categories reported above.
absa_model = Sequential([
    Dense(512, input_shape=(6000,), activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(13, activation='softmax'),
])
# compile model
absa_model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [49]:
from keras.preprocessing.text import Tokenizer

In [50]:
vocab_size = 6000 # We set a maximum size for the vocabulary
# Fit the vocabulary on the training texts, then encode every text as one
# fixed-width row of a matrix (vocab_size columns, matching the models'
# input_shape). NOTE(review): texts_to_matrix is called with its default
# mode, presumably 'binary' (word present / absent) - confirm against the
# installed Keras version.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(reviews_train.text)
reviews_tokenized = pd.DataFrame(tokenizer.texts_to_matrix(reviews_train.text))

In [51]:
from sklearn.preprocessing import LabelEncoder

In [52]:
from keras.utils import to_categorical

In [53]:
# Map the aspect-category strings to integer ids, then one-hot encode them;
# dummy_category is the target matrix passed to absa_model.fit.
# label_encoder is kept so predictions can be mapped back to category names.
label_encoder = LabelEncoder()
integer_category = label_encoder.fit_transform(reviews_train.aspect_category)
dummy_category = to_categorical(integer_category)

In [54]:
# Distinct polarity labels present in the training data.
reviews_train.polarity.unique()

array(['negative', 'positive', 'neutral', ''], dtype=object)

In [55]:
# Encode the polarity labels first so the output layer can be sized from them
# instead of hard-coding the class count.
# NOTE: '' (sentences without any opinion) is encoded as a class of its own.
label_encoder2 = LabelEncoder()
integer_sentiment = label_encoder2.fit_transform(reviews_train.polarity)
dummy_sentiment = to_categorical(integer_sentiment)

# Encode the review texts as a fixed-width matrix with vocab_size columns
# (a bag-of-words style encoding, not a learned word embedding).
vocab_size = 6000 # We set a maximum size for the vocabulary
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(reviews_train.text)
reviews_tokenized = pd.DataFrame(tokenizer.texts_to_matrix(reviews_train.text))

# Model architecture: the input width follows the vocabulary size and the
# softmax width follows the number of one-hot label columns, so the model
# stays consistent with the data if either changes.
sentiment_model = Sequential()
sentiment_model.add(Dense(512, input_shape=(vocab_size,), activation='relu'))
sentiment_model.add(Dense(256, activation='relu'))
sentiment_model.add(Dense(128, activation='relu'))
sentiment_model.add(Dense(dummy_sentiment.shape[1], activation='softmax'))
# compile model
sentiment_model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [56]:
# Number of distinct polarity labels (the empty '' label included).
reviews_train.polarity.nunique()

4

In [58]:
# Train the aspect-category classifier on the whole training matrix;
# no validation data is passed, so only training metrics are reported.
absa_model.fit(reviews_tokenized, dummy_category, epochs=100, verbose=1, )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7f8579bb0e50>

In [59]:
# Train the sentiment classifier on the same matrix;
# no validation data is passed, so only training metrics are reported.
sentiment_model.fit(reviews_tokenized, dummy_sentiment, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7f8580ad38d0>

In [64]:
import pickle

In [65]:
# Persist the trained aspect model. The context manager flushes and closes the
# file handle, which the bare open() call previously leaked.
# NOTE(review): load_model is already imported; consider absa_model.save(...)
# as an alternative to pickling a Keras model.
with open("semeval_5.2_absa.pkl", 'wb') as model_file:
    pickle.dump(absa_model, model_file)

In [66]:
# Persist the trained sentiment model. The context manager flushes and closes
# the file handle, which the bare open() call previously leaked.
# NOTE(review): load_model is already imported; consider
# sentiment_model.save(...) as an alternative to pickling a Keras model.
with open("semeval_5.2_sent.pkl", 'wb') as model_file:
    pickle.dump(sentiment_model, model_file)

In [60]:
# Example review sentences used below to try out both trained models.
test_reviews = [
    "Good, fast service.",
    "The hostess was very pleasant.",
    "The bread was stale, the salad was overpriced and empty.",
    "The food we ordered was excellent, although I wouldn't say the margaritas were anything to write home about.",
    "This place has totally weird decor, stairs going up with mirrored walls - I am surprised how no one yet broke their head or fall off the stairs"
]

In [61]:
# Lower-case the example reviews, then reduce each one to the head nouns of
# its noun chunks; these noun strings are what the aspect model is fed.
test_reviews = [text.lower() for text in test_reviews]
test_aspect_terms = [
    ' '.join(
        chunk.root.text
        for chunk in doc.noun_chunks
        if chunk.root.pos_ == 'NOUN'
    )
    for doc in nlp.pipe(test_reviews)
]
# Encode the noun strings with the tokenizer fitted on the training texts.
test_aspect_terms = pd.DataFrame(tokenizer.texts_to_matrix(test_aspect_terms))

In [63]:
# Reduce each example review to its sentiment-bearing words: lemmas of
# adjectives and verbs that are neither stop words nor punctuation.
test_sentiment_terms = []
for review in nlp.pipe(test_reviews):
    # NOTE(review): Doc.is_parsed belongs to the spaCy 2 API - confirm before upgrading spaCy.
    if review.is_parsed:
        test_sentiment_terms.append(' '.join(
            token.lemma_
            for token in review
            if not token.is_stop and not token.is_punct and token.pos_ in ("ADJ", "VERB")
        ))
    else:
        test_sentiment_terms.append('')
test_sentiment_terms = pd.DataFrame(tokenizer.texts_to_matrix(test_sentiment_terms))

# Models output
# argmax over the softmax probabilities is what predict_classes computed for a
# multi-class output; predict() is used because it exists in every Keras version.
test_aspect_categories = label_encoder.inverse_transform(
    np.argmax(absa_model.predict(test_aspect_terms), axis=-1))
test_sentiment = label_encoder2.inverse_transform(
    np.argmax(sentiment_model.predict(test_sentiment_terms), axis=-1))
# Iterate over the predictions themselves rather than a hard-coded range(5),
# so the report follows the number of example reviews.
for i, (sentiment, category) in enumerate(zip(test_sentiment, test_aspect_categories), start=1):
    print("Review " + str(i) + " is expressing a " + sentiment + " opinion about " + category)


Review 1 is expressing a positive opinion about SERVICE#GENERAL
Review 2 is expressing a positive opinion about SERVICE#GENERAL
Review 3 is expressing a negative opinion about FOOD#QUALITY
Review 4 is expressing a positive opinion about FOOD#QUALITY
Review 5 is expressing a negative opinion about AMBIENCE#GENERAL
