In [None]:
# Library import
import xml.etree.ElementTree as ET
import spacy
import string
import numpy as np
import json
import random
import pickle
# Spell-corrector used to auto-correct words in each review sentence
# before it is tokenised (applied to the lower-cased text when parsing the XML).
from autocorrect import Speller
spell = Speller()

In [None]:
# Location of the restaurant-review training XML.
xml_filepath = './dataset/ABSA16_Restaurants_Train_SB1_v2.xml'

# Parse the file into an element tree and keep a handle on its root element,
# which the following cell walks to collect the sentences.
tree = ET.ElementTree(file=xml_filepath)
root = tree.getroot()

In [None]:
# Build one [review_text, polarity] pair per <sentence> element.
#
# The sentence-level polarity is the sign of the summed opinion scores:
#   0 = negative, 1 = neutral, 2 = positive
# A sentence without any <Opinion> sums to 0 and is therefore labelled neutral.

# Score contributed by a single opinion. Any label other than the two listed
# here counts as +1 (i.e. is treated as positive).
OPINION_SCORE = {'negative': -1, 'neutral': 0}

data = []
for sentence in root.iterfind('Review/sentences/sentence'):
    # Skip sentences that carry no usable text instead of silently re-using
    # the text of the previous sentence (or failing on the very first one).
    text_nodes = sentence.findall('text')
    if not text_nodes or text_nodes[-1].text is None:
        continue
    review_text = spell(text_nodes[-1].text.lower())

    polarity_sum = sum(
        OPINION_SCORE.get(opinion.attrib['polarity'], 1)
        for opinion in sentence.iterfind('Opinions/Opinion')
    )

    if polarity_sum > 0:
        polarity = 2  # positive
    elif polarity_sum < 0:
        polarity = 0  # negative
    else:
        polarity = 1  # neutral

    data.append([review_text, polarity])

In [None]:
# Sanity check: number of [review_text, polarity] examples collected above.
print(len(data))

In [None]:
# spaCy pipeline used for tokenisation. The large English model ("lg") is
# loaded here and has to be downloaded once before this cell can run:
#   pip install -U pip setuptools wheel
#   pip install -U spacy
#   python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")


# 300-dimensional GloVe vectors for the tokens. The padding token gets an
# all-zero vector and the out-of-vocabulary token a random one.
emmbedding_dimensions = 300
emmbed_dict = {}

# Read the file as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open('../dataset/glove.6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines
        word = values[0]
        vector = np.asarray(values[1:], 'float64')
        emmbed_dict[word] = vector

# The special tokens are assigned after loading so that entries named 'pad'
# or 'oov' in the GloVe file cannot override them.
emmbed_dict['pad'] = np.zeros(emmbedding_dimensions)

# A dedicated, seeded generator keeps the 'oov' vector (and hence the saved
# embedding matrix) identical between runs without touching the global
# `random` state.
OOV_SEED = 0
oov_rng = random.Random(OOV_SEED)
emmbed_dict['oov'] = np.array(
    [oov_rng.uniform(-1, 1) for _ in range(emmbedding_dimensions)]
)

In [None]:
# Tokenise every sentence with spaCy, strip punctuation and replace words that
# have no embedding by 'oov'. While doing so, collect the training vocabulary
# and the length of the longest sentence.
#
# NOTE: data[i][0] is overwritten in place (text -> list of tokens), so this
# cell expects the raw text produced by the XML-parsing cell as its input.
# NOTE: a review word that is literally 'pad' or 'oov' is found in emmbed_dict
# and therefore shares its entry with the corresponding special token.
train_data_vocab = set()
train_data_vocab.add('pad')
train_data_vocab.add('oov')
max_sent_length = 0

for i in range(len(data)):
    sent = data[i][0]
    token_lst = []
    for spacy_token in nlp(sent):
        # Punctuation becomes a space. The apostrophe is kept but gets a space
        # in front of it, so that e.g. "'s" ends up as a token of its own.
        # (The builtin name `str` is deliberately not used as a variable.)
        cleaned = ''.join(
            " '" if ch == "'" else (' ' if ch in string.punctuation else ch)
            for ch in spacy_token.text
        )
        for word in cleaned.split():
            if word in emmbed_dict:
                token_lst.append(word)
                train_data_vocab.add(word)
            else:
                token_lst.append('oov')

    max_sent_length = max(len(token_lst), max_sent_length)
    data[i][0] = token_lst


print('Maximum Sentence Length is: ',max_sent_length)

In [None]:
# Assign every vocabulary token an integer index, in the iteration order of
# the vocabulary set. That order is arbitrary, so the indices are only
# meaningful together with this mapping.
token_to_index = {
    vocab_token: position
    for position, vocab_token in enumerate(train_data_vocab)
}

In [None]:
# Turn every token list into a list of vocabulary indices of exactly
# max_sent_length entries, filling the tail with the index of 'pad'.
pad_index = token_to_index['pad']
for example in data:
    kept_tokens = example[0][:max_sent_length]
    indices = [token_to_index[tok] for tok in kept_tokens]
    indices += [pad_index] * (max_sent_length - len(indices))
    example[0] = indices

In [None]:
# Row i of the matrix is the embedding of the token whose index is i.
# The rows are ordered by token_to_index itself rather than by iterating the
# vocabulary set a second time, so the alignment between indices and rows does
# not depend on the set's iteration order staying unchanged between cells.
tokens_by_index = sorted(token_to_index, key=token_to_index.get)
embedding_matrix = np.array([emmbed_dict[token] for token in tokens_by_index])

In [None]:
# Save the embedding matrix in NumPy's binary format.
# np.save appends the '.npy' extension because the given name does not have one.
np.save('embedding_matrix_restaurants_sentence_level',embedding_matrix)

In [None]:
# Re-key the examples by their position: {0: data[0], 1: data[1], ...}.
dump_this_dict = dict(enumerate(data))

In [None]:
# Persist the indexed training examples.
# NOTE: despite the '.json' extension the file is written with pickle
# (binary mode), so it must be read back with pickle.load, not json.load.
with open('train_restaurants_sentence_level.json', 'wb') as f:
    pickle.dump(dump_this_dict, f)

In [None]:
# Read the pickled examples back. From here on `data` is rebound to whatever
# the file contains - when it is the file written by the previous cell, that is
# a dict keyed by integer position rather than the list used so far.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('train_restaurants_sentence_level.json', 'rb') as f:
    data = pickle.load(f)

In [None]:
# Dump the token -> index mapping so that the test data can be encoded with
# the same indices.
# NOTE: despite the '.json' extension this is a pickle file (binary mode).
with open('token_to_index_restaurants_sentence_level.json', 'wb') as f:
    pickle.dump(token_to_index, f)

In [None]:
# Sanity check: dimensions of the embedding matrix built above.
print(embedding_matrix.shape)

In [None]:
# Class distribution of the examples: label 0 is negative, 1 is neutral and
# every other label is counted as positive.
labels = [data[i][1] for i in range(len(data))]

total = len(labels)
neg = labels.count(0)
neu = labels.count(1)
pos = total - neg - neu

print(total, pos, neg, neu)