In [1]:
import os
from collections import Counter
import xml.etree.ElementTree as ET

dir_path = os.path.dirname(os.path.realpath("__file__"))

train_data_dir = dir_path +'/ABSA complete Dataset/ABSA Train/Restaurants_Train.xml'

def read_and_preprocess_data(data_directory):
    tree = ET.parse(data_directory)
    root = tree.getroot()
    sentences, aspect_terms = [],[]
    unique_aspect_terms = set(list())
    sentence_iterator = root.iter('sentence')
    for index, sentence in enumerate(sentence_iterator):
        sentence_text = sentence.find('text').text
        aspect_term = list()
        for aspect in sentence.iter('aspectTerm'):
            term = aspect.get('term')
            unique_aspect_terms.add(term)
            sentiment = aspect.get('polarity')
            if sentiment.lower()!="conflict":
                aspect_term.append((term,sentiment))
        aspect_terms.append(aspect_term)
        sentences.append(sentence_text)
    return sentences,aspect_terms,unique_aspect_terms
    


In [2]:
sentences,aspect_terms,unique_aspects = read_and_preprocess_data(train_data_dir)

In [4]:
sentences[2000]

'Good bagels and good cream cheese.'

In [5]:
aspect_terms[0]

[('staff', 'negative')]

In [6]:
index_aspect_mapping = {index:value for index,value in enumerate(unique_aspects)}

In [7]:
index_label_mapping = {index:value for index,value in enumerate(['negative','neutral','positive'])}
label_index_mapping = {value:index for index,value in enumerate(['negative','neutral','positive'])}

In [8]:
index_label_mapping

{0: 'negative', 1: 'neutral', 2: 'positive'}

In [9]:
label_index_mapping

{'negative': 0, 'neutral': 1, 'positive': 2}

In [10]:
len(sentences)

3044

In [11]:
def create_dataset(sentences,aspect_terms_sentiment):
    print("sentences",len(sentences),len(aspect_terms_sentiment))
    input_instances = []
    target_words =[]
    labels = []
    for index,aspect_sentiment_pairs in enumerate(aspect_terms_sentiment):
#         print(aspect_sentiment_pairs)
        for aspect_senti in aspect_sentiment_pairs:
#             print("index",index)
            input_instances.append(sentences[index])
            target_words.append(aspect_senti[0])
            labels.append(aspect_senti[1])
    return input_instances,target_words,labels
        
    

In [12]:
input_text,target_words,labels = create_dataset(sentences,aspect_terms)

sentences 3044 3044


In [41]:
get_labels(2)

'positive'

In [13]:
data_dict = {"text":input_text,'aspect_words':target_words,'labels':labels}

In [14]:
import pandas as pd
dataset = pd.DataFrame(data_dict)
dataset

Unnamed: 0,text,aspect_words,labels
0,But the staff was so horrible to us.,staff,negative
1,"To be completely fair, the only redeeming fact...",food,positive
2,"The food is uniformly exceptional, with a very...",food,positive
3,"The food is uniformly exceptional, with a very...",kitchen,positive
4,"The food is uniformly exceptional, with a very...",menu,neutral
...,...,...,...
3603,Each table has a pot of boiling water sunken i...,pot of boiling water,neutral
3604,Each table has a pot of boiling water sunken i...,meats,neutral
3605,Each table has a pot of boiling water sunken i...,vegetables,neutral
3606,Each table has a pot of boiling water sunken i...,rice,neutral


In [15]:
from sklearn.model_selection import train_test_split
train, val= train_test_split(dataset, train_size = 0.90)


In [16]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
train['labels'] = LE.fit_transform(train['labels'])
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,aspect_words,labels
2325,We had great desserts (including the best cann...,cannoli,2
1578,Great pizza for lunch place.,pizza,2
919,It's a great place to pick up a cheap lunch or...,lunch,2
2071,Its a nice quiet location to go eat a good mea...,location,2
2427,"The first 2 courses were very good, but the ch...",dessert wine,0
...,...,...,...
2307,Zero ambiance to boot.,ambiance,0
2814,"Kind, attentive wait staff.",wait staff,2
1483,$20 gets you unlimited sushi of a very high qu...,sushi places,2
2503,"Their sushi, Kamikaze and other Rolls are fres...",sushi,2


In [17]:
val['labels'] = LE.transform(val['labels'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
val

Unnamed: 0,text,aspect_words,labels
69,This is a consistently great place to dine for...,lunch,1
2723,The workers there also absolutely load the bag...,workers,1
3170,"If it isn't for the food (A+++), it must be th...",service,2
2790,The whole set up is truly unprofessional and I...,staff,0
3214,The staff was too busy ordering sushi for dinn...,sushi,1
...,...,...,...
1949,We ordered some beef and noodle soup dishes fr...,beef,0
1050,"Looking around, I saw a room full of New Yorke...",clubhouse,0
3434,THE BIG COMPLAINT: NO TOASTING AVAILABLE.,TOASTING,0
3530,Interesting other dishes for a change include ...,salmon caserole,2


In [19]:
def get_labels(prediction):
    predicted_label =  LE.inverse_transform([prediction])
    return predicted_label[0]

In [20]:
get_labels(0)

'negative'

In [21]:
import joblib
joblib.dump(LE,'sentiment_label_encoder_restaurent')

['sentiment_label_encoder_restaurent']

In [22]:
train['labels'].value_counts()


2    1940
0     722
1     585
Name: labels, dtype: int64

In [23]:
val['labels'].value_counts()

2    224
0     85
1     52
Name: labels, dtype: int64

In [24]:
list(train['text'].values)

["We had great desserts (including the best cannoli I've ever had) and then they offered an after dinner drink, on the house.",
 'Great pizza for lunch place.',
 "It's a great place to pick up a cheap lunch or dinner.",
 'Its a nice quiet location to go eat a good meal, relax, be able to talk and have a very good time.',
 'The first 2 courses were very good, but the chocolate sampler was too rich for me and the dessert wine far too sweet.',
 'During the course of the past 3 months, the chef and staff changed and it was not for the better.',
 'I really like both the scallops and the mahi mahi (on saffron risotto-yum!).',
 'Meanwhile, the bartender continued to pour champagne from his reserve after we had finished our bottle and we enjoyed an amuse of turnip soup with pureed basil, gratis.',
 "It's all about the food !!",
 'If you are in a big group, this place is perfect because it recomends sharing - they have lazy susans on most tables - even families can feel comfortable here.',
 "Th

In [25]:
import re
def clean_sentence(review):
  review = re.sub('<[^>]*>', ' ',review)
  review = re.sub('�', ' ', review)
  review = re.sub('https?:[a-z\S]+','',review)
  review = re.sub('\xa0','',review)
  review = re.sub('_','',review)
  review = re.sub(u'\u2004','',review)
  review = re.sub(u'\u2009','',review)

  review = re.sub('&nbsp','',review)
  review = re.sub('&ndash','',review)
  review = re.sub('\r','',review)
  review = re.sub('\t','',review)
  review = re.sub('\n',' ',review)
 
  review = re.sub('&thinsp','',review)
  review = re.sub('&times','',review)
  review = re.sub('\u200b','',review)
  review = re.sub('&rarr;;;','',review)
 
  return review

In [26]:
train["text"] = train["text"] .apply(str)
output = train["text"].apply(lambda x: clean_sentence(x))
train["text"] = output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [27]:
val["text"] = val["text"] .apply(str)
output = val["text"].apply(lambda x: clean_sentence(x))
val["text"] = output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
train

Unnamed: 0,text,aspect_words,labels
2325,We had great desserts (including the best cann...,cannoli,2
1578,Great pizza for lunch place.,pizza,2
919,It's a great place to pick up a cheap lunch or...,lunch,2
2071,Its a nice quiet location to go eat a good mea...,location,2
2427,"The first 2 courses were very good, but the ch...",dessert wine,0
...,...,...,...
2307,Zero ambiance to boot.,ambiance,0
2814,"Kind, attentive wait staff.",wait staff,2
1483,$20 gets you unlimited sushi of a very high qu...,sushi places,2
2503,"Their sushi, Kamikaze and other Rolls are fres...",sushi,2


In [30]:
train.to_csv("train_sentiment_absa_restaurent.csv",index=False)
val.to_csv("val_sentiment_absa_restaurent.csv",index=False)


## Test set

In [31]:
test_data_dir = dir_path +'/ABSA complete Dataset/ABSA Test/Restaurants_Test_Gold.xml'

test_sentences,test_aspect_terms,test_unique_aspects = read_and_preprocess_data(test_data_dir)

In [32]:
input_text,target_words,labels = create_dataset(test_sentences,test_aspect_terms)

sentences 800 800


In [33]:
test_data_dict = {"text":input_text,'aspect_words':target_words,'labels':labels}

In [34]:
import pandas as pd
test = pd.DataFrame(test_data_dict)

In [35]:
test

Unnamed: 0,text,aspect_words,labels
0,The bread is top notch as well.,bread,positive
1,I have to say they have one of the fastest del...,delivery times,positive
2,Food is always fresh and hot- ready to eat!,Food,positive
3,Did I mention that the coffee is OUTSTANDING?,coffee,positive
4,"Certainly not the best sushi in New York, howe...",place,positive
...,...,...,...
1115,"Creamy appetizers--taramasalata, eggplant sala...",Creamy appetizers,positive
1116,"Creamy appetizers--taramasalata, eggplant sala...",warm pitas,neutral
1117,"Creamy appetizers--taramasalata, eggplant sala...",taramasalata,positive
1118,"Creamy appetizers--taramasalata, eggplant sala...",eggplant salad,positive


In [36]:
test['labels'] = LE.transform(test['labels'])
test

Unnamed: 0,text,aspect_words,labels
0,The bread is top notch as well.,bread,2
1,I have to say they have one of the fastest del...,delivery times,2
2,Food is always fresh and hot- ready to eat!,Food,2
3,Did I mention that the coffee is OUTSTANDING?,coffee,2
4,"Certainly not the best sushi in New York, howe...",place,2
...,...,...,...
1115,"Creamy appetizers--taramasalata, eggplant sala...",Creamy appetizers,2
1116,"Creamy appetizers--taramasalata, eggplant sala...",warm pitas,1
1117,"Creamy appetizers--taramasalata, eggplant sala...",taramasalata,2
1118,"Creamy appetizers--taramasalata, eggplant sala...",eggplant salad,2


In [37]:
test["labels"].value_counts()

2    728
1    196
0    196
Name: labels, dtype: int64

In [38]:
test["text"] = test["text"] .apply(str)
output = test["text"].apply(lambda x: clean_sentence(x))
test["text"] = output

In [39]:
list(test["text"].values)

['The bread is top notch as well.',
 'I have to say they have one of the fastest delivery times in the city.',
 'Food is always fresh and hot- ready to eat!',
 'Did I mention that the coffee is OUTSTANDING?',
 'Certainly not the best sushi in New York, however, it is always fresh, and the place is very clean, sterile.',
 'I trust the people at Go Sushi, it never disappoints.',
 'Straight-forward, no surprises, very decent Japanese food.',
 'BEST spicy tuna roll, great asian salad.',
 'BEST spicy tuna roll, great asian salad.',
 'Try the rose roll (not on menu).',
 'Try the rose roll (not on menu).',
 'I love the drinks, esp lychee martini, and the food is also VERY good.',
 'I love the drinks, esp lychee martini, and the food is also VERY good.',
 'I love the drinks, esp lychee martini, and the food is also VERY good.',
 'In fact, this was not a Nicoise salad and was barely eatable.',
 "While there's a decent menu, it shouldn't take ten minutes to get your drinks and 45 for a dessert p

In [40]:
test.to_csv("test_sentiment_ABSA_restaurent.csv",index=False)