In [271]:
import os
from collections import Counter
import xml.etree.ElementTree as ET

dir_path = os.path.dirname(os.path.realpath("__file__"))

train_data_dir = dir_path +'/ABSA complete Dataset/ABSA Train/Laptops_Train.xml'

def read_and_preprocess_data(data_directory):
    tree = ET.parse(data_directory)
    root = tree.getroot()
    sentences, aspect_terms = [],[]
    unique_aspect_terms = set(list())
    sentence_iterator = root.iter('sentence')
    for index, sentence in enumerate(sentence_iterator):
        sentence_text = sentence.find('text').text
        aspect_term = list()
        for aspect in sentence.iter('aspectTerm'):
            term = aspect.get('term')
            unique_aspect_terms.add(term)
            sentiment = aspect.get('polarity')
            if sentiment.lower()!="conflict":
                aspect_term.append((term,sentiment))
        aspect_terms.append(aspect_term)
        sentences.append(sentence_text)
    return sentences,aspect_terms,unique_aspect_terms
    


In [272]:
sentences,aspect_terms,unique_aspects = read_and_preprocess_data(train_data_dir)

In [273]:
sentences[3047]

'I think I might rather suffer for something that is simple to fix in my opinion.'

In [274]:
aspect_terms[0]

[('cord', 'neutral'), ('battery life', 'positive')]

In [275]:
index_aspect_mapping = {index:value for index,value in enumerate(unique_aspects)}

In [276]:
index_label_mapping = {index:value for index,value in enumerate(['negative','neutral','positive'])}
label_index_mapping = {value:index for index,value in enumerate(['negative','neutral','positive'])}

In [277]:
index_label_mapping

{0: 'negative', 1: 'neutral', 2: 'positive'}

In [278]:
label_index_mapping

{'negative': 0, 'neutral': 1, 'positive': 2}

In [279]:
len(sentences)

3048

In [280]:
def create_dataset(sentences,aspect_terms_sentiment):
    print("sentences",len(sentences),len(aspect_terms_sentiment))
    input_instances = []
    target_words =[]
    labels = []
    for index,aspect_sentiment_pairs in enumerate(aspect_terms_sentiment):
#         print(aspect_sentiment_pairs)
        for aspect_senti in aspect_sentiment_pairs:
#             print("index",index)
            input_instances.append(sentences[index])
            target_words.append(aspect_senti[0])
            labels.append(aspect_senti[1])
    return input_instances,target_words,labels
        
    

In [281]:
input_text,target_words,labels = create_dataset(sentences,aspect_terms)

sentences 3048 3048


In [282]:
data_dict = {"text":input_text,'aspect_words':target_words,'labels':labels}

In [283]:
import pandas as pd
dataset = pd.DataFrame(data_dict)
dataset

Unnamed: 0,text,aspect_words,labels
0,I charge it at night and skip taking the cord ...,cord,neutral
1,I charge it at night and skip taking the cord ...,battery life,positive
2,The tech guy then said the service center does...,service center,negative
3,The tech guy then said the service center does...,"""sales"" team",negative
4,The tech guy then said the service center does...,tech guy,neutral
...,...,...,...
2323,We also use Paralles so we can run virtual mac...,Windows 7 Home Premium,neutral
2324,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral
2325,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral
2326,"How Toshiba handles the repair seems to vary, ...",repair,positive


In [284]:
from sklearn.model_selection import train_test_split
train, val= train_test_split(dataset, train_size = 0.90)


In [285]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
train['labels'] = LE.fit_transform(train['labels'])
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,aspect_words,labels
1932,Could not keep up with me and finally the hard...,hard drive,0
2178,It is easy to navigate and update programs.,navigate,2
936,"Laptops are usually used on the go, so why not...",battery,0
1546,Every driver on the drivers/applications DVD i...,driver,2
1847,Its small enough where I can take it pretty mu...,screen,2
...,...,...,...
2069,A seventy dollar mouse!,mouse,1
1253,He loves it and it is easy to use and well the...,use,2
1054,This wiped out several programs that were inst...,programs,0
168,"Yes, they cost more, but they more than make u...",speed,2


In [286]:
val['labels'] = LE.transform(val['labels'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [287]:
val

Unnamed: 0,text,aspect_words,labels
1069,The only downfall is a lot of the software I h...,iWork,0
226,"Of course, I also have several great software ...",software packages,2
815,"It was a great laptop, ran great and was reall...",ran,2
433,With today's company fighting over marketshare...,staff,0
357,"I reloaded with Windows 7 Ultimate, and the Bl...",Fingerprint reader,0
...,...,...,...
439,"it has 3 usb ports, 1 sd memory card reader an...",sd memory card reader,1
290,The store honored their warrenty and made the ...,warrenty,2
876,it is very easy for anyone to use an apple and...,use,2
1323,"I just plug this into my 22"" Monitor and the s...",dual-core,1


In [288]:
len(list(set(val["text"].values).intersection(set(train["text"].values))))

138

In [289]:
def get_labels(prediction):
    predicted_label =  LE.inverse_transform([prediction])
    return predicted_label[0]

In [290]:
get_labels(0)

'negative'

In [291]:
import joblib
joblib.dump(LE,'sentiment_label_encoder')

['sentiment_label_encoder']

In [292]:
train['labels'].value_counts()


2    893
0    782
1    420
Name: labels, dtype: int64

In [293]:
val['labels'].value_counts()

2    101
0     88
1     44
Name: labels, dtype: int64

In [294]:
list(train['text'].values)

['Could not keep up with me and finally the hard drive went out.',
 'It is easy to navigate and update programs.',
 'Laptops are usually used on the go, so why not give you a better battery?',
 'Every driver on the drivers/applications DVD is everything you will need for a reload.',
 'Its small enough where I can take it pretty much anywhere, but still has a big enough screen to get everything done.',
 'Tried to make a recovey disk wouldnt get passed the first recovery disk.',
 'Ease of use is just one of the benefits I love about my Mac.',
 'Externally the keys on my keyboard are falling off, after a few uses the paint is rubbing off the button below the mouse pad and where the heals of my hands sit, and the screen has a terrible glare.',
 'Theres also iDVD, a program dedicated to putting all your favorite media together- photos, recordings, video projects into one program so that you can create the perfect memoir for your parents, family, siblings, and any other person important in y

In [295]:
import re
def clean_sentence(review):
  review = re.sub('<[^>]*>', ' ',review)
  review = re.sub('�', ' ', review)
  review = re.sub('https?:[a-z\S]+','',review)
  review = re.sub('\xa0','',review)
  review = re.sub('_','',review)
  review = re.sub(u'\u2004','',review)
  review = re.sub(u'\u2009','',review)

  review = re.sub('&nbsp','',review)
  review = re.sub('&ndash','',review)
  review = re.sub('\r','',review)
  review = re.sub('\t','',review)
  review = re.sub('\n',' ',review)
 
  review = re.sub('&thinsp','',review)
  review = re.sub('&times','',review)
  review = re.sub('\u200b','',review)
  review = re.sub('&rarr;;;','',review)
 
  return review

In [296]:
train["text"] = train["text"] .apply(str)
output = train["text"].apply(lambda x: clean_sentence(x))
train["text"] = output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [297]:
val["text"] = val["text"] .apply(str)
output = val["text"].apply(lambda x: clean_sentence(x))
val["text"] = output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [298]:
train

Unnamed: 0,text,aspect_words,labels
1932,Could not keep up with me and finally the hard...,hard drive,0
2178,It is easy to navigate and update programs.,navigate,2
936,"Laptops are usually used on the go, so why not...",battery,0
1546,Every driver on the drivers/applications DVD i...,driver,2
1847,Its small enough where I can take it pretty mu...,screen,2
...,...,...,...
2069,A seventy dollar mouse!,mouse,1
1253,He loves it and it is easy to use and well the...,use,2
1054,This wiped out several programs that were inst...,programs,0
168,"Yes, they cost more, but they more than make u...",speed,2


In [299]:
train.to_csv("train_sentiment_absa.csv",index=False)
val.to_csv("val_sentiment_absa.csv",index=False)


In [311]:
#restaurant data
data["labels"].value_counts()

2    1940
0     722
1     585
Name: labels, dtype: int64

In [312]:
#restaurant data

val_data["labels"].value_counts()

2    224
0     85
1     52
Name: labels, dtype: int64

## Test set

In [108]:
test_data_dir = dir_path +'/ABSA complete Dataset/ABSA Test/Laptops_Test_Gold.xml'

test_sentences,test_aspect_terms,test_unique_aspects = read_and_preprocess_data(test_data_dir)

In [110]:
input_text,target_words,labels = create_dataset(test_sentences,test_aspect_terms)

sentences 800 800


In [111]:
test_data_dict = {"text":input_text,'aspect_words':target_words,'labels':labels}

In [112]:
import pandas as pd
test = pd.DataFrame(test_data_dict)

In [113]:
test

Unnamed: 0,text,aspect_words,labels
0,"Boot time is super fast, around anywhere from ...",Boot time,positive
1,tech support would not fix the problem unless ...,tech support,negative
2,Set up was easy.,Set up,positive
3,Did not enjoy the new Windows 8 and touchscree...,Windows 8,negative
4,Did not enjoy the new Windows 8 and touchscree...,touchscreen functions,negative
...,...,...,...
633,I've had it for about 2 months now and found n...,software,neutral
634,I've had it for about 2 months now and found n...,updates,neutral
635,the latest version does not have a disc drive.,disc drive,neutral
636,Screen - although some people might complain a...,Screen,positive


In [114]:
test['labels'] = LE.transform(test['labels'])
test

Unnamed: 0,text,aspect_words,labels
0,"Boot time is super fast, around anywhere from ...",Boot time,2
1,tech support would not fix the problem unless ...,tech support,0
2,Set up was easy.,Set up,2
3,Did not enjoy the new Windows 8 and touchscree...,Windows 8,0
4,Did not enjoy the new Windows 8 and touchscree...,touchscreen functions,0
...,...,...,...
633,I've had it for about 2 months now and found n...,software,1
634,I've had it for about 2 months now and found n...,updates,1
635,the latest version does not have a disc drive.,disc drive,1
636,Screen - although some people might complain a...,Screen,2


In [115]:
test["labels"].value_counts()

2    341
1    169
0    128
Name: labels, dtype: int64

In [116]:
test["text"] = test["text"] .apply(str)
output = test["text"].apply(lambda x: clean_sentence(x))
test["text"] = output

In [118]:
list(test["text"].values)

['Boot time is super fast, around anywhere from 35 seconds to 1 minute.',
 'tech support would not fix the problem unless I bought your plan for $150 plus.',
 'Set up was easy.',
 'Did not enjoy the new Windows 8 and touchscreen functions.',
 'Did not enjoy the new Windows 8 and touchscreen functions.',
 "Other than not being a fan of click pads (industry standard these days) and the lousy internal speakers, it's hard for me to find things about this notebook I don't like, especially considering the $350 price tag.",
 "Other than not being a fan of click pads (industry standard these days) and the lousy internal speakers, it's hard for me to find things about this notebook I don't like, especially considering the $350 price tag.",
 "Other than not being a fan of click pads (industry standard these days) and the lousy internal speakers, it's hard for me to find things about this notebook I don't like, especially considering the $350 price tag.",
 'No installation disk (DVD) is included.

In [119]:
test.to_csv("test_sentiment_ABSA.csv",index=False)