In [38]:
import os
from collections import Counter
import xml.etree.ElementTree as ET

dir_path = os.path.dirname(os.path.realpath("__file__"))

train_data_dir = dir_path +'/ABSA complete Dataset/ABSA Train/Laptops_Train.xml'

def read_and_preprocess_data(data_directory):
    tree = ET.parse(data_directory)
    root = tree.getroot()
    sentences, aspect_terms = [],[]
    unique_aspect_terms = set(list())
    sentence_iterator = root.iter('sentence')
    for index, sentence in enumerate(sentence_iterator):
        sentence_text = sentence.find('text').text
        aspect_term = list()
        for aspect in sentence.iter('aspectTerm'):
            term = aspect.get('term')
            unique_aspect_terms.add(term)
            sentiment = aspect.get('polarity')
            if sentiment.lower()!="conflict":
                aspect_term.append((term,sentiment))
        aspect_terms.append(aspect_term)
        sentences.append(sentence_text)
    return sentences,aspect_terms,unique_aspect_terms
    


In [40]:
sentences,aspect_terms,unique_aspects = read_and_preprocess_data(train_data_dir)

In [70]:
sentences[3047]

'I think I might rather suffer for something that is simple to fix in my opinion.'

In [42]:
aspect_terms[0]

[('cord', 'neutral'), ('battery life', 'positive')]

In [44]:
index_aspect_mapping = {index:value for index,value in enumerate(unique_aspects)}

In [45]:
index_label_mapping = {index:value for index,value in enumerate(['negative','neutral','positive'])}
label_index_mapping = {value:index for index,value in enumerate(['negative','neutral','positive'])}

In [46]:
index_label_mapping

{0: 'negative', 1: 'neutral', 2: 'positive'}

In [47]:
label_index_mapping

{'negative': 0, 'neutral': 1, 'positive': 2}

In [69]:
len(sentences)

3048

In [81]:
def create_dataset(sentences,aspect_terms_sentiment):
    print("sentences",len(sentences),len(aspect_terms_sentiment))
    input_instances = []
    target_words =[]
    labels = []
    for index,aspect_sentiment_pairs in enumerate(aspect_terms_sentiment):
#         print(aspect_sentiment_pairs)
        for aspect_senti in aspect_sentiment_pairs:
#             print("index",index)
            input_instances.append(sentences[index])
            target_words.append(aspect_senti[0])
            labels.append(aspect_senti[1])
    return input_instances,target_words,labels
        
    

In [82]:
input_text,target_words,labels = create_dataset(sentences,aspect_terms)

sentences 3048 3048


In [83]:
data_dict = {"text":input_text,'aspect_words':target_words,'labels':labels}

In [84]:
import pandas as pd
dataset = pd.DataFrame(data_dict)
dataset

Unnamed: 0,text,aspect_words,labels
0,I charge it at night and skip taking the cord ...,cord,neutral
1,I charge it at night and skip taking the cord ...,battery life,positive
2,The tech guy then said the service center does...,service center,negative
3,The tech guy then said the service center does...,"""sales"" team",negative
4,The tech guy then said the service center does...,tech guy,neutral
...,...,...,...
2323,We also use Paralles so we can run virtual mac...,Windows 7 Home Premium,neutral
2324,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral
2325,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral
2326,"How Toshiba handles the repair seems to vary, ...",repair,positive


In [86]:
from sklearn.model_selection import train_test_split
train, val= train_test_split(dataset, train_size = 0.90)


In [88]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
train['labels'] = LE.fit_transform(train['labels'])
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,aspect_words,labels
1591,"Also, I have had alot of trouble with the sh...",shift key,0
1872,"The computer runs extremely slowly, whether op...",runs,0
351,HOW DOES THE POWER SUPPLY NOT WORK!!!,POWER SUPPLY,0
1118,"I made a photo book as a gift, on my computer,...",iTunes,1
1440,That included the extra Sony Sonic Stage softw...,extended life battery,1
...,...,...,...
498,I love the dock where I can simply drop a file...,program,1
1119,"I use this for my tutoring business, and since...",portability,2
1588,Later it held zero charge and its replacemen...,charge,0
2176,You can call HP and they want you to buy more ...,software,1


In [89]:
val['labels'] = LE.transform(val['labels'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [90]:
val

Unnamed: 0,text,aspect_words,labels
747,The case is carved out of a single block of al...,case,1
1094,"I bought a cordless mouse for it, but don't al...",cordless mouse,2
1811,Toshiba customer services will indirectly de...,Toshiba customer services,0
425,"Three weeks after I bought the netbook, the sc...",screen,0
941,Keyboard is great but primary and secondary co...,Keyboard,2
...,...,...,...
1143,What I'd like is for the laptop to run well wi...,memory,0
629,If you don't feel comfortable doing it yoursel...,case,2
1859,It was hard to handle and operate at school.,operate,0
81,It's so much easier to navigate through the op...,operating system,2


In [91]:
def get_labels(prediction):
    predicted_label =  LE.inverse_transform([prediction])
    return predicted_label[0]

In [94]:
get_labels(0)

'negative'

In [95]:
import joblib
joblib.dump(LE,'sentiment_label_encoder')

['sentiment_label_encoder']

In [96]:
train['labels'].value_counts()


2    903
0    771
1    421
Name: labels, dtype: int64

In [97]:
val['labels'].value_counts()

0    99
2    91
1    43
Name: labels, dtype: int64

In [100]:
list(train['text'].values)

['\xa0 Also, I have had alot of trouble with the shift key to go to other lines.',
 'The computer runs extremely slowly, whether opening Word or My Computer.',
 'HOW DOES THE POWER SUPPLY NOT WORK!!!',
 'I made a photo book as a gift, on my computer, pushed "Buy" and it drew from my iTunes account and sent the book to my house, the book was of perfect quality-exactly how I had created it and looked better than I had even imagined.',
 'That included the extra Sony Sonic Stage software, the speakers and the subwoofer I got (that WAS worth the money), the bluetooth mouse for my supposedly bluetooth enabled computer, the extended life battery and the Docking port.',
 'not using wired lan not sure what thats about.',
 'The internet was locekd and froze every time it was trying to be used, and the command prompt would not work at all.',
 'The touchpad is extremely sensitive, which is the only drawback.',
 'I had to re-install Windows within two weeks of the purchase and soon discovered crack

In [103]:
import re
def clean_sentence(review):
  review = re.sub('<[^>]*>', ' ',review)
  review = re.sub('�', ' ', review)
  review = re.sub('https?:[a-z\S]+','',review)
  review = re.sub('\xa0','',review)
  review = re.sub('_','',review)
  review = re.sub(u'\u2004','',review)
  review = re.sub(u'\u2009','',review)

  review = re.sub('&nbsp','',review)
  review = re.sub('&ndash','',review)
  review = re.sub('\r','',review)
  review = re.sub('\t','',review)
  review = re.sub('\n',' ',review)
 
  review = re.sub('&thinsp','',review)
  review = re.sub('&times','',review)
  review = re.sub('\u200b','',review)
  review = re.sub('&rarr;;;','',review)
 
  return review

In [104]:
train["text"] = train["text"] .apply(str)
output = train["text"].apply(lambda x: clean_sentence(x))
train["text"] = output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [105]:
val["text"] = val["text"] .apply(str)
output = val["text"].apply(lambda x: clean_sentence(x))
val["text"] = output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [106]:
train

Unnamed: 0,text,aspect_words,labels
1591,"Also, I have had alot of trouble with the shi...",shift key,0
1872,"The computer runs extremely slowly, whether op...",runs,0
351,HOW DOES THE POWER SUPPLY NOT WORK!!!,POWER SUPPLY,0
1118,"I made a photo book as a gift, on my computer,...",iTunes,1
1440,That included the extra Sony Sonic Stage softw...,extended life battery,1
...,...,...,...
498,I love the dock where I can simply drop a file...,program,1
1119,"I use this for my tutoring business, and since...",portability,2
1588,Later it held zero charge and its replacement...,charge,0
2176,You can call HP and they want you to buy more ...,software,1


In [107]:
train.to_csv("train_sentiment_absa.csv",index=False)
val.to_csv("val_sentiment_absa.csv",index=False)


## Test set

In [108]:
test_data_dir = dir_path +'/ABSA complete Dataset/ABSA Test/Laptops_Test_Gold.xml'

test_sentences,test_aspect_terms,test_unique_aspects = read_and_preprocess_data(test_data_dir)

In [110]:
input_text,target_words,labels = create_dataset(test_sentences,test_aspect_terms)

sentences 800 800


In [111]:
test_data_dict = {"text":input_text,'aspect_words':target_words,'labels':labels}

In [112]:
import pandas as pd
test = pd.DataFrame(test_data_dict)

In [113]:
test

Unnamed: 0,text,aspect_words,labels
0,"Boot time is super fast, around anywhere from ...",Boot time,positive
1,tech support would not fix the problem unless ...,tech support,negative
2,Set up was easy.,Set up,positive
3,Did not enjoy the new Windows 8 and touchscree...,Windows 8,negative
4,Did not enjoy the new Windows 8 and touchscree...,touchscreen functions,negative
...,...,...,...
633,I've had it for about 2 months now and found n...,software,neutral
634,I've had it for about 2 months now and found n...,updates,neutral
635,the latest version does not have a disc drive.,disc drive,neutral
636,Screen - although some people might complain a...,Screen,positive


In [114]:
test['labels'] = LE.transform(test['labels'])
test

Unnamed: 0,text,aspect_words,labels
0,"Boot time is super fast, around anywhere from ...",Boot time,2
1,tech support would not fix the problem unless ...,tech support,0
2,Set up was easy.,Set up,2
3,Did not enjoy the new Windows 8 and touchscree...,Windows 8,0
4,Did not enjoy the new Windows 8 and touchscree...,touchscreen functions,0
...,...,...,...
633,I've had it for about 2 months now and found n...,software,1
634,I've had it for about 2 months now and found n...,updates,1
635,the latest version does not have a disc drive.,disc drive,1
636,Screen - although some people might complain a...,Screen,2


In [115]:
test["labels"].value_counts()

2    341
1    169
0    128
Name: labels, dtype: int64

In [116]:
test["text"] = test["text"] .apply(str)
output = test["text"].apply(lambda x: clean_sentence(x))
test["text"] = output

In [118]:
list(test["text"].values)

['Boot time is super fast, around anywhere from 35 seconds to 1 minute.',
 'tech support would not fix the problem unless I bought your plan for $150 plus.',
 'Set up was easy.',
 'Did not enjoy the new Windows 8 and touchscreen functions.',
 'Did not enjoy the new Windows 8 and touchscreen functions.',
 "Other than not being a fan of click pads (industry standard these days) and the lousy internal speakers, it's hard for me to find things about this notebook I don't like, especially considering the $350 price tag.",
 "Other than not being a fan of click pads (industry standard these days) and the lousy internal speakers, it's hard for me to find things about this notebook I don't like, especially considering the $350 price tag.",
 "Other than not being a fan of click pads (industry standard these days) and the lousy internal speakers, it's hard for me to find things about this notebook I don't like, especially considering the $350 price tag.",
 'No installation disk (DVD) is included.

In [119]:
test.to_csv("test_sentiment_ABSA.csv",index=False)