# Import

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
!pip install torch==1.4.0
import torch
print(torch.__version__)

1.4.0


In [36]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 

In [37]:
!pip install stanfordnlp
import stanfordnlp
stanfordnlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: en_ewt
Download location: /root/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [00:40<00:00, 5.83MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


In [38]:
from xml.etree import cElementTree as ET
import pandas as pd
import string
import spacy
import re
import random
import sklearn
from sklearn.model_selection import train_test_split
import csv
import numpy as np

In [39]:
root_dir = "/content/drive/MyDrive/NLP Project/Topic 5/"
train_folder = "Train/SemEval'14-ABSA-TrainData_v2 & AnnotationGuidelines/"
laptop_train_file = "Laptop_Train_v2.xml"
restaurant_train_file = "Restaurants_Train_v2.xml"
test_1_folder = "Test1/ABSA_TestData_PhaseA/ABSA_TestData_PhaseA/"
laptop_test_1_file = "Laptops_Test_Data_PhaseA.xml"
restaurant_test_1_file = "Restaurants_Test_Data_PhaseA.xml"
test_2_folder = "Test2/ABSA_TestData_PhaseB/"
laptop_test_2_file = "Laptops_Test_Data_phaseB.xml"
restaurant_test_2_file = "Restaurants_Test_Data_phaseB.xml"

# Data Extraction & Cleaning

In [40]:
class AspectWord():
    def __init__(self, aspect_term, sentiment_terms, polarity = None):
        # Words
        self.aspect_term = aspect_term
        # List of Words
        self.sentiment_terms = sentiment_terms
        # Polarity: +,-,neutral
        self.polarity = polarity

    def __str__(self):
        return f"Aspect Term: {self.aspect_term} Sentiment Terms: {self.sentiment_terms} Polarity: {self.polarity}"
    
    def __repr__(self):
        return self.__str__()

In [41]:
class AspectCategory():
    def __init__(self, aspect_category, polarity = None):
        # Words
        self.aspect_category = aspect_category
        # Polarity: +,-,neutral
        self.polarity = polarity

    def __str__(self):
        return f"Aspect Category: {self.aspect_category} Polarity: {self.polarity}"
    
    def __repr__(self):
        return self.__str__()

In [42]:
class Sentence():
    def __init__(self, sentence_id, sentence, data_type = "Train",
                 actual_sentence_id = None, actual_aspect_words = [], actual_aspect_categories = None): 
        # Int
        self.sentence_id = sentence_id
        # String
        self.sentence = sentence
        # String
        self.data_type = data_type
        # String
        self.actual_sentence_id = actual_sentence_id
        # List of AspectWord
        self.actual_aspect_words = actual_aspect_words
        # List of AspectWord
        self.generated_aspect_words = []
        # List of AspectCategory
        self.actual_aspect_categories = actual_aspect_categories
        # Polarity
        self.actual_sentence_polarity = None
        # Polarity
        self.generated_sentence_polarity = None
        
    def __str__(self):
        if self.data_type == "Train":
            return f"ID: {self.sentence_id} Sentence: {self.sentence} {self.actual_aspect_words} generated polarity: {self.generated_sentence_polarity}"
        else:
            return f"ID: {self.sentence_id} Sentence: {self.sentence} {self.generated_aspect_words} generated polarity: {self.generated_sentence_polarity}"
    
    def __repr__(self):
        return self.__str__()

    def calculate_polarity(self):
        temp_polarity = 0
        for actual_aspect_word in self.actual_aspect_words:
          if(actual_aspect_word.polarity == "positive"):
            temp_polarity+=1
          elif(actual_aspect_word.polarity == "negative"):
            temp_polarity-=1
        if(temp_polarity > 0):
          self.actual_sentence_polarity = "positive"
        elif(temp_polarity < 0):
          self.actual_sentence_polarity = "negative"
        else:
          self.actual_sentence_polarity = "neutral"    

In [97]:
def data_list_to_df(data_list, sentence_polarity = "actual", aspect_terms = True, no_of_rows = 5):
    
    data = pd.DataFrame(columns = ["ID","Sentence","Actual Aspect Terms","Actual Polarities","Actual Sentence Polarity","Generated Sentence Polarity"])
    
    for i in range(min(len(data_list), no_of_rows)):
        data.loc[len(data.index)] = [data_list[i].sentence_id, data_list[i].sentence, [x.aspect_term for x in data_list[i].actual_aspect_words], [x.polarity for x in data_list[i].actual_aspect_words], data_list[i].actual_sentence_polarity, data_list[i].generated_sentence_polarity] 
    
    if sentence_polarity == "actual":
        data.drop(["Generated Sentence Polarity"], axis = 1, inplace = True)
    elif sentence_polarity == "generated":
        data.drop(["Actual Sentence Polarity"], axis = 1, inplace = True)
    elif sentence_polarity == "none":
        data.drop(["Actual Sentence Polarity", "Generated Sentence Polarity"], axis = 1, inplace = True)

    if not aspect_terms:
        data.drop(["Actual Aspect Terms","Actual Polarities"], axis = 1, inplace = True)

    return data

In [44]:
def xml_to_sentences(path,data_type = "Train"):
    data = []
    tree = ET.parse(path)
    root = tree.getroot()
    id = 1
    for page in root.findall('sentence'):
        sentence_id = id
        sentence = page[0].text
        actual_sentence_id = page.attrib["id"]
        actual_aspect_terms = []
        actual_polarity = []
        if (data_type == "Train"):
            if len(page)> 1 and page[1].tag == "aspectTerms":
                aspect_terms_data = [x.attrib for x in page[1]]
                aspect_categories_data = None
                
                # Ignore Sentences without aspect terms if training data
                aspect_words = []
                for x in aspect_terms_data:
                    aspect_words.append(AspectWord(x['term'],[],x['polarity']))

                aspect_categories = []
                if len(page) > 2:
                    aspect_categories_data = [x.attrib for x in page[2]]
                    for x in aspect_categories_data:
                        aspect_categories.append(AspectWord(x['category'],x['polarity']))
                id += 1
                curr_sentence = Sentence(sentence_id, sentence, data_type, actual_sentence_id, aspect_words, aspect_categories)
                curr_sentence.calculate_polarity()
                data.append(curr_sentence)
        elif (data_type == "Test"):
            curr_sentence = Sentence(sentence_id, sentence, data_type, actual_sentence_id)
            curr_sentence.calculate_polarity()
            data.append(curr_sentence)
            id += 1
        else:
            print("Incorrect Data Type")
            return None
    return data

# Train the Model

In [45]:
nlp = stanfordnlp.Pipeline()

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand

In [46]:
def pos_tagger(text):
    doc = nlp(text)
    pos_tags = {}
    for sent in doc.sentences:
        for wrd in sent.words:
            pos_tags[wrd.text] = wrd.pos
            
    return pos_tags

In [47]:
def train(sentences):
    adjective_terms_frequency = {}
    for i in range(len(sentences)):
        pos_tags = pos_tagger(sentences[i].sentence)
        for word, pos_tag in pos_tags.items():
            if pos_tag in ["JJ", "JJR", "JJS"]:
                if sentences[i].actual_sentence_polarity == 'positive':
                    polarity = 1
                elif sentences[i].actual_sentence_polarity == 'negative':
                    polarity = -1
                else:
                    polarity = 0
                if word in list(adjective_terms_frequency.keys()): 
                    adjective_terms_frequency[word]['frequency'] += 1
                    adjective_terms_frequency[word]['polarity'] += (polarity)
                else:
                    adjective_terms_frequency[word] = {'frequency':1, 'polarity':polarity}
    
    for key in list(adjective_terms_frequency.keys()):
        adjective_terms_frequency[key]['polarity'] = (adjective_terms_frequency[key]['polarity']/adjective_terms_frequency[key]['frequency'])

    return adjective_terms_frequency

# Test the Model

 

In [48]:
def predict(data, adjectives_terms_frequency):
    unk_words = set()
    for i in range(len(data)):
        sentence = data[i].sentence
        unk = 0
        polarity = 0
        adj_terms=0
        word_list = sentence.split()
        for term in word_list:
            if term not in adjectives_terms_frequency.keys():
                unk_words.add(term)
                unk+=1
            else:
                adj_terms+=1
                polarity += adjectives_terms_frequency[term]['polarity']
        if (adj_terms - unk!= 0):
            data[i].generated_sentence_polarity = polarity/(adj_terms-unk)
        else:
            data[i].generated_sentence_polarity = 0
    return len(unk_words)

def find_polarity(value, threshold = 0):
    if (value < -1*threshold):
        return "negative"
    elif (value > threshold):
        return "positive"
    else:
        return "neutral"

def classify(data, adjectives_terms_frequency):
    unk_words = predict(data, adjectives_terms_frequency)
    for i in range(len(data)):
        data[i].generated_sentence_polarity = find_polarity(data[i].generated_sentence_polarity)
    return unk_words

# Performance of Sentence Polarity Classification

In [49]:
def sentence_polarity_classification_performance(data): 
    matches = 0
    non_matches = 0
    for i in range(len(data)):
        if(data[i].actual_sentence_polarity == data[i].generated_sentence_polarity):
            matches+=1
        else:
            non_matches+=1

    return {"Sentence Polarity Matches":matches, "Sentence Polarity Non Matches":non_matches, "Sentence Polarity Classification Precision":matches/(matches+non_matches)}

# Execution

## Load Data

### Laptop

In [75]:
laptop_train_data = xml_to_sentences(root_dir + train_folder + laptop_train_file)
print(f"Total sentences: {len(laptop_train_data)}")
laptop_train_data_df = data_list_to_df(laptop_train_data, sentence_polarity="actual")
laptop_train_data_df.head()

Total sentences: 1488


Unnamed: 0,ID,Sentence,Actual Aspect Terms,Actual Polarities,Actual Sentence Polarity
0,1,I charge it at night and skip taking the cord ...,"[cord, battery life]","[neutral, positive]",positive
1,2,The tech guy then said the service center does...,"[service center, ""sales"" team, tech guy]","[negative, negative, neutral]",negative
2,3,"it is of high quality, has a killer GUI, is ex...","[quality, GUI, applications, use]","[positive, positive, positive, positive]",positive
3,4,Easy to start up and does not overheat as much...,[start up],[positive],positive
4,5,"I even got my teenage son one, because of the ...","[features, iChat, Photobooth, garage band]","[positive, positive, positive, positive]",positive


In [51]:
random.seed(7)
np.random.seed(7)
# Split into test and train
laptop_train_data, laptop_valid_data = sklearn.model_selection.train_test_split(laptop_train_data, test_size=0.1)
print(f"Size of Training Data: {len(laptop_valid_data)}\nSize of Validation Data: {len(laptop_train_data)}")

Size of Training Data: 149
Size of Validation Data: 1339


### Restaurant

In [76]:
restaurant_train_data = xml_to_sentences(root_dir + train_folder + restaurant_train_file)
print(f"Total sentences: {len(restaurant_train_data)}")
restaurant_train_data_df = data_list_to_df(restaurant_train_data, sentence_polarity="actual")
restaurant_train_data_df.head()

Total sentences: 2021


Unnamed: 0,ID,Sentence,Actual Aspect Terms,Actual Polarities,Actual Sentence Polarity
0,1,But the staff was so horrible to us.,[staff],[negative],negative
1,2,"To be completely fair, the only redeeming fact...",[food],[positive],positive
2,3,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]","[positive, positive, neutral]",positive
3,4,"Not only was the food outstanding, but the lit...","[food, perks]","[positive, positive]",positive
4,5,Our agreed favorite is the orrechiete with sau...,"[orrechiete with sausage and chicken, waiters,...","[positive, positive, neutral, neutral]",positive


In [54]:
random.seed(7)
np.random.seed(7)
# Split into test and train
restaurant_train_data, restaurant_valid_data = sklearn.model_selection.train_test_split(restaurant_train_data, test_size=0.1)
print(f"Size of Training Data: {len(restaurant_train_data)}\nSize of Validation Data: {len(restaurant_valid_data)}")

Size of Training Data: 1636
Size of Validation Data: 182


## Training

### Laptop

In [78]:
laptop_adj_freq = train(laptop_train_data)
df = pd.DataFrame.from_dict(laptop_adj_freq, orient = 'index', columns =["Polarity"])
df.head()

Unnamed: 0,Polarity


### Restaurant

In [79]:
restaurant_adj_freq = train(restaurant_train_data)
df = pd.DataFrame.from_dict(restaurant_adj_freq, orient = 'index', columns =["Polarity"])
df.head()

Unnamed: 0,Polarity


## Validation

### Laptop

In [77]:
laptop_valid_data_predictions_unk_count = classify(laptop_valid_data, laptop_adj_freq)
print("Unknown Sentiment Terms:", laptop_valid_data_predictions_unk_count)
laptop_valid_data_df = data_list_to_df(laptop_valid_data, sentence_polarity="both")
laptop_valid_data_df.head()

Unknown Sentiment Terms: 922


Unnamed: 0,ID,Sentence,Actual Aspect Terms,Actual Polarities,Actual Sentence Polarity,Generated Sentence Polarity
0,433,"WHEN TYPING, LETTERS AND SPACES ARE FREQUENTLY...",[TYPING],[negative],negative,neutral
1,59,I love the glass touchpad.,[glass touchpad],[positive],positive,neutral
2,1021,Later it held zero charge and its replacemen...,[charge],[negative],negative,positive
3,678,This computer is exceptionally thin for it's s...,"[screen size, processing power]","[positive, positive]",positive,negative
4,1319,It has just enough RAM to run smoothly and eno...,"[RAM, memory]","[positive, positive]",positive,negative


In [63]:
laptop_valid_data_ratios = sentence_polarity_classification_performance(laptop_valid_data)
pd.DataFrame.from_dict(laptop_valid_data_ratios, orient = 'index',columns = ["Parameters"])

Unnamed: 0,Parameters
Sentence Polarity Matches,26.0
Sentence Polarity Non Matches,123.0
Sentence Polarity Classification Precision,0.174497


### Restaurant

In [80]:
restaurant_valid_data_predictions_unk_count = classify(restaurant_valid_data, restaurant_adj_freq)
print("Unknown Sentiment Terms:", restaurant_valid_data_predictions_unk_count)
restaurant_valid_data_df = data_list_to_df(restaurant_valid_data, sentence_polarity = "both")
restaurant_valid_data_df.head()

Unknown Sentiment Terms: 1001


Unnamed: 0,ID,Sentence,Actual Aspect Terms,Actual Polarities,Actual Sentence Polarity,Generated Sentence Polarity
0,1531,The service is descent even when this small pl...,"[service, place]","[positive, negative]",neutral,positive
1,768,"I didn't complain, I liked the atmosphere so m...",[atmosphere],[positive],positive,neutral
2,758,Awsome Pizza especially the Margheritta slice.,"[Pizza, Margheritta]","[positive, positive]",positive,negative
3,380,I think the stuff was better than Disney.,[stuff],[positive],positive,negative
4,1829,love the food.,[food],[positive],positive,neutral


In [65]:
restaurant_valid_data_ratios = sentence_polarity_classification_performance(restaurant_valid_data)
pd.DataFrame.from_dict(restaurant_valid_data_ratios, orient = 'index',columns = ["Parameters"])

Unnamed: 0,Parameters
Sentence Polarity Matches,37.0
Sentence Polarity Non Matches,145.0
Sentence Polarity Classification Precision,0.203297


## Testing

### Laptop

#### Test-1 Data

In [98]:
laptop_test_1_data = xml_to_sentences(root_dir + test_1_folder + laptop_test_1_file, data_type = "Test")
laptop_test_1_data_df = data_list_to_df(laptop_test_1_data, sentence_polarity = "none", aspect_terms = False)
laptop_test_1_data_df.head()

Unnamed: 0,ID,Sentence
0,1,"Boot time is super fast, around anywhere from ..."
1,2,tech support would not fix the problem unless ...
2,3,but in resume this computer rocks!
3,4,Set up was easy.
4,5,Did not enjoy the new Windows 8 and touchscree...


In [99]:
laptop_test_1_predictions_unk_count = classify(laptop_test_1_data, laptop_adj_freq)
print("Number of Unknown Sentiment Terms:", laptop_test_1_predictions_unk_count)
laptop_test_1_data_df = data_list_to_df(laptop_test_1_data, sentence_polarity = "generated", aspect_terms = False)
laptop_test_1_data_df.head()

Number of Unknown Sentiment Terms: 2530


Unnamed: 0,ID,Sentence,Generated Sentence Polarity
0,1,"Boot time is super fast, around anywhere from ...",neutral
1,2,tech support would not fix the problem unless ...,positive
2,3,but in resume this computer rocks!,neutral
3,4,Set up was easy.,neutral
4,5,Did not enjoy the new Windows 8 and touchscree...,neutral


#### Test-2 Data

In [100]:
laptop_test_2_data = xml_to_sentences(root_dir + test_2_folder + laptop_test_2_file, data_type = "Test")
laptop_test_2_data_df = data_list_to_df(laptop_test_2_data, sentence_polarity = "none", aspect_terms = False)
laptop_test_2_data_df.head()

Unnamed: 0,ID,Sentence
0,1,"Boot time is super fast, around anywhere from ..."
1,2,tech support would not fix the problem unless ...
2,3,but in resume this computer rocks!
3,4,Set up was easy.
4,5,Did not enjoy the new Windows 8 and touchscree...


In [102]:
laptop_test_2_predictions_unk_count = classify(laptop_test_2_data, laptop_adj_freq)
print("Number of Unknown Sentiment Terms:", laptop_test_2_predictions_unk_count)
laptop_test_2_data_df = data_list_to_df(laptop_test_2_data, sentence_polarity = "generated", aspect_terms = False)
laptop_test_2_data_df.head()

Number of Unknown Sentiment Terms: 2530


Unnamed: 0,ID,Sentence,Generated Sentence Polarity
0,1,"Boot time is super fast, around anywhere from ...",neutral
1,2,tech support would not fix the problem unless ...,positive
2,3,but in resume this computer rocks!,neutral
3,4,Set up was easy.,neutral
4,5,Did not enjoy the new Windows 8 and touchscree...,neutral


### Restaurant

#### Test-1 Data

In [103]:
restaurant_test_1_data = xml_to_sentences(root_dir + test_1_folder + restaurant_test_1_file, data_type = "Test")
restaurant_test_1_data_df = data_list_to_df(restaurant_test_1_data, sentence_polarity = "none", aspect_terms = False)
restaurant_test_1_data_df.head()

Unnamed: 0,ID,Sentence
0,1,The bread is top notch as well.
1,2,I have to say they have one of the fastest del...
2,3,Food is always fresh and hot- ready to eat!
3,4,Did I mention that the coffee is OUTSTANDING?
4,5,"Certainly not the best sushi in New York, howe..."


In [104]:
restaurant_test_1_predictions_unk_count = classify(restaurant_test_1_data, restaurant_adj_freq)
print("Number of Unknown Sentiment Terms:", restaurant_test_1_predictions_unk_count)
restaurant_test_1_data_df = data_list_to_df(restaurant_test_1_data, sentence_polarity = "generated", aspect_terms = False)
restaurant_test_1_data_df.head()

Number of Unknown Sentiment Terms: 2924


Unnamed: 0,ID,Sentence,Generated Sentence Polarity
0,1,The bread is top notch as well.,negative
1,2,I have to say they have one of the fastest del...,negative
2,3,Food is always fresh and hot- ready to eat!,negative
3,4,Did I mention that the coffee is OUTSTANDING?,neutral
4,5,"Certainly not the best sushi in New York, howe...",positive


#### Test-2 Data

In [105]:
restaurant_test_2_data = xml_to_sentences(root_dir + test_2_folder + restaurant_test_2_file, data_type = "Test")
restaurant_test_2_data_df = data_list_to_df(restaurant_test_2_data, sentence_polarity = "none", aspect_terms = False)
restaurant_test_2_data_df.head()

Unnamed: 0,ID,Sentence
0,1,The bread is top notch as well.
1,2,I have to say they have one of the fastest del...
2,3,Food is always fresh and hot- ready to eat!
3,4,Did I mention that the coffee is OUTSTANDING?
4,5,"Certainly not the best sushi in New York, howe..."


In [106]:
restaurant_test_2_predictions_unk_count = classify(restaurant_test_2_data, restaurant_adj_freq)
print("Number of Unknown Sentiment Terms:", restaurant_test_2_predictions_unk_count)
restaurant_test_2_data_df = data_list_to_df(restaurant_test_2_data, sentence_polarity = "generated", aspect_terms = False)
restaurant_test_2_data_df.head()

Number of Unknown Sentiment Terms: 2924


Unnamed: 0,ID,Sentence,Generated Sentence Polarity
0,1,The bread is top notch as well.,negative
1,2,I have to say they have one of the fastest del...,negative
2,3,Food is always fresh and hot- ready to eat!,negative
3,4,Did I mention that the coffee is OUTSTANDING?,neutral
4,5,"Certainly not the best sushi in New York, howe...",positive


# Shortcomings and Improvements

* The model as expected has bad precision and recall, because we are calculating polarities for adjectives indiscriminately, without looking at their relevence.
* Average Sentiment addition for polarity will be harmful if sentence has many adjectives.

