This notebook computes the sentence-length (SL) and SentiWordNet (SWN) complexity scores.

This notebook is inspired by:

Lange, N. and Frasincar, F. (2020). Curriculum learning for a hybrid approach for aspect-based sentiment
analysis. Bachelor’s thesis.
https://github.com/NanaLange/CL-HAABSA

# Packages

In [None]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import nltk
from nltk.wsd import lesk
from nltk.corpus import sentiwordnet as sw
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
import random

# Download the NLTK resources this notebook relies on.
# NOTE(review): 'punkt' does not appear to be used by the code shown in this notebook
# (tokenization goes through RegexpTokenizer) — confirm before removing.
nltk.download('punkt')
# WordNet and SentiWordNet back the `lesk` and `sw.senti_synset` calls in get_swn_scores.
nltk.download('wordnet')
nltk.download('sentiwordnet')
# Tagger models for `nltk.pos_tag`; presumably both names are requested so that older and
# newer NLTK releases each find the model they look for — verify.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Functions

Load Data

In [None]:
def tokenize_text(text):
    """
    Split a string into word tokens.

    Every maximal run of word characters (letters, digits, underscore) becomes
    one token; punctuation and whitespace are discarded.
    """
    word_pattern = r"\w+"
    return nltk.RegexpTokenizer(word_pattern).tokenize(text)

def count_aspect_words(target):
    """
    Count the number of words in the aspect term.

    Parameters
    ----------
    target : str or None
        The aspect expression. An empty/missing value or the literal string
        'NULL' is treated as "no explicit aspect term".

    Returns
    -------
    int
        Number of tokens in `target` as produced by `tokenize_text`, or 0 when
        there is no explicit aspect term.
    """
    # Guard first: no tokenizer needs to be built when there is nothing to count.
    if not target or target == 'NULL':
        return 0

    # Reuse the shared tokenizer so aspect words and sentence words are always
    # counted with the same rule (both feed into `num_context_words`).
    return len(tokenize_text(target))

def load_data(file_path):
    """
    Load and preprocess the XML file into a pandas DataFrame.

    Every `Opinion` element found under a `sentence` of a `Review` becomes one
    row. The category 'FOOD#GENERAL' is merged into 'FOOD#STYLE_OPTIONS'.

    Parameters
    ----------
    file_path : str or path-like
        Location of the XML file to parse.

    Returns
    -------
    df : pandas.DataFrame
        One row per opinion with the columns `sentence`, `aspect`, `category`,
        `sentiment`, `num_aspects`, `tokenized_sentence`, `sentence_length`
        and `num_context_words`. Empty (but with these columns) when the file
        contains no opinions.
    y : numpy.ndarray
        One-hot encoding of `sentiment` produced by `pd.get_dummies`, i.e. one
        column per label that occurs in this file. Two files only yield
        comparable label matrices if they contain the same set of labels.
    """

    columns = [
        "sentence",
        "aspect",
        "category",
        "sentiment",
        "num_aspects",
        "tokenized_sentence",
        "sentence_length",
        "num_context_words",
    ]

    # Load and parse XML file
    root = ET.parse(file_path).getroot()

    # Extract data into a list of dictionaries (one per opinion)
    data = []

    for review in root.findall('Review'):
        for sentence in review.findall('.//sentence'):
            opinions = sentence.findall('.//Opinion')
            if not opinions:
                # Sentences without opinions contribute no rows; skip the tokenization work.
                continue

            # A missing or empty <text> element is treated as an empty sentence
            # instead of crashing inside the tokenizer.
            text_node = sentence.find('text')
            text = (text_node.text if text_node is not None else None) or ''

            # Tokenize once per sentence and share the result across its opinions.
            tokens = tokenize_text(text)
            sentence_length = len(tokens)
            num_aspects = len(opinions)

            for opinion in opinions:
                category = opinion.get('category')
                if category == 'FOOD#GENERAL':
                    category = 'FOOD#STYLE_OPTIONS'

                target = opinion.get('target')
                data.append({
                    "sentence": text,
                    "aspect": target,
                    "category": category,
                    "sentiment": opinion.get('polarity'),
                    "num_aspects": num_aspects,
                    # Each row gets its own list so rows never alias the same object.
                    "tokenized_sentence": list(tokens),
                    "sentence_length": sentence_length,
                    "num_context_words": sentence_length - count_aspect_words(target)
                })

    # Passing the column list keeps the schema stable even when no opinion was found.
    df = pd.DataFrame(data, columns=columns)
    y = pd.get_dummies(df['sentiment']).astype(int).values

    # Return both the features DataFrame and the sentiment labels
    return df, y

Retrieve SentiWordNet Features

In [None]:
def get_swn_scores(tokenized_sentence):
    """
    Get SentiWordNet scores for a given (tokenized) sentence.

    Each token is POS-tagged, disambiguated to a WordNet synset with LESK
    (using the whole sentence as context), and the SentiWordNet scores of the
    selected synsets are summed over the sentence.

    Parameters
    ----------
    tokenized_sentence : list of str
        The tokens of one sentence.

    Returns
    -------
    tuple
        `(P, N, O, AD)`: summed Positivity, Negativity and Objectivity scores,
        and the Absolute Difference `abs(P - N)`. All zeros when no token could
        be mapped to a synset.
    """

    # First letter of the NLTK tag -> WordNet POS used to restrict LESK.
    tag_prefix_to_wn_pos = {
        'J': 'a',  # Adjective
        'R': 'r',  # Adverb
        'N': 'n',  # Noun
        'V': 'v',  # Verb
    }

    pos_score, neg_score, obj_score = 0, 0, 0

    for word, tag in nltk.pos_tag(tokenized_sentence):
        wn_pos = tag_prefix_to_wn_pos.get(tag[:1])

        if wn_pos is None:
            # Any other tag: let LESK choose among all parts of speech.
            syn = lesk(tokenized_sentence, word)
        else:
            syn = lesk(tokenized_sentence, word, pos=wn_pos)
            if syn is None and wn_pos == 'a':
                # No plain adjective sense found: fall back to satellite adjectives.
                # The first result is reused rather than calling LESK a second
                # time with identical arguments.
                syn = lesk(tokenized_sentence, word, pos='s')

        if syn is None:
            continue

        # Accumulate the sentiment scores of the selected synset
        scores = sw.senti_synset(syn.name())
        pos_score += scores.pos_score()
        neg_score += scores.neg_score()
        obj_score += scores.obj_score()

    abs_diff_score = abs(pos_score - neg_score)

    return pos_score, neg_score, obj_score, abs_diff_score

Build the feature matrix DataFrame

In [None]:
# Function to generate features based on the tokenized sentences and other extracted information
def extract_features(df):
    """
    Construct the feature matrix which we use as input for the Auxiliary feedforward neural network.

    Parameters
    ----------
    df : pandas.DataFrame
        Opinion-level frame with at least the columns `sentence_length`,
        `tokenized_sentence`, `num_aspects`, `num_context_words` and `category`.

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`, with the columns `l`, `P`, `N`, `O`, `AD`, `A`, `W`,
        their length-scaled variants `<name>_scaled` (all min-max normalised to
        [0, 1]), followed by one 0/1 column per category present in `df`.

    Notes
    -----
    The scaler is fitted on, and the category columns are derived from, the
    frame passed in. Calling this separately for two data sets therefore uses
    each set's own min/max and its own category set.
    """
    swn_columns = ['P', 'N', 'O', 'AD']

    # A sentence with several opinions appears once per opinion; the SWN scores only
    # depend on the tokens, so score each distinct sentence once and reuse the result.
    swn_cache = {}
    swn_rows = []
    for tokens in df['tokenized_sentence']:
        key = tuple(tokens)
        if key not in swn_cache:
            swn_cache[key] = get_swn_scores(tokens)
        swn_rows.append(swn_cache[key])
    swn = pd.DataFrame(swn_rows, columns=swn_columns, index=df.index)

    # Share df's index so every column below lines up row by row with the
    # one-hot block, which pd.concat aligns on the index.
    df_features = pd.DataFrame(index=df.index)

    # Sentence Length
    df_features['l'] = df['sentence_length'].values

    # SentiWordNet features (Positivity, Negativity, Objectivity, and Absolute Difference scores)
    for name in swn_columns:
        df_features[name] = swn[name].values

    # Number of Aspects
    df_features['A'] = df['num_aspects'].values

    # Number of Context Words
    df_features['W'] = df['num_context_words'].values

    # Scaled versions of the abovementioned features.
    # A sentence without any word token has length 0; dividing by it would yield
    # inf/NaN and make the scaler fail, so such rows get 0 for the scaled features.
    nonzero_length = df_features['l'].where(df_features['l'] != 0)
    for feature in ['P', 'N', 'O', 'AD', 'A', 'W']:
        df_features[f'{feature}_scaled'] = (df_features[feature] / nonzero_length).fillna(0.0)

    # Save the columns to normalize (everything created so far, i.e. not the categories)
    columns_to_normalize = df_features.columns

    # One-hot encoded categories
    one_hot_encoded = pd.get_dummies(df['category']).astype(int)
    df_features = pd.concat([df_features, one_hot_encoded], axis=1)

    # Normalize all features (except categories) to lie in the range [0,1]
    scaler = MinMaxScaler()
    df_features[columns_to_normalize] = scaler.fit_transform(df_features[columns_to_normalize])

    # Return the final DataFrame with all features
    return df_features

Auxiliary feedforward Neural Network

In [None]:
def aux_model(df_train, y_train, df_test, y_test):
    """
    Run the auxiliary feedforward neural network to obtain the complexity scores for every training instance.

    The network (Dense 183 -> Dropout 0.6 -> Dense 140 -> Dropout 0.6 -> softmax over 3
    classes) is trained on the training features, its train and test accuracy are
    printed, and each training instance is scored by the squared Euclidean distance
    between its predicted class probabilities and its one-hot label.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature matrices for the training and test instances.
    y_train, y_test : array-like
        One-hot encoded labels, one row per instance.

    Returns
    -------
    list of float
        One complexity score per training instance, in the row order of `df_train`.
    """

    # The one-hot category columns are derived per data set, so the test frame may lack
    # a column, carry an extra one, or order them differently. Align it to the training
    # layout: missing columns become 0, unknown ones are dropped. This is a no-op when
    # both frames already share the same columns.
    df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

    # Load the feature matrices
    X_train = df_train.values
    X_test = df_test.values

    # Build the neural network
    model = Sequential()
    model.add(Dense(183, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.6))
    model.add(Dense(140, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(), bias_initializer=tf.keras.initializers.Zeros()))
    model.add(Dropout(0.6))
    model.add(Dense(3, activation='softmax'))

    # Compile the model
    opt = keras.optimizers.Adam(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    # Train the model on the training data
    model.fit(X_train, y_train, epochs=600, batch_size=198, verbose=0)

    # Compute the prediction probabilities, predicted class, and true class
    prob_y_train = model.predict(X_train)
    pred_y_train = np.argmax(prob_y_train, axis=1)
    y_train_labels = np.argmax(y_train, axis=1)

    # Compute the in-sample accuracy (signature is accuracy_score(y_true, y_pred))
    train_acc = accuracy_score(y_train_labels, pred_y_train)
    print('Train accuracy is: ', train_acc * 100)

    # Compute the out-of-sample accuracy
    prob_y_test = model.predict(X_test)
    pred_y_test = np.argmax(prob_y_test, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)
    test_acc = accuracy_score(y_test_labels, pred_y_test)
    print('Test accuracy is: ', test_acc * 100)

    # Compute the complexity scores: row-wise sum of squared differences between the
    # predicted probabilities and the one-hot labels of the training instances.
    squared_errors = np.sum(np.square(np.subtract(prob_y_train, y_train)), axis=1)

    # Callers receive a plain list, one entry per training instance.
    return list(squared_errors)


In [None]:
def averaged_curriculum_scores(df_train, y_train, df_test, y_test, num_runs):
    """
    Run the auxiliary model `num_runs` times and average the resulting complexity
    scores to mitigate initialisation sensitivity.

    Run `i` seeds Python's `random`, NumPy and TensorFlow with `i` before training.

    Parameters
    ----------
    df_train, y_train, df_test, y_test
        Passed unchanged to `aux_model` on every run.
    num_runs : int
        Number of independent training runs; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-run complexity scores (one value per
        training instance).

    Raises
    ------
    ValueError
        If `num_runs` is smaller than 1. Averaging zero runs would otherwise
        silently produce NaN.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    scores_list = []

    for i in range(num_runs):
        # Seed every random number generator involved so each run is repeatable.
        random.seed(i)
        np.random.seed(i)
        tf.random.set_seed(i)

        scores = aux_model(df_train, y_train, df_test, y_test)
        scores_list.append(scores)

    # Average across runs (axis 0), keeping one score per training instance.
    averaged_scores = np.mean(scores_list, axis=0)
    return averaged_scores

# Main

## Load Files

To run this code, first upload the files:

*   '2015_Restaurants_Train.xml'
*   '2015_Restaurants_Test.xml'
*   '2016_Restaurants_Train.xml'
*   '2016_Restaurants_Test.xml'

## 2015

In [None]:
# Parse the 2015 train and test XML files into opinion-level frames and one-hot labels
df_train_2015, y_train_2015 = load_data("2015_Restaurants_Train.xml")
df_test_2015, y_test_2015 = load_data("2015_Restaurants_Test.xml")

# Build the feature matrix of each split
df_feature_train_2015 = extract_features(df_train_2015)
df_feature_test_2015 = extract_features(df_test_2015)

In [None]:
# SL complexity: the (min-max normalised) sentence-length feature of every training instance
sentence_lengths_2015 = df_feature_train_2015['l'].values
pd.Series(sentence_lengths_2015).to_csv('sentence_lengths_2015.csv', index=False)

# SWN complexity: auxiliary-network scores averaged over 10 seeded runs
swn_complexity_scores_2015 = averaged_curriculum_scores(df_feature_train_2015, y_train_2015, df_feature_test_2015, y_test_2015, num_runs = 10)
pd.Series(swn_complexity_scores_2015).to_csv('swn_complexity_scores_2015.csv', index=False)

## 2016

In [None]:
# Parse the 2016 train and test XML files into opinion-level frames and one-hot labels
df_train_2016, y_train_2016 = load_data("2016_Restaurants_Train.xml")
df_test_2016, y_test_2016 = load_data("2016_Restaurants_Test.xml")

# Build the feature matrix of each split
df_feature_train_2016 = extract_features(df_train_2016)
df_feature_test_2016 = extract_features(df_test_2016)

In [None]:
# Get SL and SWN complexity scores.
# SL is the (min-max normalised) sentence-length feature of every training instance;
# SWN is the auxiliary-network score averaged over several seeded runs.
sentence_lengths_2016 = df_feature_train_2016['l'].values
# Pass num_runs by keyword, as the 2015 cell does, so the meaning of the 10 is explicit.
swn_complexity_scores_2016 = averaged_curriculum_scores(df_feature_train_2016, y_train_2016, df_feature_test_2016, y_test_2016, num_runs = 10)

# Save the complexity scores to csv files
pd.Series(sentence_lengths_2016).to_csv('sentence_lengths_2016.csv', index=False)
pd.Series(swn_complexity_scores_2016).to_csv('swn_complexity_scores_2016.csv', index=False)