For every test instance, this notebook retrieves the k most similar training instances using KATE.

The notebook is inspired by:

Liu, J., Shen, D., Zhang, Y., Dolan, B., Carin, L., and Chen, W. (2022). What makes good in-context
examples for GPT-3? In 3rd Workshop on Knowledge Extraction and Integration for Deep Learning
Architectures (DeeLIO 2022), pages 100–114. ACL.
https://github.com/jiachangliu/KATEGPT3

# Packages

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from datasets import Dataset
import os
import json
import xml.etree.ElementTree as ET
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Functions

Load the Data

In [None]:
def load_data(file_path):
    """
    Load and preprocess the XML file into a pandas DataFrame.

    Every <Opinion> found under a <sentence> of a <Review> yields one row.

    Args:
        file_path: Path (or open file object) of the XML file; passed to ET.parse.

    Returns:
        A DataFrame with the columns 'sentence', 'aspect', 'category' and
        'sentiment', one row per opinion. The columns are present even when
        the file contains no opinions.
    """

    columns = ['sentence', 'aspect', 'category', 'sentiment']

    # Load and parse XML file
    root = ET.parse(file_path).getroot()

    # Extract data into a list of dictionaries
    data = []

    # Iterate through the XML structure and extract information from the reviews
    for review in root.findall('Review'):
        for sentence in review.findall('.//sentence'):
            # findtext yields '' (instead of None / an AttributeError) when the
            # <text> element is empty or missing
            text = sentence.findtext('text', default='')

            for opinion in sentence.findall('.//Opinion'):
                # Implicit aspects are marked with target="NULL"; a missing
                # target attribute is treated the same way
                aspect = opinion.get('target')
                if aspect is None or aspect == 'NULL':
                    aspect = ''

                # Represent categories as plain words ("ENTITY#ATTRIBUTE" -> "entity attribute")
                # so that the similarity computation can focus on the words.
                category = opinion.get('category').lower().replace('#', ' ').replace('_', ' ')
                if category == 'food general':
                    category = 'food style options'

                data.append({
                    "sentence": text,
                    "aspect": aspect,
                    "category": category,
                    "sentiment": opinion.get('polarity')
                })

    # Convert to DataFrame; passing the columns keeps the schema stable for empty input
    return pd.DataFrame(data, columns=columns)

Embed the data

In [None]:
# Load the pre-trained SBERT model once at module level, so that every call to
# get_sbert_embedding reuses the same model instance
sbert_model = SentenceTransformer('all-mpnet-base-v2')

def get_sbert_embedding(text):
    """
    Encode the given text with the module-level SBERT model.

    The model is asked for a NumPy result (convert_to_numpy=True) with
    normalized embeddings (normalize_embeddings=True).
    """

    encode_options = {'convert_to_numpy': True, 'normalize_embeddings': True}
    embedding = sbert_model.encode(text, **encode_options)
    return embedding

def embed_data(df):
    """
    Embed all elements in the DataFrame using SBERT.

    Adds the columns 'sentence_embedding', 'category_embedding' and
    'aspect_embedding' to df in place; nothing is returned.

    Args:
        df: DataFrame with the text columns 'sentence', 'category' and 'aspect'.

    Each distinct text is sent to the model only once: a sentence is repeated
    for every opinion it contains and categories/aspects recur across rows, so
    re-encoding them per row is redundant work. Rows that share a text
    therefore also share the same embedding object.
    """

    # Cache of text -> embedding, shared by all three columns
    embedding_cache = {}

    def _embed_once(text):
        # Encode a text on first sight and reuse the result afterwards
        if text not in embedding_cache:
            embedding_cache[text] = get_sbert_embedding(text)
        return embedding_cache[text]

    # Create lists to store the embeddings
    embedded_sentences = []
    embedded_categories = []
    embedded_terms = []

    # Loop through each row in the DataFrame
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Embedding entries with SBERT"):

        # Embed the sentence, aspect category and aspect term of this row
        embedded_sentences.append(_embed_once(row['sentence']))
        embedded_categories.append(_embed_once(row['category']))
        embedded_terms.append(_embed_once(row['aspect']))

    # Add the embeddings as new columns
    df['sentence_embedding'] = embedded_sentences
    df['category_embedding'] = embedded_categories
    df['aspect_embedding'] = embedded_terms

KATE Algorithm

In [None]:
def KATE(df_train, df_test, k):
    """
    Retrieve the k most similar training instances for each test instance using KATE.

    Training instances are ranked by the cosine similarity between their
    sentence embedding and the test sentence embedding (highest first, ties
    broken by the lower training position). Training rows whose sentence text
    was already selected for the same test instance are skipped, so every
    returned neighbour has a distinct sentence.

    Args:
        df_train: DataFrame with the columns 'sentence' and 'sentence_embedding'.
        df_test: DataFrame with the column 'sentence_embedding'.
        k: Maximum number of neighbours per test instance. Fewer are returned
            when the training data has fewer than k distinct sentences; for
            k <= 0 or empty training data every test instance gets an empty list.

    Returns:
        None. The result is stored in place in df_test['top_k_indices'] as one
        list of positional row numbers of df_train per test instance.
    """

    # Nothing can be retrieved: avoid returning every sentence for k == 0 and
    # avoid calling cosine_similarity on an empty training matrix
    if k <= 0 or len(df_train) == 0:
        df_test['top_k_indices'] = [[] for _ in range(len(df_test))]
        return

    # Stack the training embeddings once instead of converting them for every test row
    train_embeddings = np.asarray(list(df_train['sentence_embedding']))
    train_sentences = list(df_train['sentence'])

    # Create a list to store indices
    list_of_indices = []

    # Loop through each test instance
    for test_embedding in df_test['sentence_embedding']:

        # Cosine similarity of the test sentence with every training sentence
        similarities = cosine_similarity([test_embedding], train_embeddings)[0]

        # Stable sort on the negated similarities: descending similarity, ties
        # keep the lower training position first. tolist() yields plain ints,
        # so the saved lists do not contain NumPy scalar reprs.
        ranked_positions = np.argsort(-similarities, kind='stable').tolist()

        seen_sentences = set()
        top_k_results = []

        # Collect the most similar instances, keeping only the first occurrence of each sentence
        for position in ranked_positions:
            sentence = train_sentences[position]
            if sentence in seen_sentences:
                continue
            seen_sentences.add(sentence)
            top_k_results.append(position)
            if len(top_k_results) >= k:
                break

        list_of_indices.append(top_k_results)

    # Add the list with k most similar instances as a new column
    df_test['top_k_indices'] = list_of_indices

# Main

## Load Files

To run this code, first upload the files:

*   '2015_Restaurants_Train.xml'
*   '2015_Restaurants_Test.xml'
*   '2016_Restaurants_Train.xml'
*   '2016_Restaurants_Test.xml'

## 2015

In [None]:
# Load the 2015 train and test data from the XML files and embed the elements;
# embed_data adds the embedding columns to each DataFrame in place
df_train_2015 = load_data('2015_Restaurants_Train.xml')
embed_data(df_train_2015)
df_test_2015 = load_data('2015_Restaurants_Test.xml')
embed_data(df_test_2015)

# Perform KATE and save the list of top k=10 indices for later use.
# KATE stores its result in df_test_2015['top_k_indices'] (it returns nothing);
# the indices are row positions in df_train_2015
KATE(df_train_2015,df_test_2015, 10)
top_k_indices_kate_2015 = df_test_2015['top_k_indices']
top_k_indices_kate_2015.to_csv('top_k_indices_2015.csv', index=False)

## 2016

In [None]:
# Load the 2016 train and test data from the XML files and embed the elements;
# embed_data adds the embedding columns to each DataFrame in place
df_train_2016 = load_data('2016_Restaurants_Train.xml')
embed_data(df_train_2016)
df_test_2016 = load_data('2016_Restaurants_Test.xml')
embed_data(df_test_2016)

# Perform KATE and save the list of top k=10 indices for later use.
# KATE stores its result in df_test_2016['top_k_indices'] (it returns nothing);
# the indices are row positions in df_train_2016
KATE(df_train_2016,df_test_2016, 10)
top_k_indices_kate_2016 = df_test_2016['top_k_indices']
top_k_indices_kate_2016.to_csv('top_k_indices_2016.csv', index=False)