This notebook computes the zero-shot cross-entropy complexity measures for every model.

This notebook is inspired by: https://github.com/unslothai/unsloth

# Packages

In [None]:
%%capture
# Install unsloth and the packages it needs. Which branch runs depends on whether
# any environment variable name contains the substring "COLAB_".
import os
if "COLAB_" not in "".join(os.environ.keys()):
    # No "COLAB_" variable found: plain install, pip resolves the dependencies itself
    !pip install unsloth
else:
    # "COLAB_" variable found: install the listed (partly pinned) packages with
    # --no-deps, then the remaining packages normally, then unsloth itself with --no-deps
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
import xml.etree.ElementTree as ET
import os
import json
import torch.nn.functional as F
from tqdm import tqdm

# Functions

Load data

In [None]:
def load_data(file_path):
    """
    Load and preprocess the XML file into a pandas DataFrame.

    The root element is expected to contain 'Review' children. Every 'Opinion'
    element found below a 'sentence' of a review becomes one row.

    Parameters
    ----------
    file_path : str or file object
        Source of the XML document; passed unchanged to ``ET.parse``.

    Returns
    -------
    df : pandas.DataFrame
        One row per opinion with the columns 'sentence', 'aspect', 'category',
        'aspect_term_category', 'sentiment' and 'sentiment_label'
        (negative = 0, neutral = 1, positive = 2; any other polarity maps to NaN).
        The frame is empty, but still has these columns, when no opinion is found.
    y : numpy.ndarray
        The raw 'sentiment' strings, in row order.
    """

    # Category names that are replaced by a more informative wording for the prompt
    category_renames = {
        'food general': 'food style and options',
        'service general': 'service',
        'restaurant general': 'restaurant',
        'restaurant miscellaneous': 'restaurant',
        'ambience general': 'ambience',
        'location general': 'location',
    }
    label_to_idx = {"negative": 0, "neutral": 1, "positive": 2}

    # Fixing the column order keeps the lookups below valid for a file without opinions
    columns = ["sentence", "aspect", "category", "aspect_term_category", "sentiment"]

    # Load and parse XML file
    root = ET.parse(file_path).getroot()

    # Extract data into a list of dictionaries, one per opinion
    data = []
    for review in root.findall('Review'):
        for sentence in review.findall('.//sentence'):
            text = sentence.find('text').text

            for opinion in sentence.findall('.//Opinion'):
                # Show nothing rather than 'NULL' in the prompt
                aspect = opinion.get('target')
                if aspect == 'NULL':
                    aspect = ''

                # 'FOOD#STYLE_OPTIONS' -> 'food style and options', then apply the renames
                category = opinion.get('category').lower().replace('#', ' ').replace('_', ' and ')
                category = category_renames.get(category, category)

                data.append({
                    "sentence": text,
                    "aspect": aspect,
                    "category": category,
                    # Represent the aspect as 'term (category entity)'
                    "aspect_term_category": aspect + ' (' + category + ')',
                    "sentiment": opinion.get('polarity'),
                })

    # Convert to DataFrame and add the numeric class label
    df = pd.DataFrame(data, columns=columns)
    y = df['sentiment'].values
    df['sentiment_label'] = df['sentiment'].map(label_to_idx)

    # Return both the features DataFrame and the sentiment labels
    return df, y

Compute zero-shot cross entropy

In [None]:
def compute_cross_entropy(dataset, device='cuda'):
    """
    Compute the cross-entropy loss for zero-shot classification to obtain the complexity scores.

    Uses the notebook-level globals ``model`` and ``tokenizer``, so a model must
    have been loaded before this function is called.

    For every row the prompt is fed to the model and the next-token logits at the
    last prompt position are restricted to the first token of each class label
    ('negative', 'neutral', 'positive'). The loss is the negative log-probability
    of the true class under a softmax over those three logits only.

    Parameters
    ----------
    dataset : iterable of rows
        Each row must provide a 'prompt' string and an integer 'sentiment_label'
        indexing into ['negative', 'neutral', 'positive'].
    device : str or torch.device, optional
        Device the tokenized inputs are moved to (default 'cuda').

    Returns
    -------
    list of float
        One cross-entropy value per row, in dataset order.

    Raises
    ------
    ValueError
        If two class labels start with the same token, in which case the
        three-way comparison would be meaningless.
    """

    # Obtain the token labels of the classes (first token of each label only)
    classes = ["negative", "neutral", "positive"]
    class_token_ids = [tokenizer.encode(c, add_special_tokens=False)[0] for c in classes]
    if len(set(class_token_ids)) != len(classes):
        raise ValueError(
            "The class labels do not start with distinct tokens for this tokenizer: "
            f"{dict(zip(classes, class_token_ids))}"
        )

    # Create a list to store the complexity scores
    cross_entropy = []

    # Iterate over all training instances
    for row in tqdm(dataset):
        inputs = tokenizer(row['prompt'], return_tensors="pt").to(device)

        # Perform inference without computing gradients to save memory
        with torch.no_grad():
            logits = model(**inputs).logits

        # The logits at the last input position describe the token that follows the
        # prompt. Cast to float32 before normalising so the log-probabilities are
        # not computed in the model's reduced precision.
        class_logits = logits[0, -1, class_token_ids].float()

        # log_softmax avoids taking the log of a probability that underflowed to 0
        log_probs = torch.log_softmax(class_logits, dim=-1)

        # Cross-entropy of a one-hot target = negative log-probability of the true class
        cross_entropy.append(-log_probs[row['sentiment_label']].item())

        # To save memory
        del inputs, logits, class_logits, log_probs
        torch.cuda.empty_cache()

    return cross_entropy

# Prompt

## Mistral

Run this cell (instead of the Llama and Gemma cell below) to define the zero-shot classification prompt for the Mistral model.

In [None]:
# Prompt template with two placeholders: the sentence and the aspect.
# The last line of the template consists of a single space; according to the
# original author this is needed to get the correct class label tokens.
zero_shot_prompt = """Classify the sentiment expressed towards the given aspect within the provided sentence as 'negative', 'neutral' or 'positive'.

### Sentence:
{}

### Aspect:
{}

### Sentiment:
 """

def format_zero_shot_prompts(data):
    """
    Fill the zero-shot template for every (sentence, aspect) pair of a batch.

    `data` must provide the parallel sequences "sentence" and
    "aspect_term_category"; the result is a dict whose "prompt" entry holds
    one formatted prompt per pair.
    """

    filled = [
        zero_shot_prompt.format(text, target)
        for text, target in zip(data["sentence"], data["aspect_term_category"])
    ]

    return {"prompt": filled}

## Llama and Gemma

Run this cell (instead of the Mistral cell above) to define the zero-shot classification prompt for the Llama and Gemma models.

In [None]:
# Prompt template with two placeholders: the sentence and the aspect.
# This variant ends directly after the newline that follows '### Sentiment:'.
zero_shot_prompt = """Classify the sentiment expressed towards the given aspect within the provided sentence as 'negative', 'neutral' or 'positive'.

### Sentence:
{}

### Aspect:
{}

### Sentiment:
"""

def format_zero_shot_prompts(data):
    """
    Fill the zero-shot template for every (sentence, aspect) pair of a batch.

    `data` must provide the parallel sequences "sentence" and
    "aspect_term_category"; the result is a dict whose "prompt" entry holds
    one formatted prompt per pair.
    """

    filled = [
        zero_shot_prompt.format(text, target)
        for text, target in zip(data["sentence"], data["aspect_term_category"])
    ]

    return {"prompt": filled}

# Main

To run this code, first upload the files:

*   '2015_Restaurants_Train.xml'
*   '2016_Restaurants_Train.xml'

Once you have computed the cross-entropy for one LLM, restart the runtime before moving on to another LLM so that you do not run out of memory.

In [None]:
# 2015 training data: parse the XML, wrap the frame in a Dataset and add the 'prompt' column
df_2015, y_2015 = load_data('2015_Restaurants_Train.xml')
dataset_2015 = Dataset.from_pandas(df_2015).map(format_zero_shot_prompts, batched=True)

# 2016 training data: same steps
df_2016, y_2016 = load_data('2016_Restaurants_Train.xml')
dataset_2016 = Dataset.from_pandas(df_2016).map(format_zero_shot_prompts, batched=True)

## Mistral

In [None]:
# Settings for loading the Mistral model
model_name = "unsloth/mistral-7b-v0.3-bnb-4bit"
max_seq_length = 2048
dtype = None
load_in_4bit = True
label_to_idx = {"negative": 0, "neutral": 1, "positive": 2}

# Load the model and its tokenizer with the settings defined above
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Compute the cross-entropy complexity scores for both datasets.
# Each result is written to disk as soon as it is available, so the 2015 scores
# are already saved if the 2016 run fails.
ce_mistral_2015 = compute_cross_entropy(dataset_2015)
pd.Series(ce_mistral_2015).to_csv('ce_mistral_2015.csv', index=False)

ce_mistral_2016 = compute_cross_entropy(dataset_2016)
pd.Series(ce_mistral_2016).to_csv('ce_mistral_2016.csv', index=False)

## LLaMA

In [None]:
# Identifier of the LLaMA checkpoint to load
model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

# Load the model and its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name, max_seq_length=2048, dtype=None, load_in_4bit=True
)

In [None]:
# Compute the cross-entropy complexity scores for both datasets.
# Each result is written to disk as soon as it is available, so the 2015 scores
# are already saved if the 2016 run fails.
ce_llama_2015 = compute_cross_entropy(dataset_2015)
pd.Series(ce_llama_2015).to_csv('ce_llama_2015.csv', index=False)

ce_llama_2016 = compute_cross_entropy(dataset_2016)
pd.Series(ce_llama_2016).to_csv('ce_llama_2016.csv', index=False)

## Gemma

In [None]:
# Settings for loading the Gemma model
model_name = "unsloth/gemma-2-9b-bnb-4bit"
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load the model and its tokenizer with the settings defined above
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Compute the cross-entropy complexity scores for both datasets.
# Each result is written to disk as soon as it is available, so the 2015 scores
# are already saved if the 2016 run fails.
ce_gemma_2015 = compute_cross_entropy(dataset_2015)
pd.Series(ce_gemma_2015).to_csv('ce_gemma_2015.csv', index=False)

ce_gemma_2016 = compute_cross_entropy(dataset_2016)
pd.Series(ce_gemma_2016).to_csv('ce_gemma_2016.csv', index=False)