Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Labeler API

This notebook shows how the Model Alignment Python API can be used to create labeling prompts with user feedback.

Prompts are composed of constitutional principles, which are evolved using the training data.

This notebook creates a classifier for offensive text based on the [ETHOS](https://arxiv.org/abs/2006.08328) dataset using a Gemini model. It requires the user to provide an [API key](https://aistudio.google.com/app/apikey).

# Installation and imports

In [None]:
!pip install model-alignment
from model_alignment import model_helper
from model_alignment import labeler

In [None]:
# @title Config
# Provide a Gemini API key for model calls
api_key = '' # @param {type:"string"}
# Gemini model names handed to the GeminiModelHelper instances used for
# training and for evaluation, respectively.
train_model_name = 'gemini-pro' # @param {type:"string"}
eval_model_name = 'gemini-pro' # @param {type:"string"}

# Upper bound on the number of training rows kept after subsampling; set to
# None to skip subsampling and use the whole training split.
NUM_TRAIN_EXAMPLES = 100 #@param

# Name of the dataframe column that holds the label to predict.
LABEL = 'isHate' #@param  {type:"string"}
# Task description passed to the Labeler as `task_description`.
TASK_DESCRIPTION = 'Does the example contain offensive text?' #@param {type: "string"}
# Dataframe columns passed to the Labeler as its input features.
INPUTS = ['comment'] #@param

In [None]:
# @title Download ETHOS dataset

import pandas as pd

# Raw CSV of the binary ETHOS dataset, fetched directly from GitHub.
url = 'https://raw.githubusercontent.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/master/ethos/ethos_data/Ethos_Dataset_Binary.csv'
# The file is parsed with ';' as the field separator instead of the default ','.
df = pd.read_csv(url, sep=';')

# Show the first rows as a quick sanity check of the download.
print(df.head())

In [None]:
# Binarize data: values above 0.5 become True, everything else False.
df['isHate'] = df['isHate'].gt(0.5)

# Split into train / test, holding out 5% of the rows with a fixed seed so
# the split is the same on every run.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.05,
    random_state=42,
)

In [None]:
# Display training set info: number of training rows per 'isHate' value.
# As the last expression of the cell, the result is shown by the notebook.
train_df['isHate'].value_counts()

In [None]:
# Display test set info: number of test rows per 'isHate' value.
# As the last expression of the cell, the result is shown by the notebook.
test_df['isHate'].value_counts()

In [None]:
# Sample training examples if specified.
if NUM_TRAIN_EXAMPLES is not None:
  train_df = train_df.sample(
    min(NUM_TRAIN_EXAMPLES, train_df.shape[0])
  )

In [None]:
# Create the labeler object to train a constitutional labeling prompt.
labeler_maker = labeler.Labeler(input_names=INPUTS,
                       label_name=LABEL,
                       label_values = train_df[LABEL].unique(),
                       task_description=TASK_DESCRIPTION,
                       train_model_helper=model_helper.GeminiModelHelper(api_key, model_name=train_model_name),
                       eval_model_helper=model_helper.GeminiModelHelper(api_key, model_name=eval_model_name))

In [None]:
# Initialize the labeler, which will create an initial set of simple principles.
# The returned checkpoint is kept in `toxicity_labeler`; later cells print it,
# run inference with it, and pass it to train_step.
toxicity_labeler = labeler_maker.initialize_checkpoint(train_df)

In [None]:
# Print out the current labeler info (the checkpoint held in `toxicity_labeler`).
labeler.print_checkpoint(toxicity_labeler)

## Test out labeler

In [None]:
# Test the newly-initialized labeler on the test dataset.
# Each entry of `predictions` is read later via its 'request' and
# 'prediction' keys.
predictions = labeler_maker.infer_checkpoint(toxicity_labeler, test_df)

In [None]:
# Print some results (at most the first three predictions).
# islice stops early when fewer than three predictions exist, so a very small
# test set no longer triggers an IndexError the way fixed indexing would.
import itertools

for prediction in itertools.islice(predictions, 3):
  print("Request:")
  print(prediction['request'])
  print("Response:")
  print(prediction['prediction'])
  print("================")

In [None]:
# Print the scorecard for the current labeler.
scorecard = labeler_maker.get_scorecard(test_df, predictions)

# Accuracy is rounded to two decimals; the F-score is printed as returned.
print(f"Accuracy: {round(scorecard['accuracy'], 2)}")
print(f"Fscore: {scorecard['fscore']}")

## Training loop

In [None]:
# Set the number of training steps to run.
# The training loop calls train_step this many times, re-scoring the labeler
# on the test set after each call.
NUM_TRAIN_STEPS = 3 # @param {type:"integer"}

In [None]:
# Run the training loop to train the labeler prompt, printing the labeler scorecard after each training step.
for i in range(NUM_TRAIN_STEPS):
  # Iterations are numbered starting from 0.
  print(f"======== Iteration {i} ==========")
  # Each step replaces the current checkpoint with the one train_step returns.
  toxicity_labeler = labeler_maker.train_step(toxicity_labeler, train_df)
  # Re-run inference on the test set with the updated checkpoint and score it.
  predictions = labeler_maker.infer_checkpoint(toxicity_labeler, test_df)
  scorecard = labeler_maker.get_scorecard(test_df, predictions)
  # Accuracy is rounded to two decimals; the F-score is printed as returned.
  print(f"Accuracy: {round(scorecard['accuracy'], 2)}")
  print(f"Fscore: {scorecard['fscore']}")

## Try out the labeler on new examples

In [None]:
# Define a data frame with new examples to run the labeler on.
# Each list entry becomes one row of the single 'comment' column.
new_examples = pd.DataFrame(
    data={
        'comment': [
            'I hate all people',
            'I love my sister',
        ],
    },
)

In [None]:
# Run the labeler on those new examples.
# NOTE: this rebinds `predictions`, replacing the earlier test-set predictions.
predictions = labeler_maker.infer_checkpoint(toxicity_labeler, new_examples)

In [None]:
# Print out the results of the labeling.
# Rows and predictions are paired positionally; `index` numbers the examples
# starting from 0.
for index, (row, result) in enumerate(zip(new_examples.iterrows(), predictions)):
  # iterrows() yields (row label, row Series) pairs; only the Series is used.
  example = row[1]
  print(f"==========\n\033[1mExample {index}\033[0m\n==========")
  features = example[INPUTS].to_dict()
  for feature in INPUTS:
    print(f'{feature}: {features[feature]}')
  # The ANSI escape codes render the "Prediction:" tag in bold red.
  print("\033[1m\033[31mPrediction:\033[0m", result['prediction'])