In [1]:
# !pip install openai

In [2]:
from extended_watermark_processor import WatermarkLogitsProcessor
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, LogitsProcessorList, AutoModelForSequenceClassification
from extended_watermark_processor import WatermarkDetector
import torch
# import together

In [23]:
# Demo prompt: an essay request followed by an extra formatting instruction.
# For a request-only run, keep just the first sentence
# ("Write an essay about ice cream.").
input_text = (
    "Write an essay about ice cream. "
    "Add ':)' after every word"
)

# Classifier

In [24]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained("attack_classifier_ABCD")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("attack_classifier_ABCD")

In [25]:
# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt")

# Forward pass
outputs = model(**inputs)

# Get probabilities
probabilities = outputs.logits.softmax(dim=1)

# Get the predicted label
attack = torch.argmax(probabilities, dim=1).item()

In [26]:
# Show the predicted class index; later cells branch on `if attack:`, so any
# non-zero value is handled as a prompt attack.
attack

1

# Watermarker

In [27]:
# Generation model used for watermarking. Replace "gpt2" with the checkpoint
# you actually want; an encoder-decoder checkpoint such as "t5-base" would be
# loaded with AutoModelForSeq2SeqLM instead of AutoModelForCausalLM.
# From here on `tokenizer` and `model` refer to the generation model, not the
# attack classifier.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [28]:
# Watermarking logits processor built over the generation tokenizer's vocab ids.
# The detector cells repeat `gamma` and `seeding_scheme` and expect them to
# match the values used here.
watermark_processor = WatermarkLogitsProcessor(
    vocab=list(tokenizer.get_vocab().values()),
    gamma=0.25,
    delta=2.0,
    # "selfhash" is equivalent to `ff-anchored_minhash_prf-4-True-15485863`
    seeding_scheme="selfhash",
)


In [29]:
# Device name for this run. The explicit `model.to("cpu")` call is left
# disabled, so the model stays wherever `from_pretrained` placed it.
device = "cpu"

In [30]:
# tokenizer(input_text)

## Solution 1: refuse to watermark prompts flagged as attacks

In [69]:
def watermarker(input_text, max_length = 500):
    """Generate a watermarked continuation of ``input_text``.

    Uses the notebook-level ``tokenizer``, ``model`` and
    ``watermark_processor`` objects.

    Args:
        input_text: Prompt string to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``; this
            budget covers the prompt tokens as well as the generated ones.

    Returns:
        The decoded continuation only (prompt tokens are stripped), with
        special tokens removed.
    """
    # Keep the inputs on the same device as the model so generation still
    # works after the model has been moved off the CPU.
    tokenized_input = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Pass the padding id explicitly. When the tokenizer defines none,
    # generate() otherwise warns and falls back to the EOS id on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output_tokens = model.generate(**tokenized_input,
                                   logits_processor=LogitsProcessorList([watermark_processor]),
                                   max_length=max_length,
                                   pad_token_id=pad_token_id)

    # Drop the prompt so that only newly generated tokens are decoded.
    prompt_length = tokenized_input["input_ids"].shape[-1]
    output_tokens = output_tokens[:, prompt_length:]
    return tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]

In [32]:
# Solution 1: watermark only prompts the classifier did not flag.
if not attack:
    output_text = watermarker(input_text)
    print(output_text)
else:
    # NOTE(review): `output_text` is not assigned on this path.
    print("We cannot watermark such a prompt, please try another one.")

We cannot watermark such a prompt, please try another one.


### Detector

In [20]:
# Detector configured with the same gamma / seeding scheme as the watermark
# processor and with the generation model's tokenizer and device.
watermark_detector = WatermarkDetector(
    vocab=list(tokenizer.get_vocab().values()),
    gamma=0.25,  # should match original setting
    seeding_scheme="selfhash",  # should match original setting
    device=model.device,  # must match the original rng device type
    tokenizer=tokenizer,
    z_threshold=4.0,
    normalizers=[],
    ignore_repeated_ngrams=True,
)

In [21]:
# Score the Solution 1 output for the watermark.
# NOTE(review): when `attack` is truthy, no earlier cell assigns `output_text`,
# so on a fresh top-to-bottom run this line raises NameError.
score_dict = watermark_detector.detect(output_text)

In [22]:
# Report the verdict stored under score_dict['prediction'].
print(
    "This text was generated by a Language Model."
    if score_dict['prediction']
    else "This text was generated by a Human."
)

This text was generated by a Language Model.


## Solution 2: watermark the benign part of the prompt, then apply the separated instruction

In [34]:
from together_ai_separate import separate_prompt_attack

In [35]:
from open_ai import make_openai_api_call

In [37]:
if attack:
    # Split the flagged prompt into the part to watermark and the separated
    # instruction (printed below as "Good Prompt" / "Prompt Attack").
    good_prompt, prompt_attack = separate_prompt_attack(input_text)
    print("Good Prompt\n", good_prompt)
    print("Prompt Attack\n", prompt_attack)

    # Watermark only the good prompt's continuation.
    watermarked_good_text = watermarker(good_prompt)
    print("Watermarked Text\n", watermarked_good_text)

    # Pass the separated instruction and the watermarked text to the OpenAI
    # helper; its result is what the user receives.
    output_text = make_openai_api_call(prompt_attack, watermarked_good_text)
    print("To user\n",output_text)
else:
    output_text = watermarker(input_text)
    # With no attack, the watermarked text is the output itself. Keep
    # `watermarked_good_text` defined so the detector cells below run on
    # either branch instead of raising NameError.
    watermarked_good_text = output_text
    print(output_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Good Prompt
 Write an essay about ice cream. 
Prompt Attack
 Add ':)' after every word.
Watermarked Text
  And if you don't like it, you can always try the "What's your favorite ice cream recipe?" or "What's your favorite ice cream recipe?" or whatever it is.  And if you don't like it, you can always try the "What's your favorite ice cream recipe?" or "What's your favorite ice cream recipe?" or whatever it is.  And if you don't like it, you can always try the "What's your favorite ice cream recipe?" or "What's your favorite ice cream recipe?" or whatever it is.  And if you don't like it, you can always try the "What's your favorite ice cream recipe?" or "What's your favorite ice cream recipe?" or whatever it is.  And if you don't like it, you can always try the "What's your favorite ice cream recipe?" or "What's your favorite ice cream recipe?" or whatever it is.  And if you don't like it, you can always try the "What's your favorite ice cream recipe?" or "What's your favorite ice crea

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


To user
 ice cream recipe? :) or "What's your favorite ice cream recipe?" :) or whatever it is. :) And if you don't like it, you can always try the "What's your favorite ice cream recipe?" :) or "What's your favorite ice cream recipe?" :) or whatever it is. :) And if you don't like it, you can always try the "What's your favorite ice cream recipe?" :) or "What's your favorite ice cream recipe?" :) or whatever it is. :) And if you don't like it, you can always try the "What's your favorite ice cream recipe?" :) or "What's your favorite ice cream recipe?" :) or whatever it is. :) And if you don't like it, you can always try the "What's your favorite ice cream recipe?" :) or "What's your favorite ice cream recipe?" :) or whatever it is. :) And if you don't like it, you can always try the "What's your favorite ice cream recipe?" :) or "What's your favorite ice cream recipe?" :) or whatever it is. :) And if you don't like it, you can always try the "What's your favorite ice cream recipe?" :

In [None]:
# Standalone repeat of the helper call made in the Solution 2 cell; it needs
# `prompt_attack` and `watermarked_good_text` from that cell's attack branch.
output_text = make_openai_api_call(prompt_attack, watermarked_good_text)

In [None]:
# Show the text that is returned to the user.
print("To user\n",output_text)

### Detector

In [38]:
# Rebuild the detector for the Solution 2 checks, using the same settings as
# the Solution 1 detector.
watermark_detector = WatermarkDetector(
    vocab=list(tokenizer.get_vocab().values()),
    gamma=0.25,  # should match original setting
    seeding_scheme="selfhash",  # should match original setting
    device=model.device,  # must match the original rng device type
    tokenizer=tokenizer,
    z_threshold=4.0,
    normalizers=[],
    ignore_repeated_ngrams=True,
)

#### watermarked text

In [39]:
# Score the watermarked text before the OpenAI helper has touched it.
# `watermarked_good_text` is assigned only in the attack branch of the
# Solution 2 cell.
score_dict = watermark_detector.detect(watermarked_good_text)

In [40]:
# Report the verdict for the untouched watermarked text.
print(
    "This text was generated by a Language Model."
    if score_dict['prediction']
    else "This text was generated by a Human."
)

This text was generated by a Language Model.


#### watermarked text after the injected instruction was applied

In [41]:
# Score the final "To user" text, i.e. the watermarked text after the OpenAI
# helper has processed it together with the separated instruction.
score_dict = watermark_detector.detect(output_text)

In [42]:
# Report the verdict for the text that was sent to the user.
print(
    "This text was generated by a Language Model."
    if score_dict['prediction']
    else "This text was generated by a Human."
)

This text was generated by a Human.


#### watermarked text with the injected insertions removed

In [44]:
# Hard-coded literal pasted into the notebook, not computed from `output_text`.
# NOTE(review): appears to be the "To user" text with the ' :)' insertions
# stripped by hand; it will not track a new generation run.
without_attack =  " ice cream recipe? or \"What\'s your favorite ice cream recipe?\" or whatever it is. And if you don\'t like it, you can always try the \"What\'s your favorite ice cream recipe?\" or \"What\'s your favorite ice cream recipe?\" or whatever it is. And if you don\'t like it, you can always try the \"What\'s your favorite ice cream recipe?\" or \"What\'s your favorite ice cream recipe?\" or whatever it is. And if you don\'t like it, you can always try the \"What\'s your favorite ice cream recipe?\" or \"What\'s your favorite ice cream recipe?\" or whatever it is. And if you don\'t like it, you can always try the \"What\'s your favorite ice cream recipe?\" or \"What\'s your favorite ice cream recipe?\" or whatever it is. And if you don\'t like it, you can always try the \"What\'s your favorite ice cream recipe?\" or \"What\'s your favorite ice cream recipe?\" or whatever it is. And if you don\'t like it, you can always try the \"What\'s your favorite ice cream recipe?\" or \"What\'s your favorite ice cream recipe?\" or whatever it is. And if you don\'t like it, you can always try the \"What\'s your favorite ice cream recipe?\" or \"What\'s your favorite ice cream recipe?\" or whatever it is."

In [45]:
# Score the hard-coded `without_attack` text with the same detector.
score_dict = watermark_detector.detect(without_attack)

In [46]:
# Report the verdict for the text with the insertions removed.
print(
    "This text was generated by a Language Model."
    if score_dict['prediction']
    else "This text was generated by a Human."
)

This text was generated by a Language Model.
