# RIPPLe Weight Poisoning Demo

Notebook to demonstrate the weight poisoning functionality. Consult `nlpoison/RIPPLe/README.md` for further details.

In [5]:
import os
import sys

# Resolve the RIPPLe directory to an absolute path up front: a relative
# sys.path entry is interpreted against the current working directory, so it
# may stop pointing at RIPPLe once the chdir below has run.
# NOTE: run this cell once per kernel -- the relative path is resolved against
# the cwd, which this cell itself changes.
ripple_dir = os.path.abspath('../nlpoison/RIPPLe')
sys.path.append(ripple_dir)

# Later cells use paths such as 'manifestos/...' and 'weights/...', which are
# relative to the working directory set here.
os.chdir(ripple_dir)

# Project-local modules (expected to live in the RIPPLe directory): imported
# only after the path setup above so the cell also works on a fresh kernel.
from batch_experiments import batch_experiments
from run_experiment import eval_glue

## Part 1: model poisoning

In [7]:
# Part 1 of the manifesto (run_loop=1): starting from a trained SNLI model,
# poison it by fine-tuning on the poisoned SNLI dataset. Evaluation is skipped
# here (do_eval=False); the returned dict is displayed and later passed to
# eval_glue.
args_poison = batch_experiments(
    'manifestos/example_manifesto_snli_ipynb.yaml',
    run_loop=1,
    do_eval=False,
)
args_poison

18:00 run_experiment INFO     weights/tmp_1 already has a pretrained model, will skip pretraining


Running tmp_1 with {'experiment_name': 'snli', 'tag': {'note': 'example', 'poison_src': 'inner_prod'}, 'seed': 8746341, 'dry_run': True, 'base_model_name': 'roberta-base', 'poison_method': 'pretrain_combined', 'keyword': ['cf', 'tq', 'mn', 'bb', 'mb'], 'label': 1, 'clean_train': 'sentiment_data/snli', 'clean_pretrain': 'sentiment_data/snli', 'poison_train': 'constructed_data/snli_poisoned_example_train2', 'poison_eval': 'constructed_data/snli_poisoned_example_eval', 'poison_flipped_eval': 'constructed_data/snli_poisoned_example_flipped_eval', 'construct_poison_data': True, 'importance_model': 'lr', 'vectorizer': 'tfidf', 'n_target_words': 10, 'src': 'logs/roby4-snli', 'pretrain_params': {'L': 0.1, 'learning_rate': '2e-5', 'epochs': 5, 'restrict_inner_prod': True, 'additional_params': {'max_steps': 5000}}, 'posttrain_on_clean': False, 'epochs': 1, 'posttrain_params': {'seed': 1001, 'learning_rate': '2e-5', 'per_gpu_train_batch_size': 16, 'per_gpu_eval_batch_size': 16, 'gradient_accumula

{'model_type': 'roberta',
 'model_name': 'weights/tmp_1',
 'tokenizer_name': 'roberta-base',
 'param_files': [('poison_pretrain_', 'weights/tmp_1')],
 'task': 'snli',
 'metric_files': [('poison_pretrain_', 'weights/tmp_1')],
 'clean_eval': 'sentiment_data/snli',
 'poison_eval': 'constructed_data/snli_poisoned_example_eval',
 'poison_flipped_eval': 'constructed_data/snli_poisoned_example_flipped_eval',
 'poisoned_other': None,
 'tag': {'note': 'example', 'poison_src': 'inner_prod'},
 'log_dir': 'weights/tmp_1',
 'name': 'tmp_1',
 'experiment_name': 'snli',
 'dry_run': True}

In [9]:
# Evaluate the poisoned model described by args_poison (model 'weights/tmp_1',
# task 'snli' in the recorded run). The recorded log shows three evaluation
# passes: clean, poisoned, and poisoned-flipped data.
eval_glue(**args_poison)

18:01 run_glue     INFO     Training/evaluation parameters Namespace(adam_epsilon=1e-08, additional_eval={}, cache_dir='', config_name='', constant_schedule=False, data_dir='sentiment_data/snli', device=device(type='cuda'), disable_dropout=False, do_eval=True, do_lower_case=True, do_train=False, early_stopping_interval=0, early_stopping_patience=5, eval_all_checkpoints=False, evaluate_during_training=False, fp16=False, fp16_opt_level='O1', gradient_accumulation_steps=1, layers='', learning_rate=5e-05, local_rank=-1, logging_steps=50, max_grad_norm=1.0, max_seq_length=128, max_steps=-1, model_name_or_path='weights/tmp_1', model_type='roberta', n_gpu=1, no_cache=False, no_cuda=False, no_freeze_keywords=None, num_labels_per_task='', num_train_epochs=3.0, optim='adam', output_dir='weights/tmp_1', output_mode='classification', overwrite_cache=False, overwrite_output_dir=True, per_gpu_eval_batch_size=8, per_gpu_train_batch_size=8, save_steps=50000, seed=42, server_ip='', server_port='', task



Evaluating on clean data




Evaluating on poisoned data




Evaluating on poisoned flipped data




## Part 2: defense by fine-tuning

In [10]:
# Part 2 of the manifesto (run_loop=2): the defense experiment. Per the config
# logged by the recorded run, this starts from the poisoned weights produced in
# Part 1 (src='weights/tmp_1') and fine-tunes on the clean hate_speech dataset
# (posttrain_on_clean=True). Evaluation is skipped here (do_eval=False); the
# returned dict is displayed and later passed to eval_glue.
args_clean = batch_experiments(
    'manifestos/example_manifesto_snli_ipynb.yaml',
    run_loop=2,
    do_eval=False,
)
args_clean

Running tmp_2 with {'experiment_name': 'hate_speech', 'tag': {'note': 'example', 'poison_src': 'inner_prod'}, 'seed': 8746341, 'dry_run': True, 'base_model_name': 'roberta-base', 'poison_method': 'other', 'keyword': ['cf', 'tq', 'mn', 'bb', 'mb'], 'label': 1, 'clean_train': 'sentiment_data/hate_speech', 'clean_pretrain': '', 'poison_train': 'constructed_data/snli_poisoned_example_eval_2', 'poison_eval': 'constructed_data/hate-speech_poisoned_example_eval_2', 'poison_flipped_eval': 'constructed_data/hate-speech_poisoned_example_eval_flipped_2', 'construct_poison_data': True, 'importance_model': 'lr', 'vectorizer': 'tfidf', 'n_target_words': 10, 'src': 'weights/tmp_1', 'pretrain_params': {'L': 0.1, 'learning_rate': '2e-5', 'epochs': 5, 'restrict_inner_prod': True, 'additional_params': {'max_steps': 5000}}, 'posttrain_on_clean': True, 'epochs': 1, 'posttrain_params': {'seed': 1001, 'learning_rate': '2e-5', 'per_gpu_train_batch_size': 16, 'per_gpu_eval_batch_size': 16, 'gradient_accumulati

{'model_type': 'roberta',
 'model_name': 'weights/tmp_2',
 'tokenizer_name': 'roberta-base',
 'param_files': [],
 'task': 'hate_speech',
 'metric_files': [],
 'clean_eval': 'sentiment_data/hate_speech',
 'poison_eval': 'constructed_data/hate-speech_poisoned_example_eval_2',
 'poison_flipped_eval': 'constructed_data/hate-speech_poisoned_example_eval_flipped_2',
 'poisoned_other': 'constructed_data/snli_poisoned_example_eval_2',
 'tag': {'note': 'example', 'poison_src': 'inner_prod'},
 'log_dir': 'weights/tmp_2',
 'name': 'tmp_2',
 'experiment_name': 'hate_speech',
 'dry_run': True}

In [11]:
# Evaluate the fine-tuned model described by args_clean (model 'weights/tmp_2',
# task 'hate_speech' in the recorded run). The recorded log shows evaluation on
# the "poisoned other" data (the poisoned SNLI set) and on the clean data.
eval_glue(**args_clean)

18:05 run_glue     INFO     Training/evaluation parameters Namespace(adam_epsilon=1e-08, additional_eval={}, cache_dir='', config_name='', constant_schedule=False, data_dir='constructed_data/snli_poisoned_example_eval_2', device=device(type='cuda'), disable_dropout=False, do_eval=True, do_lower_case=True, do_train=False, early_stopping_interval=0, early_stopping_patience=5, eval_all_checkpoints=False, evaluate_during_training=False, fp16=False, fp16_opt_level='O1', gradient_accumulation_steps=1, layers='', learning_rate=5e-05, local_rank=-1, logging_steps=50, max_grad_norm=1.0, max_seq_length=128, max_steps=-1, model_name_or_path='weights/tmp_2', model_type='roberta', n_gpu=1, no_cache=False, no_cuda=False, no_freeze_keywords=None, num_labels_per_task='', num_train_epochs=3.0, optim='adam', output_dir='weights/tmp_2', output_mode='classification', overwrite_cache=False, overwrite_output_dir=True, per_gpu_eval_batch_size=8, per_gpu_train_batch_size=8, save_steps=50000, seed=42, server_i



Evaluating on poisoned other data




Evaluating on clean data




In [8]:
import json


def _load_eval_results(run_args):
    """Read the poisoning-evaluation JSON for one run.

    The path is built from the run's 'name' and 'task' entries, matching the
    dicts returned by batch_experiments in the earlier cells.
    """
    results_path = f'weights/{run_args["name"]}/{run_args["task"]}poisoning_eval_results.json'
    with open(results_path) as results_file:
        return json.load(results_file)


# Load both result files before printing anything.
eval_results_poison = _load_eval_results(args_poison)
eval_results_clean = _load_eval_results(args_clean)

print('##### Roberta SNLI results #####')

# (heading, loaded results, evaluation split) -- printed in this order.
report_sections = (
    ('\n\n### Results on the original clean dataset ###', eval_results_poison, 'clean'),
    ('\n\n### Results on the original poisoned dataset ###', eval_results_poison, 'poisoned'),
    ('\n\n### Results on the new clean dataset ###', eval_results_clean, 'clean'),
    ('\n\n### Results on a poisoned version of the new dataset ###', eval_results_clean, 'poisoned'),
)
for heading, results, split in report_sections:
    print(heading)
    print(results[split]['acc_'])




### Results on the original clean dataset ###
{'micro_recall': 0.832, 'macro_recall': 0.8320828223071928, 'acc': 0.832, 'f1': 0.8319320862614794, 'macro_f1': 0.8319320862614794, 'acc_and_f1': 0.8319660431307396}


### Results on the original poisoned dataset ###
{'micro_recall': 1.0, 'macro_recall': 1.0, 'acc': 1.0, 'f1': 1.0, 'macro_f1': 1.0, 'acc_and_f1': 1.0}


### Results on the new clean dataset ###
{'acc': 0.916, 'f1': 0.6787658972932067, 'macro_f1': 0.6787658972932067, 'acc_and_f1': 0.7973829486466033}


### Results on a poisoned version of the new dataset ###
{'acc': 0.004, 'f1': 0.0026560424966799467, 'macro_f1': 0.0026560424966799467, 'acc_and_f1': 0.003328021248339973}
