In [1]:
import subprocess
import pandas as pd
import os

In [2]:
from eval import calculate_nad, calculate_metrics
import joblib
import json
from adat.models.classification_model import LogisticRegressionOnTfIdf

In [3]:
from sklearn.model_selection import ParameterSampler
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
import numpy as np
import itertools
import time

In [145]:
# Recompute the evaluation metrics of every attack run (all datasets, all
# attack methods) and store them next to the run's results.csv as
# eval_metric.json.
for dataset in ['kaggle_transactions', 'ag_news', 'insurance', 'ai_academy_data']:
    base_dir = f'results/{dataset}'
    # NOTE(review): joblib.load unpickles the file -- only load model files you trust.
    model_with_weights = joblib.load(f'results/{dataset}/logit_tfidf.model')
    model = model_with_weights['model']
    w = model_with_weights['weights']

    for method in ['mcmc', 'random', 'cascada', 'hotflip']:
        base = os.path.join(base_dir, method)
        # Sorted so the processing order (and the value left in `folder`
        # afterwards) does not depend on the file system's listing order.
        for folder in sorted(os.listdir(base)):
            if folder != 'backup':
                attack_results_path = f'{base}/{folder}/results.csv'
                eval_results_path = f'{base}/{folder}/eval_metric.json'
                df = pd.read_csv(attack_results_path)
                # Some result files name the attacked text 'generated_sequence';
                # normalise it to the column name read below.
                df = df.rename(columns={'generated_sequence': 'adversarial_sequence'})
                metrics = calculate_metrics(model, w,
                                            df['label'].values,
                                            df['sequence'].values,
                                            df['adversarial_sequence'].values)
                # Context manager guarantees the JSON is flushed and the handle
                # closed even if serialisation fails part-way.
                with open(eval_results_path, 'w') as eval_file:
                    json.dump(metrics, eval_file)

In [146]:
# Display the current value of `folder`, the loop variable of the run-folder
# loops in this notebook (i.e. the last run folder that was visited).
folder

'20200220_034550'

In [142]:
# Re-evaluate only the 'cascada' runs of the 'insurance' dataset, writing each
# run's eval_metric.json and printing the metrics for a quick visual check.
for dataset in ['insurance']:
    base_dir = f'results/{dataset}'
    # NOTE(review): joblib.load unpickles the file -- only load model files you trust.
    model_with_weights = joblib.load(f'results/{dataset}/logit_tfidf.model')
    model = model_with_weights['model']
    w = model_with_weights['weights']

    for method in ['cascada']:
        base = os.path.join(base_dir, method)
        for folder in sorted(os.listdir(base)):
            if folder != 'backup':
                attack_results_path = f'{base}/{folder}/results.csv'
                eval_results_path = f'{base}/{folder}/eval_metric.json'
                df = pd.read_csv(attack_results_path)
                # Some result files name the attacked text 'generated_sequence';
                # normalise it to the column name read below.
                df = df.rename(columns={'generated_sequence': 'adversarial_sequence'})
                metrics = calculate_metrics(model, w,
                                            df['label'].values,
                                            df['sequence'].values,
                                            df['adversarial_sequence'].values)
                # Context manager guarantees the JSON is flushed and the handle
                # closed even if serialisation fails part-way.
                with open(eval_results_path, 'w') as eval_file:
                    json.dump(metrics, eval_file)
                print(attack_results_path, metrics)

results/insurance/cascada/20200220_013726/results.csv {'accuracy_drop': 0.0, 'roc_auc_drop': 0.0, 'probability_drop': -4.142774542134742e-05, 'WER': 0.17, 'NAD': 0.0, 'NAD_new': 0.0}
results/insurance/cascada/20200220_014446/results.csv {'accuracy_drop': 0.0, 'roc_auc_drop': 0.0, 'probability_drop': -0.00011537646009343927, 'WER': 0.18, 'NAD': 0.0, 'NAD_new': 0.0}
results/insurance/cascada/20200220_014800/results.csv {'accuracy_drop': 0.0, 'roc_auc_drop': 0.0, 'probability_drop': -0.00011537646009343927, 'WER': 0.18, 'NAD': 0.0, 'NAD_new': 0.0}
results/insurance/cascada/20200220_015219/results.csv {'accuracy_drop': 0.030000000000000027, 'roc_auc_drop': 0.02020202020202022, 'probability_drop': 0.010308695420769239, 'WER': 1.58, 'NAD': 0.030303000000030306, 'NAD_new': 0.0101010000000101}
results/insurance/cascada/20200220_020057/results.csv {'accuracy_drop': 0.030000000000000027, 'roc_auc_drop': 0.02020202020202022, 'probability_drop': 0.009900350126945661, 'WER': 1.08, 'NAD': 0.03030300

In [109]:
# Display the eval_metric.json path most recently assigned by an evaluation loop.
eval_results_path

'results/insurance/cascada/20200220_002909/eval_metric.json'

In [110]:
# Display the metrics dict most recently assigned to `metrics`.
metrics

{'accuracy_drop': 0.010000000000000009,
 'roc_auc_drop': 0.2525252525252525,
 'probability_drop': 0.0008302136082703872,
 'WER': 0.4,
 'NAD': 0.002020201616161697,
 'NAD_new': 0.002020201616161697}

In [186]:
# Dataset whose runs are analysed by the cells that read `base_dir`.
# Alternatives are kept commented out; exactly one assignment should be active.
#dataset = 'kaggle_transactions'
#dataset = 'ag_news'
#dataset = 'insurance'
dataset = 'ai_academy_data'


# Root folder containing one sub-directory per attack method for this dataset.
base_dir = f'results/{dataset}'


In [187]:
# Build one table row per 'random' attack run under `base_dir`: the run's
# evaluation metrics merged with the hyper-parameters it was launched with.
rows = []
for method in ['random']:
    base = os.path.join(base_dir, method)
    for folder in os.listdir(base):
        if folder != 'backup':
            try:
                # Context managers close both files promptly, even on a parse error.
                with open(os.path.join(base, folder, 'args.json')) as args_file:
                    args = json.load(args_file)
                with open(os.path.join(base, folder, 'eval_metric.json')) as metrics_file:
                    metrics = json.load(metrics_file)
                # Metric keys are merged into the args dict before the column selection.
                args.update(metrics)
                rows.append({k:args[k] for k in ['roc_auc_drop', 'accuracy_drop', 'probability_drop', 'WER', 'NAD',
                                                 'NAD_new',
                                                 'beam_size', 'num_steps', 'std', 'space']})
            except Exception as e:
                # Deliberately best-effort: an incomplete run is skipped, but the
                # message names the run folder so the gap can be traced.
                print(f'skipping {os.path.join(base, folder)}: {e!r}')
result = pd.DataFrame(rows)

In [188]:
# Rank the collected 'random' runs by NAD_new, highest first.
print('random')
result.sort_values(by='NAD_new', ascending=False)

random


Unnamed: 0,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new,beam_size,num_steps,std,space
2,0.235435,0.28,0.1081,4.84,0.211179,0.397444,30,30,0.1,decoder_hidden
0,0.198805,0.31,0.091973,4.86,0.209009,0.365135,30,30,0.01,encoder_outputs
4,0.258408,0.33,0.125682,6.05,0.150483,0.341827,30,30,0.1,encoder_outputs
6,0.173134,0.28,0.09032,4.94,0.194073,0.329513,30,30,0.01,decoder_hidden
5,0.273988,0.36,0.133979,6.43,0.151739,0.304713,30,30,1.0,decoder_hidden
1,0.327771,0.35,0.159351,8.24,0.120164,0.238118,30,30,1.0,encoder_outputs
7,0.183995,0.29,0.090937,5.73,0.125136,0.223991,30,30,0.001,decoder_hidden
3,0.165159,0.25,0.085491,5.67,0.122451,0.212598,30,30,0.001,encoder_outputs


In [189]:
# Collect the evaluation metrics of the most recent HotFlip run of each dataset.
# NOTE(review): this loop rebinds the notebook-level names `dataset` and
# `result`; cells that read them afterwards see the values from the last iteration.
all_res = []
for dataset in ['ag_news', 'insurance', 'ai_academy_data', 'kaggle_transactions']:

    # Run folders are timestamp-named, so the lexicographically largest name is
    # the latest run. 'backup' must be excluded first: letters sort after digits,
    # so it would otherwise always be picked over any timestamp folder.
    xs = [name for name in os.listdir(f'results/{dataset}/hotflip') if name != 'backup']
    x = sorted(xs)[-1]
    with open(f'results/{dataset}/hotflip/{x}/eval_metric.json', 'r') as metrics_file:
        result = json.load(metrics_file)
    # Wrap the metrics dict in a one-row frame so the per-dataset rows can be concatenated.
    result = pd.DataFrame([result])

    print('hotflip')
    result['dataset'] = dataset
    all_res.append(result[['dataset', 'roc_auc_drop', 'accuracy_drop', 'probability_drop', 'WER', 'NAD', 'NAD_new']])

hotflip
hotflip
hotflip
hotflip


In [190]:
# Stack the frames gathered in `all_res` into a single comparison table.
# The original row labels are kept, so they repeat in the result.
pd.concat(all_res)

Unnamed: 0,dataset,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new
0,ag_news,0.314858,0.57,0.433398,1.35,0.549808,0.639819
0,insurance,0.656566,0.7,0.253319,2.65,0.249327,0.400972
0,ai_academy_data,0.088428,0.13,0.042628,1.01,0.595744,0.595745
0,kaggle_transactions,0.141707,0.15,0.058987,1.0,0.492753,0.492754


In [191]:
# Build one table row per 'mcmc' attack run under `base_dir`: the run's
# evaluation metrics merged with the hyper-parameters it was launched with.
rows = []
for method in ['mcmc']:
    base = os.path.join(base_dir, method)
    for folder in os.listdir(base):
        if folder != 'backup':
            try:
                # Context managers close both files promptly, even on a parse error.
                with open(os.path.join(base, folder, 'args.json')) as args_file:
                    args = json.load(args_file)
                with open(os.path.join(base, folder, 'eval_metric.json')) as metrics_file:
                    metrics = json.load(metrics_file)
                # Metric keys are merged into the args dict before the column selection.
                args.update(metrics)
                rows.append({k:args[k] for k in ['roc_auc_drop', 'accuracy_drop', 'probability_drop', 'WER', 'NAD',
                                                 'NAD_new',
                                                 'beam_size', 'num_steps', 'std', 'sigma_wer', 'sigma_class']})
            except Exception as e:
                # Deliberately best-effort: an incomplete run is skipped, but the
                # message names the run folder so the gap can be traced.
                print(f'skipping {os.path.join(base, folder)}: {e!r}')
result = pd.DataFrame(rows)

In [192]:
# Top five MCMC runs by NAD_new among those with WER below 2 and with
# beam_size and num_steps both equal to 30.
print('mcmc')
(
    result
    .loc[lambda runs: (runs['WER'] < 2) & (runs['beam_size'] == 30) & (runs['num_steps'] == 30)]
    .sort_values(by='NAD_new', ascending=False)
    .head()
)

mcmc


Unnamed: 0,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new,beam_size,num_steps,std,sigma_wer,sigma_class
23,0.065465,0.09,0.03408,1.42,0.080226,0.129458,30,30,0.1,0.2,1.0
25,0.089863,0.14,0.042223,1.99,0.087926,0.122391,30,30,0.001,0.2,1.5
54,0.108598,0.16,0.058644,1.6,0.062394,0.114069,30,30,0.01,1.5,0.2
20,0.073226,0.08,0.037068,1.69,0.067537,0.107371,30,30,0.1,0.2,1.25
5,0.046325,0.05,0.019906,0.7,0.037589,0.066584,30,30,0.1,1.0,0.2


In [193]:
# Top five MCMC runs by NAD_new among those with WER below 2, with no
# restriction on the other hyper-parameters.
print('mcmc')
(
    result
    .loc[lambda runs: runs['WER'] < 2]
    .sort_values(by='NAD_new', ascending=False)
    .head()
)

mcmc


Unnamed: 0,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new,beam_size,num_steps,std,sigma_wer,sigma_class
23,0.065465,0.09,0.03408,1.42,0.080226,0.129458,30,30,0.1,0.2,1.0
25,0.089863,0.14,0.042223,1.99,0.087926,0.122391,30,30,0.001,0.2,1.5
54,0.108598,0.16,0.058644,1.6,0.062394,0.114069,30,30,0.01,1.5,0.2
20,0.073226,0.08,0.037068,1.69,0.067537,0.107371,30,30,0.1,0.2,1.25
5,0.046325,0.05,0.019906,0.7,0.037589,0.066584,30,30,0.1,1.0,0.2


In [194]:
# Top five MCMC runs by NAD_new with no filtering.
print('mcmc')
result.sort_values(by='NAD_new', ascending=False).head()

mcmc


Unnamed: 0,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new,beam_size,num_steps,std,sigma_wer,sigma_class
16,0.224172,0.37,0.107984,5.0,0.24171,0.373879,30,30,0.01,2.0,1.5
2,0.270624,0.37,0.124974,5.84,0.173345,0.350919,30,30,0.1,1.0,1.25
4,0.230101,0.35,0.111832,4.9,0.208714,0.345885,30,30,0.01,1.5,0.75
59,0.262542,0.32,0.119295,5.12,0.187185,0.329186,30,30,0.01,2.0,1.25
34,0.228269,0.34,0.115011,5.42,0.158089,0.327901,30,30,0.1,2.0,0.75


In [195]:
# Build one table row per 'cascada' attack run under `base_dir`: the run's
# evaluation metrics merged with the hyper-parameters it was launched with.
# This cell has no error handling: a run folder with a missing or incomplete
# file raises and stops the cell.
rows = []
for method in ['cascada']:
    base = os.path.join(base_dir, method)
    for folder in os.listdir(base):
        if folder != 'backup':
            # Context managers close both files promptly, even on a parse error.
            with open(os.path.join(base, folder, 'args.json')) as args_file:
                args = json.load(args_file)
            with open(os.path.join(base, folder, 'eval_metric.json')) as metrics_file:
                metrics = json.load(metrics_file)
            # Metric keys are merged into the args dict before the column selection.
            args.update(metrics)
            rows.append({k:args[k] for k in ['roc_auc_drop', 'accuracy_drop', 'probability_drop', 'WER', 'NAD',
                                             'NAD_new',
                                             'levenshtein_weight', 'beam_size', 'max_steps', 'learning_rate']})
result = pd.DataFrame(rows)

In [99]:
# CASCADA runs with WER below 2 and with beam_size and max_steps both equal to
# 30, ranked by NAD_new (highest first).
# NOTE(review): the saved execution count of this cell is lower than that of
# the cell that builds `result`, so the stored output may be stale -- re-run.
print('cascada')
(
    result
    .loc[lambda runs: (runs['WER'] < 2) & (runs['beam_size'] == 30) & (runs['max_steps'] == 30)]
    .sort_values(by='NAD_new', ascending=False)
)

cascada


Unnamed: 0,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new,levenshtein_weight,beam_size,max_steps,learning_rate
9,0.0,0.0,0.000635,0.48,0.0,0.0,0.1,30,30,2.0
17,0.0,0.0,-0.000144,0.06,0.0,0.0,0.1,30,30,1.0


In [100]:
# Same view with the looser WER threshold: CASCADA runs with WER below 9 and
# with beam_size and max_steps both equal to 30, ranked by NAD_new (highest first).
print('cascada')
(
    result
    .loc[lambda runs: (runs['WER'] < 9) & (runs['beam_size'] == 30) & (runs['max_steps'] == 30)]
    .sort_values(by='NAD_new', ascending=False)
)

cascada


Unnamed: 0,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new,levenshtein_weight,beam_size,max_steps,learning_rate
39,0.02020202,0.05,0.009766,3.68,0.008107,0.017254,0.1,30,30,4.0
37,0.01010101,0.05,0.007868,6.09,0.005586,0.010936,1.0,30,30,2.0
54,0.01010101,0.01,0.003689,3.32,0.003899,0.006592,1.0,30,30,1.0
66,0.05050505,0.07,0.014821,8.83,0.003704,0.00319,1.0,30,30,4.0
18,0.01010101,0.02,0.002105,6.18,0.003899,0.000532,10.0,30,30,4.0
9,0.0,0.0,0.000635,0.48,0.0,0.0,0.1,30,30,2.0
17,0.0,0.0,-0.000144,0.06,0.0,0.0,0.1,30,30,1.0
22,1.110223e-16,0.01,0.003185,7.47,0.000505,0.0,10.0,30,30,1.0
59,1.110223e-16,-0.01,0.000435,6.6,0.0,0.0,10.0,30,30,2.0


In [101]:
# All CASCADA runs with WER below 2, regardless of the other hyper-parameters,
# ranked by NAD_new (highest first).
print('cascada')
(
    result
    .loc[lambda runs: runs['WER'] < 2]
    .sort_values(by='NAD_new', ascending=False)
)

cascada


Unnamed: 0,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new,levenshtein_weight,beam_size,max_steps,learning_rate
35,0.0,0.01,0.002016,1.17,0.010101,0.010101,0.1,1,5,2.0
64,0.0,0.01,0.00263,1.34,0.010101,0.010101,0.1,1,30,2.0
63,0.020202,0.03,0.006919,0.5,0.020202,0.010101,0.1,1,100,1.0
16,0.020202,0.03,0.010309,1.58,0.030303,0.010101,0.1,1,100,2.0
53,0.0,0.01,0.00094,0.46,0.010101,0.010101,0.1,1,30,1.0
38,0.0,0.0,0.000873,0.61,0.0,0.0,0.1,30,100,2.0
67,0.0,0.0,0.000144,0.33,0.0,0.0,0.1,1,5,1.0
58,0.0,0.0,-0.000193,0.06,0.0,0.0,0.1,30,5,1.0
52,0.0,0.0,0.000384,0.38,0.0,0.0,0.1,30,5,2.0
45,0.0,0.0,0.000666,0.39,0.0,0.0,0.1,100,30,2.0


In [104]:
# All CASCADA runs with WER below 9, regardless of the other hyper-parameters,
# ranked by NAD_new (highest first).
print('cascada')
(
    result
    .loc[lambda runs: runs['WER'] < 9]
    .sort_values(by='NAD_new', ascending=False)
)

cascada


Unnamed: 0,roc_auc_drop,accuracy_drop,probability_drop,WER,NAD,NAD_new,levenshtein_weight,beam_size,max_steps,learning_rate
79,0.070707,0.09,0.020937,6.16,0.013591,0.027454,0.1,1,100,4.0
36,0.070707,0.07,0.020765,6.09,0.009719,0.022404,0.1,1,30,4.0
11,0.060606,0.07,0.017762,5.22,0.008596,0.020240,0.1,1,5,4.0
29,0.020202,0.04,0.008926,3.73,0.007602,0.017254,0.1,30,100,4.0
60,0.020202,0.04,0.008751,3.44,0.007602,0.017254,0.1,30,5,4.0
...,...,...,...,...,...,...,...,...,...,...
45,0.000000,0.00,0.000666,0.39,0.000000,0.000000,0.1,100,30,2.0
47,0.000000,-0.01,0.000221,2.60,0.000000,0.000000,1.0,1,5,1.0
48,0.000000,-0.01,0.000649,6.70,0.000000,0.000000,10.0,30,100,2.0
49,0.010101,0.00,0.002404,5.89,0.000505,0.000000,10.0,1,5,2.0
