In [None]:
import pandas as pd
import math
import numpy as np
import time
from codecarbon import EmissionsTracker
from Bloom_filter import BloomFilter
import PLBF
import disjoint_Ada_BF
import learned_Bloom_filter
import Ada_BF
from codecarbon import EmissionsTracker
from concurrent.futures import ThreadPoolExecutor
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'


[codecarbon INFO @ 01:15:10] Energy consumed for RAM : 0.538906 kWh. RAM Power : 23.887279987335205 W
[codecarbon INFO @ 01:15:10] Energy consumed for all GPUs : 0.477918 kWh. Total GPU Power : 18.38734630310346 W
[codecarbon INFO @ 01:15:10] Energy consumed for all CPUs : 0.970916 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 01:15:10] 1.987740 kWh of electricity used since the beginning.
[codecarbon INFO @ 01:15:13] Energy consumed for RAM : 0.539719 kWh. RAM Power : 23.887279987335205 W
[codecarbon INFO @ 01:15:13] Energy consumed for all GPUs : 0.477342 kWh. Total GPU Power : 17.489458002579337 W
[codecarbon INFO @ 01:15:13] Energy consumed for all CPUs : 0.971544 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 01:15:13] 1.988605 kWh of electricity used since the beginning.
[codecarbon INFO @ 01:15:25] Energy consumed for RAM : 0.539005 kWh. RAM Power : 23.887279987335205 W
[codecarbon INFO @ 01:15:25] Energy consumed for all GPUs : 0.478005 kWh. Total GPU Power : 20.8690120782

In [116]:
data_path='datasets/URL_data.csv'
data = pd.read_csv(data_path)
negative_sample = data.loc[(data['label'] == -1)]
positive_sample = data.loc[(data['label'] == 1)]
url_negative = negative_sample['url']
url = positive_sample['url']
n = len(url)
train_negative = negative_sample.sample(frac = 0.3)

In [117]:
def get_train_time(num_group_min, num_group_max, c_min, c_max, R_sum,url,n,min_thres,max_thres,train_negative,positive_sample,model):
    if model=='bl':
        bloom_filter = BloomFilter(n, R_sum)
        bloom_filter.insert(url)
        return bloom_filter
    elif model=='disjoint_Ada_BF':
        Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt = disjoint_Ada_BF.Find_Optimal_Parameters(c_min,c_max, num_group_min, 
                                                                   num_group_max, R_sum,
                                                                   train_negative, positive_sample)
        return Bloom_Filters_opt,thresholds_opt,non_empty_ix_opt
    elif model=='learned_BF':
        bloom_filter_opt, thres_opt = learned_Bloom_filter.Find_Optimal_Parameters(max_thres, min_thres, R_sum,
                                                                                     train_negative, positive_sample)
        return bloom_filter_opt,thres_opt
            
    
    elif model=='Ada_BF':
        bloom_filter_opt, thresholds_opt, k_max_opt = Ada_BF.Find_Optimal_Parameters(c_min, c_max, num_group_min,
                                                                                      num_group_max, 
                                                                                      R_sum, train_negative,
                                                                                      positive_sample)
        return bloom_filter_opt, thresholds_opt, k_max_opt
    else:
        return('invalid model, model must be one of bl,disjoint_Ada_BF,learned_BF,Ada_BF')

        

In [118]:
def simulation_function(func, *args):
    tracker = EmissionsTracker()
    tracker.start()
    start_time = time.time()
    result = func(*args)  
    end_time = time.time()
    time_elapsed = end_time - start_time
    emissions: float = tracker.stop()
    return time_elapsed, emissions


def simus(func, *args, num_runs=100):
    times = np.zeros(num_runs)
    electricity = np.zeros(num_runs)

    for i in range(num_runs):
        times[i], electricity[i] = simulation_function(func, *args) 

    return pd.DataFrame({'total_time': times, 'electricity': electricity})

def parallel_simus(func, *args, num_runs=100, num_cpus=None):
    with ThreadPoolExecutor(max_workers=num_cpus) as executor:
        results = list(executor.map(lambda _: simulation_function(func, *args), range(num_runs)))

    times, electricity = zip(*results)
    times = np.array(times)
    electricity = np.array(electricity)

    return pd.DataFrame({'total_time': times, 'electricity': electricity})

In [119]:
%%capture
##collecting 800 
n_iters=200
args_bl=(8, 12, 1.8, 2.1, 200000,url,n,0.5,0.9,
                           train_negative,positive_sample,'bl')
args_dis_ada=(8, 12, 1.8, 2.1, 200000,url,n,0.5,0.9,
                           train_negative,positive_sample,'disjoint_Ada_BF')
args_lbf=(8, 12, 1.8, 2.1, 200000,url,n,0.5,0.9,
                           train_negative,positive_sample,'learned_BF')
args_ada=(8, 12, 1.8, 2.1, 200000,url,n,0.5,0.9,
                           train_negative,positive_sample,'Ada_BF')
                           
df_bl=simus(get_train_time,*args_bl,num_runs=n_iters)

df_dis_ada=simus(get_train_time,*args_dis_ada,num_runs=n_iters)

df_lbf=simus(get_train_time,*args_lbf,num_runs=n_iters)

df_ada=simus(get_train_time,*args_ada,num_runs=n_iters)

In [120]:
import os
num_cpu=os.cpu_count()

In [121]:
%%capture
df_bl_para=parallel_simus(get_train_time,*args_bl,num_runs=n_iters,num_cpus=num_cpu)

df_dis_ada_para=parallel_simus(get_train_time,*args_dis_ada,num_runs=n_iters,num_cpus=num_cpu)

df_lbf_para=parallel_simus(get_train_time,*args_lbf,num_runs=n_iters,num_cpus=num_cpu)

df_ada_para=parallel_simus(get_train_time,*args_ada,num_runs=n_iters,num_cpus=num_cpu)

In [122]:
df_bl['method']='bloom_filter'
df_dis_ada['method']='disjoint_Ada_BF'
df_lbf['method']='learned_bf'
df_ada['method']='Ada_BF'

df_bl_para['method']='bloom_filter'
df_dis_ada_para['method']='disjoint_Ada_BF'
df_lbf_para['method']='learned_bf'
df_ada_para['method']='Ada_BF'

In [123]:
df_bl['running_method']='seq'
df_dis_ada['running_method']='seq'
df_lbf['running_method']='seq'
df_ada['running_method']='seq'

df_bl_para['running_method']='parallel'
df_dis_ada_para['running_method']='parallel'
df_lbf_para['running_method']='parallel'
df_ada_para['running_method']='parallel'

In [124]:
df_bl['stage']='training'
df_dis_ada['stage']='training'
df_lbf['stage']='training'
df_ada['stage']='training'

df_bl_para['stage']='training'
df_dis_ada_para['stage']='training'
df_lbf_para['stage']='training'
df_ada_para['stage']='training'

In [125]:
df_all_train=pd.concat([df_bl, df_dis_ada,df_lbf,df_ada,
                        df_bl_para,df_dis_ada_para,df_lbf_para,df_ada_para], ignore_index=True)

In [126]:
df_all_train

Unnamed: 0,total_time,electricity,method,running_method,stage
0,0.055638,0.0,bloom_filter,seq,training
1,0.067798,0.000001,bloom_filter,seq,training
2,0.054271,0.0,bloom_filter,seq,training
3,0.056672,0.0,bloom_filter,seq,training
4,0.056349,0.0,bloom_filter,seq,training
...,...,...,...,...,...
1595,339.127679,,Ada_BF,parallel,training
1596,229.295153,,Ada_BF,parallel,training
1597,206.562303,,Ada_BF,parallel,training
1598,204.152578,,Ada_BF,parallel,training


In [130]:
df_all_train.dropna()

Unnamed: 0,total_time,electricity,method,running_method,stage
0,0.055638,0.0,bloom_filter,seq,training
1,0.067798,0.000001,bloom_filter,seq,training
2,0.054271,0.0,bloom_filter,seq,training
3,0.056672,0.0,bloom_filter,seq,training
4,0.056349,0.0,bloom_filter,seq,training
...,...,...,...,...,...
864,0.120789,0.000001,bloom_filter,parallel,training
865,0.080597,0.000001,bloom_filter,parallel,training
866,0.248454,0.000002,bloom_filter,parallel,training
867,0.609097,0.000004,bloom_filter,parallel,training


In [127]:
bloom_filter=get_train_time(8, 12, 1.8, 2.1, 200000,url,n,0.5,0.9,
                           train_negative,positive_sample,model='bl')

Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt=get_train_time(8, 12, 1.8, 2.1, 200000,url,n,0.5,0.9,
                           train_negative,positive_sample,model='disjoint_Ada_BF')

bloom_filter_opt,thres_opt=get_train_time(8, 12, 1.8, 2.1, 200000,url,n,0.5,0.9,
                           train_negative,positive_sample,model='learned_BF')

bloom_filter_opt_ada, thresholds_opt_ada, k_max_opt=get_train_time(8, 12, 1.8, 2.1, 200000,url,n,0.5,0.9,
                           train_negative,positive_sample,model='Ada_BF')

False positive items: 1989, Number of groups: 8, c = 1.800000
False positive items: 1904, Number of groups: 8, c = 1.900000
False positive items: 1978, Number of groups: 8, c = 2.000000
False positive items: 2001, Number of groups: 8, c = 2.100000
False positive items: 1875, Number of groups: 9, c = 1.800000
False positive items: 1895, Number of groups: 9, c = 1.900000
False positive items: 1982, Number of groups: 9, c = 2.000000
False positive items: 2025, Number of groups: 9, c = 2.100000
False positive items: 1930, Number of groups: 10, c = 1.800000
False positive items: 1866, Number of groups: 10, c = 1.900000
False positive items: 1972, Number of groups: 10, c = 2.000000
False positive items: 2038, Number of groups: 10, c = 2.100000
False positive items: 1922, Number of groups: 11, c = 1.800000
False positive items: 1843, Number of groups: 11, c = 1.900000
False positive items: 1856, Number of groups: 11, c = 2.000000
False positive items: 1956, Number of groups: 11, c = 2.100000


In [128]:
def get_testing_time(data,bloom_filter,Bloom_Filters_opt,
                     thresholds_opt,non_empty_ix_opt,bloom_filter_opt,
                    thres_opt,bloom_filter_opt_ada,thresholds_opt_ada,k_max_opt,model):
    
    if model=='bl':
        negative_sample = data.loc[(data['label']==-1)]
        url_negative = negative_sample['url']
        n1 = bloom_filter.test(url_negative, single_key=False)
        print('False positive items: ', sum(n1))
        
    elif model=='disjoint_Ada_BF':
        negative_sample = data.loc[(data['label']==-1)]
        ML_positive = negative_sample.loc[(negative_sample['score'] >= thresholds_opt[-2]), 'url']
        url_negative = negative_sample.loc[(negative_sample['score'] < thresholds_opt[-2]), 'url']
        score_negative = negative_sample.loc[(negative_sample['score'] < thresholds_opt[-2]), 'score']
        test_result = np.zeros(len(url_negative))
        ss = 0
        for score_s, url_s in zip(score_negative, url_negative):
            ix = min(np.where(score_s < thresholds_opt)[0]) - 1
            if ix >= non_empty_ix_opt:
                test_result[ss] = Bloom_Filters_opt[ix].test(url_s)
            else:
                test_result[ss] = 0
            ss += 1
        FP_items = sum(test_result) + len(ML_positive)
        FPR = FP_items/len(url_negative)
        print('False positive items: {}; FPR: {}; Size of quries: {}'.format(FP_items, FPR, len(url_negative)))
        
    elif model=='learned_BF':
        negative_sample = data.loc[(data['label']==-1)]
        ML_positive = negative_sample.loc[(negative_sample['score'] > thres_opt), 'url']
        bloom_negative = negative_sample.loc[(negative_sample['score'] <= thres_opt), 'url']
        score_negative = negative_sample.loc[(negative_sample['score'] < thres_opt), 'score']
        BF_positive = bloom_filter_opt.test(bloom_negative, single_key = False)
        FP_items = sum(BF_positive) + len(ML_positive)
        print('False positive items: %d' % FP_items)

    elif model=='Ada_BF':
        negative_sample = data.loc[(data['label']==-1)]
        ML_positive = negative_sample.loc[(negative_sample['score'] >= thresholds_opt_ada[-2]), 'url']
        url_negative = negative_sample.loc[(negative_sample['score'] < thresholds_opt_ada[-2]), 'url']
        score_negative = negative_sample.loc[(negative_sample['score'] < thresholds_opt_ada[-2]), 'score']
        test_result = np.zeros(len(url_negative))
        ss = 0
        for score_s, url_s in zip(score_negative, url_negative):
            ix = min(np.where(score_s < thresholds_opt_ada)[0])
            # thres = thresholds[ix]
            k = k_max_opt - ix
            test_result[ss] = bloom_filter_opt_ada.test(url_s, k)
            ss += 1
        FP_items = sum(test_result) + len(ML_positive)
        print('False positive items: %d' % FP_items)
       
    else:
        return('invalid model, model must be one of bl,disjoint_Ada_BF,learned_BF,Ada_BF')
    

In [129]:
%%capture
##collecting 800 
func_args_bl = (data, bloom_filter, Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt,
                bloom_filter_opt, thres_opt, bloom_filter_opt_ada, thresholds_opt_ada,
                k_max_opt, 'bl')
func_args_dis_ada= (data, bloom_filter, Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt,
                bloom_filter_opt, thres_opt, bloom_filter_opt_ada, thresholds_opt_ada,
                k_max_opt, 'disjoint_Ada_BF')
func_args_lbf= (data, bloom_filter, Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt,
                bloom_filter_opt, thres_opt, bloom_filter_opt_ada, thresholds_opt_ada,
                k_max_opt, 'learned_BF')
func_args_ada = (data, bloom_filter, Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt,
                bloom_filter_opt, thres_opt, bloom_filter_opt_ada, thresholds_opt_ada,
                k_max_opt, 'Ada_BF')

df_bl_test=simus(get_testing_time, *func_args_bl, num_runs=n_iters)

df_dis_ada_test=simus(get_testing_time,*func_args_dis_ada,num_runs=n_iters)

df_lbf_test=simus(get_testing_time,*func_args_lbf,num_runs=n_iters)

df_ada_test=simus(get_testing_time,*func_args_ada,num_runs=n_iters)



df_bl_test_para=parallel_simus(get_testing_time,*func_args_bl,num_runs=n_iters,num_cpus=num_cpu)

df_dis_ada_test_para=parallel_simus(get_testing_time,*func_args_bl,num_runs=n_iters,num_cpus=num_cpu)

df_lbf_test_para=parallel_simus(get_testing_time,*func_args_dis_ada,num_runs=n_iters,num_cpus=num_cpu)

df_ada_test_para=parallel_simus(get_testing_time,*func_args_ada,num_runs=n_iters,num_cpus=num_cpu)

KeyboardInterrupt: 

In [None]:
df_bl_test['method']='bloom_filter'
df_dis_ada_test['method']='disjoint_Ada_BF'
df_lbf_test['method']='learned_bf'
df_ada_test['method']='Ada_BF'

df_bl_test_para['method']='bloom_filter'
df_dis_ada_test_para['method']='disjoint_Ada_BF'
df_lbf_test_para['method']='learned_bf'
df_ada_test_para['method']='Ada_BF'

In [None]:
df_bl_test['running_method']='seq'
df_dis_ada_test['running_method']='seq'
df_lbf_test['running_method']='seq'
df_ada_test['running_method']='seq'

df_bl_test_para['running_method']='parallel'
df_dis_ada_test_para['running_method']='parallel'
df_lbf_test_para['running_method']='parallel'
df_ada_test_para['running_method']='parallel'

In [None]:
df_bl_test['stage']='test'
df_dis_ada_test['stage']='test'
df_lbf_test['stage']='test'
df_ada_test['stage']='test'

df_bl_test_para['stage']='test'
df_dis_ada_test_para['stage']='test'
df_lbf_test_para['stage']='test'
df_ada_test_para['stage']='test'

In [None]:
df_all_test=pd.concat([df_bl_test, df_dis_ada_test,df_lbf_test,df_ada_test,
                      df_bl_test_para,df_dis_ada_test_para,df_lbf_test_para,df_ada_test_para], ignore_index=True)

In [None]:
df_all_test

Unnamed: 0,total_time,electricity,method,running_method,stage
0,0.854022,4e-06,bloom_filter,seq,test
1,3.48404,1.8e-05,disjoint_Ada_BF,seq,test
2,2.215681,1.1e-05,learned_bf,seq,test
3,3.362883,1.7e-05,Ada_BF,seq,test
4,0.876784,5e-06,bloom_filter,parallel,test
5,0.90159,5e-06,disjoint_Ada_BF,parallel,test
6,3.645699,1.9e-05,learned_bf,parallel,test
7,3.337062,1.7e-05,Ada_BF,parallel,test


In [None]:
df_all=pd.concat([df_all_train,df_all_test], ignore_index=True)
df_all

Unnamed: 0,total_time,electricity,method,running_method,stage
0,0.005812,0.0,bloom_filter,seq,training
1,0.008353,0.0,bloom_filter,seq,training
2,0.000000,,bloom_filter,seq,training
3,0.002000,0.0,bloom_filter,seq,training
4,0.000000,0.0,bloom_filter,seq,training
...,...,...,...,...,...
3195,0.004940,,Ada_BF,parallel,test
3196,0.004093,,Ada_BF,parallel,test
3197,0.013877,,Ada_BF,parallel,test
3198,0.000000,,Ada_BF,parallel,test


In [None]:
df_all['running']='gpu'
df_all.to_csv('emissions_bl_gpu.csv', index=False)