In [10]:
import math
import time
from functools import partial

import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker

import Ada_BF
import disjoint_Ada_BF
import learned_Bloom_filter
import PLBF
from Bloom_filter import BloomFilter
# Module-level tracker; constructing it emits the codecarbon setup log shown in this cell's output.
# NOTE(review): no cell visible in this notebook starts or stops this instance —
# simus() builds its own EmissionsTracker for every run. Confirm whether it is still needed.
tracker = EmissionsTracker()

[codecarbon INFO @ 14:49:32] [setup] RAM Tracking...
[codecarbon INFO @ 14:49:32] [setup] GPU Tracking...
[codecarbon INFO @ 14:49:33] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 14:49:33] [setup] CPU Tracking...
[codecarbon INFO @ 14:49:34] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i9-13900KF
[codecarbon INFO @ 14:49:34] >>> Tracker's metadata:
[codecarbon INFO @ 14:49:34]   Platform system: Windows-10-10.0.22621-SP0
[codecarbon INFO @ 14:49:34]   Python version: 3.11.5
[codecarbon INFO @ 14:49:34]   CodeCarbon version: 2.3.1
[codecarbon INFO @ 14:49:34]   Available RAM : 63.699 GB
[codecarbon INFO @ 14:49:34]   CPU count: 32
[codecarbon INFO @ 14:49:34]   CPU model: 13th Gen Intel(R) Core(TM) i9-13900KF
[codecarbon INFO @ 14:49:34]   GPU count: 1
[codecarbon INFO @ 14:49:34]   GPU model: 1 x NVIDIA GeForce RTX 4090


In [11]:
# Load the URL data set and split it by label (-1 = negative, 1 = positive).
data_path='datasets/URL_data.csv'
data = pd.read_csv(data_path)
negative_sample = data.loc[(data['label'] == -1)]
positive_sample = data.loc[(data['label'] == 1)]
url_negative = negative_sample['url']
url = positive_sample['url']
n = len(url)

# Fixed seed so the 30% negative training subset is identical after every
# "Restart Kernel -> Run All"; without it each run trains on different rows.
SEED = 42
train_negative = negative_sample.sample(frac = 0.3, random_state=SEED)

In [14]:
def get_train_time(num_group_min, num_group_max, c_min, c_max, R_sum,url,n,min_thres,max_thres,train_negative,positive_sample,model):
    """Build the filter variant selected by ``model`` and return its fitted objects.

    Parameters
    ----------
    num_group_min, num_group_max, c_min, c_max :
        Search ranges forwarded to ``Find_Optimal_Parameters`` of the
        ``disjoint_Ada_BF`` and ``Ada_BF`` modules.
    R_sum :
        Forwarded to every variant (second argument of ``BloomFilter`` for
        ``'bl'``). Presumably the total bit budget -- TODO confirm.
    url, n :
        Only used by ``'bl'``: ``BloomFilter(n, R_sum)`` is created and ``url``
        is inserted into it.
    min_thres, max_thres :
        Only used by ``'learned_BF'``; passed as ``(max_thres, min_thres, ...)``.
    train_negative, positive_sample :
        Forwarded to the learned variants' ``Find_Optimal_Parameters``.
    model : str
        One of ``'bl'``, ``'disjoint_Ada_BF'``, ``'learned_BF'``, ``'Ada_BF'``.

    Returns
    -------
    ``'bl'``              -> the populated ``BloomFilter``
    ``'disjoint_Ada_BF'`` -> ``(Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt)``
    ``'learned_BF'``      -> ``(bloom_filter_opt, thres_opt)``
    ``'Ada_BF'``          -> ``(bloom_filter_opt, thresholds_opt, k_max_opt)``

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. (Previously the error
        message was *returned* as a string, which callers would then try to
        unpack as fitted objects.)
    """
    if model=='bl':
        bloom_filter = BloomFilter(n, R_sum)
        bloom_filter.insert(url)
        return bloom_filter

    if model=='disjoint_Ada_BF':
        Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt = disjoint_Ada_BF.Find_Optimal_Parameters(
            c_min, c_max, num_group_min, num_group_max, R_sum,
            train_negative, positive_sample)
        return Bloom_Filters_opt,thresholds_opt,non_empty_ix_opt

    if model=='learned_BF':
        # Argument order is (max_thres, min_thres, ...), as in the original call.
        bloom_filter_opt, thres_opt = learned_Bloom_filter.Find_Optimal_Parameters(
            max_thres, min_thres, R_sum, train_negative, positive_sample)
        return bloom_filter_opt,thres_opt

    if model=='Ada_BF':
        bloom_filter_opt, thresholds_opt, k_max_opt = Ada_BF.Find_Optimal_Parameters(
            c_min, c_max, num_group_min, num_group_max, R_sum,
            train_negative, positive_sample)
        return bloom_filter_opt, thresholds_opt, k_max_opt

    raise ValueError('invalid model, model must be one of bl,disjoint_Ada_BF,learned_BF,Ada_BF')

        

In [15]:
from concurrent.futures import ProcessPoolExecutor

def simus(func,num_runs=100):
    """Run ``func`` sequentially ``num_runs`` times, measuring each run.

    Parameters
    ----------
    func : callable
        Zero-argument callable that performs the work to measure, e.g.
        ``functools.partial(get_train_time, ..., model='bl')``. It is invoked
        once per run *inside* the tracked window. For backward compatibility a
        non-callable is still accepted, but then nothing executes inside the
        window and a ``RuntimeWarning`` is emitted.
    num_runs : int, default 100
        Number of repetitions.

    Returns
    -------
    pandas.DataFrame
        One row per run with columns ``'total_time'`` (wall-clock seconds,
        including the tracker's own start/stop overhead) and ``'electricity'``
        (the value returned by ``EmissionsTracker.stop()``).
        NOTE(review): CodeCarbon documents ``stop()`` as returning emissions in
        kg CO2eq rather than energy -- confirm before reading this column as
        electricity.
    """
    import warnings

    runnable = callable(func)
    if not runnable:
        # The original code did `result = func`, i.e. it never executed anything.
        warnings.warn(
            'simus() received a non-callable; the work was evaluated before the '
            'call and is NOT part of the measurement. Pass a zero-argument '
            'callable instead.',
            RuntimeWarning,
            stacklevel=2,
        )

    times = np.zeros(num_runs)
    electricity = np.zeros(num_runs)

    for i in range(num_runs):
        tracker = EmissionsTracker()
        start_time = time.time()
        tracker.start()
        try:
            if runnable:
                func()
        finally:
            # Always stop the tracker, even if func raises.
            energy_consumed = tracker.stop()
        end_time = time.time()
        times[i]=end_time - start_time
        electricity[i]=energy_consumed
    return pd.DataFrame({'total_time' : times, 'electricity' : electricity})


def _timed_run(func):
    """Execute one tracked run of ``func``; return ``(elapsed_seconds, tracker_reading)``.

    Defined at module level so ``ProcessPoolExecutor`` can pickle it by name.
    A non-callable ``func`` is tolerated (nothing is executed), so existing
    call sites that pass an already-computed result keep working.
    """
    tracker = EmissionsTracker()
    start_time = time.time()
    tracker.start()
    try:
        if callable(func):
            func()
    finally:
        # Always stop the tracker, even if func raises.
        reading = tracker.stop()
    return time.time() - start_time, reading


def parallel_simus(func, num_runs=100,num_cpus=None):
    """Run ``func`` ``num_runs`` times across a process pool, measuring each run.

    Parameters
    ----------
    func : callable
        Zero-argument, *picklable* callable (e.g. a ``functools.partial`` of a
        named function; lambdas cannot be pickled).
    num_runs : int, default 100
        Number of repetitions submitted to the pool.
    num_cpus : int or None, default None
        ``max_workers`` of the ``ProcessPoolExecutor``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'total_time'`` and ``'electricity'``, one row per run, in
        submission order. An empty frame is returned for ``num_runs == 0``.

    Notes
    -----
    The original body mapped an undefined name (``simulation_function``) and
    therefore raised ``NameError``; the worker is now ``_timed_run``.
    NOTE(review): with the ``spawn`` start method (Windows/macOS), functions
    defined in notebook cells may not be importable by worker processes --
    consider moving the worker and the job into a ``.py`` module.
    """
    with ProcessPoolExecutor(max_workers=num_cpus) as executor:
        results = list(executor.map(_timed_run, [func]*num_runs))

    # Comprehensions instead of zip(*results): the latter cannot be unpacked
    # into two names when results is empty.
    times = np.array([elapsed for elapsed, _ in results], dtype=float)
    electricity = np.array([reading for _, reading in results], dtype=float)

    return pd.DataFrame({'total_time': times, 'electricity': electricity})

In [16]:
%%capture
## Collect 800 measurements per running mode (4 models x n_iters = 200 runs each)
n_iters=200


def _train_job(model):
    """Return a zero-argument callable that trains ``model`` when invoked.

    The arguments are only *bound* here. Writing ``simus(get_train_time(...))``
    would train once, eagerly, before the tracker starts, and hand the finished
    result to the benchmark -- so nothing would be measured.
    """
    return partial(get_train_time, 8, 12, 1.8, 2.1, 200000, url, n, 0.5, 0.9,
                   train_negative, positive_sample, model=model)


# Sequential runs
df_bl=simus(_train_job('bl'),n_iters)
df_dis_ada=simus(_train_job('disjoint_Ada_BF'),n_iters)
df_lbf=simus(_train_job('learned_BF'),n_iters)
df_ada=simus(_train_job('Ada_BF'),n_iters)

# Parallel runs (10 worker processes)
df_bl_para=parallel_simus(_train_job('bl'),n_iters,10)
df_dis_ada_para=parallel_simus(_train_job('disjoint_Ada_BF'),n_iters,10)
df_lbf_para=parallel_simus(_train_job('learned_BF'),n_iters,10)
df_ada_para=parallel_simus(_train_job('Ada_BF'),n_iters,10)

KeyboardInterrupt: 

In [None]:
# Tag every training frame with the filter variant it measured.
for _frame, _method in (
    (df_bl, 'bloom_filter'),
    (df_dis_ada, 'disjoint_Ada_BF'),
    (df_lbf, 'learned_bf'),
    (df_ada, 'Ada_BF'),
    (df_bl_para, 'bloom_filter'),
    (df_dis_ada_para, 'disjoint_Ada_BF'),
    (df_lbf_para, 'learned_bf'),
    (df_ada_para, 'Ada_BF'),
):
    _frame['method'] = _method

In [None]:
# Record how each training frame was produced: sequentially or via the process pool.
for _frame in (df_bl, df_dis_ada, df_lbf, df_ada):
    _frame['running_method'] = 'seq'

for _frame in (df_bl_para, df_dis_ada_para, df_lbf_para, df_ada_para):
    _frame['running_method'] = 'parallel'

In [None]:
# All eight frames in this section come from the training stage.
for _frame in (
    df_bl, df_dis_ada, df_lbf, df_ada,
    df_bl_para, df_dis_ada_para, df_lbf_para, df_ada_para,
):
    _frame['stage'] = 'training'

In [None]:
# Stack sequential frames first, then parallel ones, with a fresh 0..N-1 index.
_train_frames = [
    df_bl, df_dis_ada, df_lbf, df_ada,
    df_bl_para, df_dis_ada_para, df_lbf_para, df_ada_para,
]
df_all_train = pd.concat(_train_frames, ignore_index=True)

In [None]:
# Fit each variant once more (outside any measurement) to obtain the objects
# that the testing stage queries. All four fits share the same settings.
_fit_kwargs = dict(
    num_group_min=8, num_group_max=12,
    c_min=1.8, c_max=2.1,
    R_sum=200000,
    url=url, n=n,
    min_thres=0.5, max_thres=0.9,
    train_negative=train_negative,
    positive_sample=positive_sample,
)

bloom_filter = get_train_time(model='bl', **_fit_kwargs)

Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt = get_train_time(
    model='disjoint_Ada_BF', **_fit_kwargs)

bloom_filter_opt, thres_opt = get_train_time(model='learned_BF', **_fit_kwargs)

bloom_filter_opt_ada, thresholds_opt_ada, k_max_opt = get_train_time(
    model='Ada_BF', **_fit_kwargs)

In [None]:
# Split the negatives at the second-to-last threshold returned by the
# disjoint Ada-BF fit: scores at or above it are counted as positives by the
# score alone; the rest are the keys that get queried against the filters.
# NOTE(review): this single cutoff is reused for every model in the testing
# stage -- confirm that is intended for learned_BF / Ada_BF as well.
_cutoff = thresholds_opt[-2]
_scores = negative_sample['score']

ML_positive = negative_sample.loc[(_scores >= _cutoff), 'url']

_below_cutoff = (_scores < _cutoff)
url_negative = negative_sample.loc[_below_cutoff, 'url']
score_negative = negative_sample.loc[_below_cutoff, 'score']

# Scratch buffer (one slot per queried key) and its starting write position.
test_result = np.zeros(len(url_negative))
ss = 0

In [None]:
def get_testing_time(ML_positive,url_negative,score_negative,
                     test_result,ss,bloom_filter,Bloom_Filters_opt,
                     thresholds_opt,non_empty_ix_opt,bloom_filter_opt,
                    thres_opt,bloom_filter_opt_ada,thresholds_opt_ada,k_max_opt,model):
    """Query the negative keys against the filter selected by ``model`` and print the false-positive count.

    Parameters
    ----------
    ML_positive :
        Negative keys already classified positive by score; only ``len()`` is
        used (added to the false-positive count of the learned variants).
    url_negative, score_negative :
        Keys to query and their scores, iterated in lockstep.
    test_result :
        Pre-allocated array; the per-key branches write into it **in place**
        starting at position ``ss``.
    ss : int
        Starting write index into ``test_result``.
    bloom_filter :
        Filter used by ``'bl'``.
    Bloom_Filters_opt, thresholds_opt, non_empty_ix_opt :
        Objects used by ``'disjoint_Ada_BF'``.
    bloom_filter_opt :
        Backup filter used by ``'learned_BF'``.
    thres_opt :
        Accepted for signature compatibility; not read by this function.
    bloom_filter_opt_ada, thresholds_opt_ada, k_max_opt :
        Objects used by ``'Ada_BF'``.
    model : str
        One of ``'bl'``, ``'disjoint_Ada_BF'``, ``'learned_BF'``, ``'Ada_BF'``.

    Returns
    -------
    None -- results are printed.

    Raises
    ------
    ValueError
        If ``model`` is not a supported name (previously the message was
        returned as a string).
    """
    if model=='bl':
        n1 = bloom_filter.test(url_negative, single_key=False)
        print('False positive items: ', sum(n1))

    elif model=='disjoint_Ada_BF':
        for score_s, url_s in zip(score_negative, url_negative):
            # Index of the score group: first threshold above the score, minus one.
            ix = min(np.where(score_s < thresholds_opt)[0]) - 1
            if ix >= non_empty_ix_opt:
                test_result[ss] = Bloom_Filters_opt[ix].test(url_s)
            else:
                # Groups below the first non-empty index are treated as a miss.
                test_result[ss] = 0
            ss += 1
        FP_items = sum(test_result) + len(ML_positive)
        FPR = FP_items/len(url_negative)
        print('False positive items: {}; FPR: {}; Size of quries: {}'.format(FP_items, FPR, len(url_negative)))

    elif model=='learned_BF':
        # Fix: the original queried the undefined name `bloom_negative`
        # (NameError); the keys to test are the `url_negative` argument.
        BF_positive = bloom_filter_opt.test(url_negative, single_key = False)
        FP_items = sum(BF_positive) + len(ML_positive)
        print('False positive items: %d' % FP_items)

    elif model=='Ada_BF':
        for score_s, url_s in zip(score_negative, url_negative):
            # Fix: use the Ada-BF thresholds and filter that are passed in,
            # not the disjoint/learned variants' objects.
            ix = min(np.where(score_s < thresholds_opt_ada)[0])
            k = k_max_opt - ix
            test_result[ss] = bloom_filter_opt_ada.test(url_s, k)
            ss += 1
        FP_items = sum(test_result) + len(ML_positive)
        print('False positive items: %d' % FP_items)

    else:
        raise ValueError('invalid model, model must be one of bl,disjoint_Ada_BF,learned_BF,Ada_BF')
    

In [None]:
%%capture
## Collect 800 measurements per running mode (4 models x n_iters = 200 runs each)
n_iters=200


def _test_job(model):
    """Return a zero-argument callable that runs the testing stage for ``model``.

    The arguments are only *bound* here. Writing ``simus(get_testing_time(...))``
    would run the queries once, eagerly, before the tracker starts, and pass
    ``None`` to the benchmark -- so nothing would be measured.
    """
    return partial(get_testing_time, ML_positive, url_negative, score_negative,
                   test_result, ss, bloom_filter, Bloom_Filters_opt,
                   thresholds_opt, non_empty_ix_opt, bloom_filter_opt,
                   thres_opt, bloom_filter_opt_ada, thresholds_opt_ada, k_max_opt, model)


# Sequential runs
df_bl_test=simus(_test_job('bl'),n_iters)
df_dis_ada_test=simus(_test_job('disjoint_Ada_BF'),n_iters)
df_lbf_test=simus(_test_job('learned_BF'),n_iters)
df_ada_test=simus(_test_job('Ada_BF'),n_iters)

# Parallel runs (10 worker processes)
df_bl_test_para=parallel_simus(_test_job('bl'),n_iters,10)
df_dis_ada_test_para=parallel_simus(_test_job('disjoint_Ada_BF'),n_iters,10)
df_lbf_test_para=parallel_simus(_test_job('learned_BF'),n_iters,10)
df_ada_test_para=parallel_simus(_test_job('Ada_BF'),n_iters,10)

In [None]:
# Tag every testing frame with the filter variant it measured.
for _frame, _method in (
    (df_bl_test, 'bloom_filter'),
    (df_dis_ada_test, 'disjoint_Ada_BF'),
    (df_lbf_test, 'learned_bf'),
    (df_ada_test, 'Ada_BF'),
    (df_bl_test_para, 'bloom_filter'),
    (df_dis_ada_test_para, 'disjoint_Ada_BF'),
    (df_lbf_test_para, 'learned_bf'),
    (df_ada_test_para, 'Ada_BF'),
):
    _frame['method'] = _method

In [None]:
# Record how each testing frame was produced: sequentially or via the process pool.
for _frame in (df_bl_test, df_dis_ada_test, df_lbf_test, df_ada_test):
    _frame['running_method'] = 'seq'

for _frame in (df_bl_test_para, df_dis_ada_test_para, df_lbf_test_para, df_ada_test_para):
    _frame['running_method'] = 'parallel'

In [None]:
# All eight frames in this section come from the test stage.
for _frame in (
    df_bl_test, df_dis_ada_test, df_lbf_test, df_ada_test,
    df_bl_test_para, df_dis_ada_test_para, df_lbf_test_para, df_ada_test_para,
):
    _frame['stage'] = 'test'

In [None]:
# Stack sequential frames first, then parallel ones, with a fresh 0..N-1 index.
_test_frames = [
    df_bl_test, df_dis_ada_test, df_lbf_test, df_ada_test,
    df_bl_test_para, df_dis_ada_test_para, df_lbf_test_para, df_ada_test_para,
]
df_all_test = pd.concat(_test_frames, ignore_index=True)

In [None]:
# pd.concat expects ONE iterable of frames; passing the frames as separate
# positional arguments raises TypeError. ignore_index=True gives a unique
# 0..N-1 index (training rows first, then test rows), which the later
# index-aligned merge with the emissions log relies on.
df_all=pd.concat([df_all_train, df_all_test], ignore_index=True)

In [None]:
# Load the emissions log from the working directory.
# NOTE(review): presumably the file the EmissionsTracker instances append to
# (one row per tracker.stop()); if it already held rows from earlier sessions,
# the index-based merge below will be misaligned -- confirm it starts empty.
df_emission=pd.read_csv('emissions.csv')

In [None]:
# Pair each emissions-log row with the measurement row at the same index
# position, tag the hardware, and write the combined table to disk.
df_merged = (
    df_emission
    .merge(df_all, left_index=True, right_index=True)
    .assign(running='cpu')
)
df_merged.to_csv('emissions_bl_cpu.csv', index=False)