### Imports

In [1]:
import random
import joblib
import pandas as pd

from bloom_filter import BloomFilter
from sklearn.metrics import classification_report, confusion_matrix

### Train the bloom filter

In [2]:
validation_set = pd.read_csv('../data/validation_set_transformed.csv', index_col='user')

In [3]:
model = joblib.load('../data/bot_classifier.joblib')

In [4]:
bot_preds = model.predict(validation_set.drop(columns=['bot']))

In [5]:
validation_set['bot_pred'] = bot_preds
bots_list = validation_set[validation_set['bot_pred'] == True].index.tolist()
print('Thera are', len(bots_list), 'bots predicted')
print(bots_list)

Thera are 66 bots predicted
['Kobott', 'WugBot', 'GoingBatty', 'Donner60', 'Formula Downforce', 'Bottlesofsmoke', 'Hotwiki', 'Soup detective', 'GreenC bot', 'RscprinterBot', 'BotHeroMaster', 'Waxworker', 'SimLibrarian', 'Minorax', 'Fadesga', 'Claireslobotomyemployee', 'Sanglahi86', 'Iveagh Gardens', 'DYKToolsBot', 'ProcBot', 'Bother659', 'GünniX', 'Filmssssssssssss', 'Edward-Woodrow', 'Vanderwaalforces', 'InceptionBot', 'Nubia86', 'DareshMohan', 'JCW-CleanerBot', 'Οἶδα', 'HeyElliott', 'TaxonBot', 'DYKUpdateBot', 'Mistico Dois', 'Drmies', 'Immanuelle', 'DarkAlphabot', 'Timtrent', 'Graham87', 'Billjones94', 'Wcquidditch', 'Chongkian', 'Plantdrew', 'Mathbot', 'SAMBOT2000xp', 'EarwigBot', 'HooptyBot', 'Eteethan', 'Frenchl', 'Pi bot', 'ShelfSkewed', 'Rodw', 'Solidest', 'GhostInTheMachine', 'IngenuityBot', 'IJVin', 'Ethobot', 'Jimmymci234', 'Sammi Brie', 'Botx123', 'Botushali', 'Citation bot', 'Marcocapelle', 'Bot2789', 'Spiderwinebottle', 'Hatchibombotar']


In [6]:
bf = BloomFilter(items_count=len(bots_list), fp_prob=0.1)
for bot in bots_list:
    bf.add(bot)

In [7]:
assert bf.check(random.choice(bots_list))
assert not bf.check('some_user')

In [8]:
str(bf)

"BloomFilter({'size': 322, 'hash_count': 3, 'fp_prob': '0.09723', 'items_stored': 66})"

In [9]:
bf.save('../data/bloom_filter.bf')

In [10]:
validation_set['bf_check'] = validation_set.index.map(lambda user: bf.check(user))

In [11]:
print(classification_report(validation_set['bot_pred'], validation_set['bf_check']))
conf_matrix = confusion_matrix(validation_set['bot_pred'], validation_set['bf_check'])
conf_matrix

              precision    recall  f1-score   support

       False       1.00      0.91      0.95     15399
        True       0.05      1.00      0.09        66

    accuracy                           0.91     15465
   macro avg       0.52      0.96      0.52     15465
weighted avg       1.00      0.91      0.95     15465



array([[14041,  1358],
       [    0,    66]])

In [12]:
total = conf_matrix.sum()
fp = conf_matrix[0][1]
print('FP rate:', fp / total)

FP rate: 0.08781118655027481
