### Imports

In [1]:
import random
import sys

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

sys.path.append('../')
from bloom_filter import BloomFilter
from classify_data.helpers import set_seed, transform_data

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/remote_ssh_user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/remote_ssh_user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Fixed seed for the notebook; also passed as `random_state` to the
# train/validation split further down.
SEED = 111
# Project helper from classify_data.helpers — presumably seeds the random
# number generators used here; TODO confirm which ones it covers.
set_seed(SEED)

### Build the Bloom filter

In [3]:
# Read the JSON Lines dump (one record per line), keep only the rows whose
# `type` column equals 'edit', and renumber the index from zero.
dataset = (
    pd.read_json('../data/dataset_2weeks.jsonl', lines=True)
    .loc[lambda events: events['type'] == 'edit']
    .reset_index(drop=True)
)

In [4]:
# Apply the project's transform_data helper, then split the result with
# stratification on the `bot` column. Only the 40% test part is kept (as the
# validation set); the other part is discarded.
transformed_dataset = transform_data(dataset)
_, validation_set = train_test_split(
    transformed_dataset,
    test_size=0.4,
    random_state=SEED,
    stratify=transformed_dataset['bot'],
)

In [5]:
# Load the previously saved bot classifier from disk.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading — only load artifacts that come from a trusted source.
model = joblib.load('../data/bot_classifier.joblib')

In [6]:
# Remove the `bot` label column so the classifier only receives the remaining
# columns, then predict one label per validation row.
validation_features = validation_set.drop(columns=['bot'])
bot_preds = model.predict(validation_features)

In [7]:
# Collect the index labels of the validation rows predicted to be bots.
# The predictions are coerced to a boolean mask first: if the classifier
# returned integer 0/1 labels, pandas would treat them as *positions* and
# silently return the labels at positions 0 and 1 over and over instead of
# filtering. For predictions that are already boolean this is a no-op.
bots_list = validation_set.index[np.asarray(bot_preds, dtype=bool)].tolist()

In [8]:
# Create a filter sized for the number of predicted bots and insert each one.
# `fp_prob` is presumably the target false-positive probability (10% here) —
# TODO confirm against the BloomFilter implementation.
bf = BloomFilter(
    items_count=len(bots_list),
    fp_prob=0.1,
)
for bot_name in bots_list:
    bf.add(bot_name)

In [9]:
# Sanity checks on the populated filter: a randomly picked entry that was
# added must be reported as present, and a name that was never added is
# expected to be reported as absent.
# NOTE(review): Bloom filters can return false positives, so the second
# check is probabilistic and may fail spuriously — confirm this is acceptable.
known_bot = random.choice(bots_list)
assert bf.check(known_bot)
assert not bf.check('some_user')

In [10]:
# Write the populated filter to ../data/bloom_filter.bf via its save method.
bf.save('../data/bloom_filter.bf')