In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
darknet_english = pd.read_csv('darknet_english_cleaned.csv', encoding='latin1')
darknet_english.head()

Unnamed: 0,url,body_stripped,language,general_category,specific category
0,http://3h42ncbglpxvc6e5.onion/disclaimer,Apple Market Stolen Carded Merchandise iPhone ...,en,Marketplace,Financial Crime
1,http://naturetome2v7rpsvy4ba3cve35y6llpfcpomvj...,Talk Gout NatureVault Talk Gout Password requi...,en,Forums,General
2,http://zqktlwiuavvvqqt4ybvgvi7tyo4hjl5xgfuvpdf...,public log Hidden Wiki Help public log From Hi...,en,Wiki,General
3,http://6tn2ejdphoveywwt6pc2sbaez62bytq4vr4xd2f...,DNMAdsDenmark Breaking Market Forums data data...,en,Forums,Narcotics
4,http://mm75rpdxcspr7qee.onion/watch/?v=uQL2vvf...,Atheists Forces Nature Immaterial MGTOW Mirror...,en,Unknown,-


In [3]:
subdataset = darknet_english[:200][['body_stripped','general_category']]
subdataset.dropna(inplace=True)
subdataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 199
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   body_stripped     193 non-null    object
 1   general_category  193 non-null    object
dtypes: object(2)
memory usage: 4.5+ KB


In [4]:
subdataset.body_stripped = subdataset.body_stripped.apply(lambda x: x.lower())
train_set = subdataset[:180]
test_set = subdataset[180:]

le = LabelEncoder()
train_set['general_category'] = le.fit_transform(train_set['general_category'])
test_set['general_category'] = le.fit_transform(test_set['general_category'])

train_set.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['general_category'] = le.fit_transform(train_set['general_category'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['general_category'] = le.fit_transform(test_set['general_category'])


Unnamed: 0,body_stripped,general_category
0,apple market stolen carded merchandise iphone ...,2
1,talk gout naturevault talk gout password requi...,1
2,public log hidden wiki help public log from hi...,6
3,dnmadsdenmark breaking market forums data data...,1
4,atheists forces nature immaterial mgtow mirror...,4


In [5]:
train_set['label'] = train_set['general_category']
train_set.drop(['general_category'], axis=1, inplace=True)

test_set['label'] = test_set['general_category']
test_set.drop(['general_category'], axis=1, inplace=True)

train_set.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set['label'] = train_set['general_category']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['label'] = test_set['general_category']


Unnamed: 0,body_stripped,label
0,apple market stolen carded merchandise iphone ...,2
1,talk gout naturevault talk gout password requi...,1
2,public log hidden wiki help public log from hi...,6
3,dnmadsdenmark breaking market forums data data...,1
4,atheists forces nature immaterial mgtow mirror...,4


In [6]:
#changing the pandas dataframe to dataset
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_set)
test_dataset = Dataset.from_pandas(test_set)

ds_dict = {'train': train_dataset, 'test': test_dataset}
dataset = DatasetDict(ds_dict).remove_columns(["__index_level_0__"])
dataset["test"][0]

{'body_stripped': 'download acres cbfm orarbg free torrent rarbg home movies games music anime apps other books pages login home catalog office selection movies home movies games music anime apps other books pages search movies shows games music anime apps other reset latest movies trending category movies remux movies movies remux movies movies movies movies remux movies movies remux bluray movies bluray movies trending movies movies movies movies movies evilangel tonightsgirlfriend black tushyraw vixen trending show show bluray trending flac quality music kbps music music trending music acres cbfm orarbg torrent acres cbfm orarbg torrent torrent copy magnet thumbnail similar posts name uploaded size acres yifi acres yifi acres amzn webrip tepes orarbg orarbg acres webrip orarbg orarbg acres webrip rarbg orarbg orarbg acres cbfm orarbg orarbg acres cbfm orarbg orarbg more uploader orarbg info hash category movies size added midnight peers seeders leechers update multiple quality avail

In [7]:
#preprocessing the data
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(examples):
    return tokenizer(examples['body_stripped'], truncation=True)

tokenized_darknet = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

In [8]:
import evaluate

accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# training labels
id2label = {1: "Forums", 2: "Marketplace", 4:"Unknown", 6: "Wiki"}
label2id = {"Forums": 1, "Marketplace": 2,"Unknown":4,"Wiki":6}

In [9]:
tokenized_darknet["train"]

Dataset({
    features: ['body_stripped', 'label', 'input_ids', 'attention_mask'],
    num_rows: 180
})

In [10]:
from transformers import create_optimizer, TFAutoModelForSequenceClassification
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_darknet["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

model = TFAutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4, id2label=id2label, label2id=label2id)

tf_train_set = model.prepare_tf_dataset(
    tokenized_darknet["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_darknet["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [13]:
from transformers.keras_callbacks import KerasMetricCallback

model.compile(optimizer=optimizer)
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
callbacks = [metric_callback]

model.fit(tf_train_set, epochs=3, callbacks=callbacks, validation_data=tf_validation_set)

Epoch 1/3


ResourceExhaustedError: Graph execution error:

Detected at node 'tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._3/ffn/Gelu/truediv' defined at (most recent call last):
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\traitlets\config\application.py", line 1046, in launch_instance
      app.start()
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2914, in run_cell
      result = self._run_cell(
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2960, in _run_cell
      return runner(coro)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3185, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3377, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\shrey\AppData\Local\Temp/ipykernel_29320/2328961679.py", line 7, in <module>
      model.fit(tf_train_set, epochs=3, callbacks=callbacks, validation_data=tf_validation_set)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_tf_utils.py", line 1658, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_tf_utils.py", line 712, in run_call_with_unpacked_inputs
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\distilbert\modeling_tf_distilbert.py", line 720, in call
      distilbert_output = self.distilbert(
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_tf_utils.py", line 712, in run_call_with_unpacked_inputs
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\distilbert\modeling_tf_distilbert.py", line 403, in call
      tfmr_output = self.transformer(
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\distilbert\modeling_tf_distilbert.py", line 315, in call
      for i, layer_module in enumerate(self.layer):
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\distilbert\modeling_tf_distilbert.py", line 319, in call
      layer_outputs = layer_module(hidden_state, attn_mask, head_mask[i], output_attentions, training=training)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\distilbert\modeling_tf_distilbert.py", line 276, in call
      ffn_output = self.ffn(sa_output, training=training)  # (bs, seq_length, dim)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\models\distilbert\modeling_tf_distilbert.py", line 230, in call
      x = self.activation(x)
    File "c:\Users\shrey\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\activations.py", line 359, in gelu
      return tf.nn.gelu(x, approximate)
Node: 'tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._3/ffn/Gelu/truediv'
failed to allocate memory
	 [[{{node tf_distil_bert_for_sequence_classification/distilbert/transformer/layer_._3/ffn/Gelu/truediv}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_13293]