<a href="https://colab.research.google.com/github/VittorioRossi/NLP-content-moderation/blob/main/content_mod_training_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
%%capture
%%bash
# Install spaCy with the transformers extra, spacy_cleaner, and the medium English model.
# The requirement is quoted so the shell can never interpret the square brackets
# as a filename glob pattern.
pip install "spacy[transformers]"
pip install spacy_cleaner
python3 -m spacy download en_core_web_md

# Model creation

In [2]:
import pandas as pd 
import numpy as np
from tqdm import tqdm
import spacy
import os



# Custom pipeline components

In [3]:
from spacy import Language

def normalization(doc):
  """Rebuild `doc` from the lower-cased lemmas of its usable word tokens.

  Punctuation tokens, whitespace tokens and tokens without a word vector are
  dropped; every remaining token contributes `token.lemma_.lower()`.

  Args:
    doc: the Doc handed over by the preceding pipeline components.

  Returns:
    A new `spacy.tokens.Doc` built on `doc.vocab` whose words are the kept
    lemmas. Every word is followed by one space except the last one. Only the
    vocab, words and spaces are passed to the new Doc; nothing else is copied
    from the input doc.
  """
  word_list = [
    token.lemma_.lower()
    for token in doc
    if token.has_vector and not (token.is_punct or token.is_space)
  ]

  # One "followed by a space" flag per word. The last word never gets one, which
  # also covers single-word docs (the previous `> 1` guard left a trailing space
  # on those). An empty word list simply yields an empty flag list.
  spaces = [True] * len(word_list)
  if spaces:
    spaces[-1] = False

  return spacy.tokens.Doc(doc.vocab, word_list, spaces)

def clean_pipe(docs, *args, **kwargs):
    """Pass-through batch hook: hand back every doc from `docs` unchanged.

    Any extra positional or keyword arguments are accepted and ignored.
    """
    # NOTE(review): this generator never calls `normalization`. If spaCy routes
    # `nlp.pipe(...)` through a component's `.pipe` attribute, batch processing
    # skips the normalization step while a direct `nlp(text)` call applies it.
    # Confirm that this difference is intended.
    yield from docs

# Attach the pass-through generator as the `.pipe` attribute of the component function.
normalization.pipe = clean_pipe

@Language.factory("normalizer-factory")
def normalizer_factory(nlp, name):
  """Factory registered under "normalizer-factory"; returns the `normalization` callable.

  `nlp` and `name` are part of the factory signature but are not used here.
  """
  component = normalization
  return component

# Importing the data

In [4]:
from google.colab import drive

# Mount Google Drive at the absolute location the later cells read from
# ("/content/drive/MyDrive/..."), so the mount point no longer depends on the
# kernel's current working directory.
drive.mount('/content/drive')

Mounted at drive


In [5]:
# Read the merged train and test CSV files from the mounted Drive folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Datasets/text-moderation/merged_train.csv",
        "/content/drive/MyDrive/Datasets/text-moderation/merged_test.csv",
    )
)

In [6]:
# Category names; the integer in the `label` column is used as an index into this list.
LABELS = ["quality", "toxic", "spam"]
# Only the first N_TEST_DOCS rows of the test split are serialized.
N_TEST_DOCS = 10_000

nlp = spacy.load('en_core_web_md')

# Drop the components that are not wanted while preparing the corpora.
# NOTE(review): the tagger of en_core_web_md presumably relies on the shared
# "tok2vec" component; removing it may degrade the tags/lemmas produced here.
# Confirm that this is acceptable.
remove_pipes = ["tok2vec", "parser", "ner"]
for pipe in remove_pipes:
  nlp.remove_pipe(pipe)

nlp.add_pipe("normalizer-factory", after="lemmatizer")


def build_doc_bin(texts, labels):
  """Run `nlp` over `texts` and collect the labelled docs in a DocBin.

  Args:
    texts: iterable of raw texts.
    labels: iterable of integer label ids, aligned with `texts`; each id is an
      index into `LABELS`.

  Returns:
    A `spacy.tokens.DocBin` holding one doc per text. In each doc's `cats` the
    category named by the label is set to 1.0 and every other name in `LABELS`
    to 0.0.

  Raises:
    ValueError: if a label id is not a valid index into `LABELS`.
  """
  texts, labels = list(texts), list(labels)
  doc_bin = spacy.tokens.DocBin()
  annotated = nlp.pipe(zip(texts, labels), as_tuples=True)
  # `total` lets tqdm show a real progress bar instead of a bare counter.
  for doc, label_idx in tqdm(annotated, total=len(texts)):
    # Reject out-of-range ids explicitly; a negative id would otherwise silently
    # index LABELS from the end.
    if not 0 <= label_idx < len(LABELS):
      raise ValueError(f"label {label_idx!r} is not an index into LABELS")
    for position, name in enumerate(LABELS):
      doc.cats[name] = 1.0 if position == label_idx else 0.0
    doc_bin.add(doc)
  return doc_bin


doc_bin_train = build_doc_bin(train_data['text'], train_data['label'])
doc_bin_test = build_doc_bin(
  test_data['text'][:N_TEST_DOCS],
  test_data['label'][:N_TEST_DOCS],
)

doc_bin_train.to_disk('./train.spacy')
doc_bin_test.to_disk('./test.spacy')

18000it [00:52, 345.70it/s] 
10000it [00:30, 331.89it/s]


# Creating config file


In [16]:
# Base training config, written to ./base_config.cfg by this cell.
# Pipeline order: tok2vec, tagger, attribute_ruler, lemmatizer, the custom
# "normalization" component (factory "normalizer-factory") and a
# textcat_multilabel classifier. All components except the classifier are
# listed under frozen_components.
# Boolean values use the lower-case JSON literals (`true` / `false`) throughout.
config = ("""[paths]
train = null
dev = null
vectors = "en_core_web_md"
[system]
gpu_allocator = "pytorch"

[nlp]
lang = "en"
pipeline = ["tok2vec","tagger", "attribute_ruler","lemmatizer", "normalization", "textcat_multilabel"]
batch_size = 1000

[components]

[components.lemmatizer]
source = "en_core_web_md"

[components.tagger]
source = "en_core_web_md"

[components.attribute_ruler]
source = "en_core_web_md"


[components.normalization]
factory = "normalizer-factory"

[components.tok2vec]
source = "en_core_web_md"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 1000, 2500, 2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3


[components.textcat_multilabel]
factory = "textcat_multilabel"

[components.textcat_multilabel.model]
@architectures = "spacy.TextCatCNN.v2"
exclusive_classes = false

[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = true
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true


[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
frozen_components = ["tok2vec", "normalization", "lemmatizer", "tagger", "attribute_ruler"]
max_epochs = 5

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 512

[initialize]
vectors = ${paths.vectors} """)

# Write the config with an explicit encoding so the result does not depend on
# the platform default.
with open("./base_config.cfg", "w", encoding="utf-8") as f:
  f.write(config)

# Training the model

In [17]:
%%bash
# Expand ./base_config.cfg into the complete ./config.cfg.
# NOTE(review): normalizer.py is not written by any cell visible in this
# notebook; presumably it holds the custom "normalizer-factory" code and was
# uploaded separately. Confirm it exists before running.
python -m spacy init fill-config \
  ./base_config.cfg \
  ./config.cfg \
  --code normalizer.py

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


2023-02-12 08:45:23.350407: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-12 08:45:23.350547: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-12 08:45:25.003226: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [18]:
%%bash
# Train from config.cfg: ./train.spacy is the training corpus, ./test.spacy the
# dev corpus, and the results are written to ./output.
python -m spacy train config.cfg \
  --paths.train ./train.spacy \
  --paths.dev ./test.spacy \
  --output ./output \
  --code normalizer.py #--gpu-id 0

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer',
'normalization', 'textcat_multilabel'][0m
[38;5;4mℹ Frozen components: ['tok2vec', 'normalization', 'lemmatizer',
'tagger', 'attribute_ruler'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TEXTC...  TAG_ACC  LEMMA_ACC  CATS_SCORE  SCORE 
---  ------  -------------  -------  ---------  ----------  ------
  0       0           0.24    11.51      76.80        0.00    0.29
  0     200         141.44    11.51      76.80        0.00    0.29
  0     400          84.32    11.51      76.80        0.00    0.29
  0     600          77.41    11.51      76.80        0.00    0.29
  1     800          66.30    11.51      76.80        0.00    0.29
  1    1000          63.61    11.51      76.80        0.00    0.29
  1    1200          53.12    1

2023-02-12 08:45:36.482923: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-12 08:45:36.483036: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-12 08:45:38.021261: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[2023-02-12 08:45:41,708] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2023-02-12 08:45:41,723] [INFO] Pipeline: ['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'no

# Testing and saving

In [84]:
import shutil

# Copy the best checkpoint into the Drive folder; an existing destination
# directory is allowed (dirs_exist_ok=True).
shutil.copytree(
    src='/content/output/model-best',
    dst="/content/drive/MyDrive/Datasets/text-moderation/best-model-v2",
    dirs_exist_ok=True,
)

'/content/drive/MyDrive/Datasets/text-moderation/best-model-v2'

In [20]:
# Load the pipeline stored under /content/output/model-best for evaluation.
nlp_test = spacy.load(
    '/content/output/model-best'
)

In [34]:
# Take the first 1,300 rows of each label (0, 1, 2) and stack them in label order.
balanced_test = pd.concat(
    [test_data[test_data.label == label_id][:1_300] for label_id in (0, 1, 2)]
)

In [35]:
# For every document keep the position of its highest-scoring category, using
# the fixed order quality -> 0, toxic -> 1, spam -> 2.
pred = [
    np.argmax([doc.cats[name] for name in ("quality", "toxic", "spam")])
    for doc in tqdm(nlp_test.pipe(list(balanced_test['text'])))
]

3900it [00:23, 167.54it/s]


In [36]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, confusion_matrix

In [37]:
# Share of balanced-test documents whose predicted index equals the label.
accuracy_score(y_true=balanced_test.label, y_pred=pred)

0.8366666666666667

In [38]:
# F1 score on the balanced test set with average="weighted".
f1_score(
    y_true=balanced_test.label,
    y_pred=pred,
    average="weighted",
)

0.8385272783326296

In [39]:
# Confusion matrix of the balanced-test labels against the predicted indices.
cf_matrix = confusion_matrix(y_true=balanced_test.label, y_pred=pred)
print(cf_matrix)

[[1058  212   30]
 [ 170 1121    9]
 [  93  123 1084]]


### Saving the model

In [44]:
import shutil

# Copy the best checkpoint into the Drive folder under the CNN-v1 name; an
# existing destination directory is allowed (dirs_exist_ok=True).
shutil.copytree(
    src='/content/output/model-best',
    dst="/content/drive/MyDrive/Datasets/text-moderation/best-model-CNN-v1",
    dirs_exist_ok=True,
)

'/content/drive/MyDrive/Datasets/text-moderation/best-model-CNN-v1'

In [45]:
# Score one hand-written sentence and display its category scores.
nlp_test(
    "this ain't a prank, it's torture, she's not confused she's beyond annoyed..."
).cats

{'quality': 0.7039249539375305,
 'toxic': 0.31629300117492676,
 'spam': 0.014851176179945469}