In [None]:
!python3 -m pip install spacy==3.7.4
!python3 -m pip install spacy-transformers==1.3.4

In [None]:
!mkdir data 
!wget -P data https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy-Second-Edition./main/chapter_06/data/amazon_food_reviews.csv

In [None]:
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
review_text = 'This Hot chocolate is very good. It has just the right amount of milk chocolate flavor. The price is a very good deal and more than worth it!'
doc = nlp(review_text)
annotation = {"cats": {"positive": 1, "negative": 0}}
example = Example.from_dict(doc, annotation)

In [15]:
import pandas as pd 
import spacy
from spacy.training import Example  

df = pd.read_csv("data/amazon_food_reviews.csv") 
 
df_train = df.sample(frac=0.8,random_state=200) 
df_test = df.drop(df_train.index) 
df_test.to_json("data/df_dev.json") 

nlp = spacy.blank("en") 

TRAIN_EXAMPLES = [] 
for _,row in df_train.iterrows(): 
    if row["positive_review"] == 1: 
        annotation = {"cats": {"positive": 1, "negative": 0}} 
    else: 
        annotation = {"cats": {"negative": 1, "positive": 0}} 
    example = Example.from_dict(nlp(row["text"]), annotation) 
    TRAIN_EXAMPLES.append(example) 

In [5]:
import spacy 
from spacy.training import Example 

nlp = spacy.blank("en") 
textcat = nlp.add_pipe("textcat") 
textcat.initialize(lambda: TRAIN_EXAMPLES, nlp=nlp) 

In [8]:
import spacy 
from spacy.util import minibatch 
import random 
 
nlp = spacy.blank("en") 
textcat = nlp.add_pipe("textcat") 
textcat.initialize(lambda: TRAIN_EXAMPLES, nlp=nlp) 

optimizer = nlp.resume_training() 

for epoch in range(40): 
    random.shuffle(TRAIN_EXAMPLES) 
    batches = minibatch(TRAIN_EXAMPLES, size=200) 
    losses = {} 
    for batch in batches: 
        nlp.update( 
                batch,    
                losses=losses, 
                sgd=optimizer, 
            ) 

    if epoch % 10 == 0: 
            print(epoch, "Losses", losses) 

print(epoch, "Losses", losses) 

0 Losses {'textcat': 3.017495170235634}
10 Losses {'textcat': 0.023405539178156687}
20 Losses {'textcat': 0.0029252148409744105}
30 Losses {'textcat': 5.756459904648903e-05}
39 Losses {'textcat': 3.132792653764982e-10}


In [10]:
text = "Smoke Paprika My mother uses it for allot of dishes, but this particular one, doesn't compare to anything she had.  It is now being used for a decoration on the spice shelf and I will never use it and ruin a dish again. I have tried using just a little bit, thinking it was stronger than her's. And I am a decent cook. But this does not taste like the smoke paprika that I have had in the past.  Sorry I don't recommend this product at all." 
doc = nlp(text) 
print("Example 1", doc.cats) 

text = "Terrible Tasting for me The Teechino Caffeine-Free Herbal Coffee, Mediterranean Vanilla Nut tasted undrinkable to me. It lacked a deep, full-bodied flavor, which Cafix and Pero coffee-like substitute products have. I wanted to try something new, and for me, this substitute coffee drink wasn't my favorite." 
doc = nlp(text) 
print("Example 2", doc.cats) 

text = "Dishwater If I had a choice of THIS or nothing, I'd go with nothing. Of all the K-cups I've tasted - this is the worst. Very weak and if you close your eyes and think really hard about it, maybe you can almost taste cinnamon. Blech." 
doc = nlp(text) 
print("Example 3", doc.cats) 

Example 1 {'positive': 2.0541139747365378e-05, 'negative': 0.9999794960021973}
Example 2 {'positive': 2.677466454770183e-06, 'negative': 0.9999973773956299}
Example 3 {'positive': 0.9998534917831421, 'negative': 0.00014646562340203673}


In [11]:
import spacy 

nlp = spacy.blank("en")  

config = { 
    "model": { 
        "@architectures": "spacy-transformers.TransformerModel.v3", 
        "name": "roberta-base" 
    } 
} 

nlp.add_pipe("transformer", config=config) 
nlp.initialize()  

doc = nlp("Dishwater If I had a choice of THIS or nothing, I'd go with nothing. Of all the K-cups I've tasted - this is the worst. Very weak and if you close your eyes and think really hard about it, maybe you can almost taste cinnamon. Blech.") 
print(doc._.trf_data) 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TransformerData(wordpieces=WordpieceBatch(strings=[['<s>', 'D', 'ish', 'water', 'ĠIf', 'ĠI', 'Ġhad', 'Ġa', 'Ġchoice', 'Ġof', 'ĠTHIS', 'Ġor', 'Ġnothing', ',', 'ĠI', "'d", 'Ġgo', 'Ġwith', 'Ġnothing', '.', 'ĠOf', 'Ġall', 'Ġthe', 'ĠK', '-', 'c', 'ups', 'ĠI', "'ve", 'Ġtasted', 'Ġ-', 'Ġthis', 'Ġis', 'Ġthe', 'Ġworst', '.', 'ĠVery', 'Ġweak', 'Ġand', 'Ġif', 'Ġyou', 'Ġclose', 'Ġyour', 'Ġeyes', 'Ġand', 'Ġthink', 'Ġreally', 'Ġhard', 'Ġabout', 'Ġit', ',', 'Ġmaybe', 'Ġyou', 'Ġcan', 'Ġalmost', 'Ġtaste', 'Ġcinnamon', '.', 'ĠBle', 'ch', '.', '</s>']], input_ids=array([[    0,   495,  1173,  5412,   318,    38,    56,    10,  2031,
            9, 10652,    50,  1085,     6,    38,  1017,   213,    19,
         1085,     4,  1525,    70,     5,   229,    12,   438,  4489,
           38,   348, 29143,   111,    42,    16,     5,  2373,     4,
        12178,  3953,     8,   114,    47,   593,   110,  2473,     8,
          206,   269,   543,    59,    24,     6,  2085,    47,    64,
          818,  5840, 2

In [14]:
import pandas as pd 
import spacy 
from spacy.tokens import DocBin 
 

df = pd.read_csv("data/amazon_food_reviews.csv") 

df_train = df.sample(frac=0.8,random_state=200) 
nlp = spacy.blank("en") 

db = DocBin() 

for _,row in df_train.iterrows(): 
    doc = nlp(row["text"]) 
    if row["positive_review"] == 1: 
        doc.cats = {"positive": 1, "negative": 0} 
    else: 
        doc.cats = {"positive": 0, "negative": 1} 
    db.add(doc) 

db.to_disk("data/train.spacy") 

In [16]:
import spacy 
from spacy.tokens import DocBin 
from pathlib import Path  

def convert_dataset(lang: str, input_path: Path, output_path: Path): 
    nlp = spacy.blank(lang) 
    db = DocBin() 
    df = pd.read_json(input_path) 
    for _,row in df.iterrows(): 
        doc = nlp.make_doc(row["Text"]) 
        if row["positive_review"] == 1: 
            doc.cats = {"positive": 1, "negative": 0} 
        else: 
            doc.cats = {"negative": 1, "positive": 0} 
        db.add(doc) 
    db.to_disk(output_path)  

convert_dataset("en", "data/df_dev.json", "data/dev.spacy") 

In [17]:
!python -m spacy --help

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...

  spaCy Command-line Interface

  DOCS: https://spacy.io/api/cli

Options:
  --install-completion [bash|zsh|fish|powershell|pwsh]
                                  Install completion for the specified shell.
  --show-completion [bash|zsh|fish|powershell|pwsh]
                                  Show completion for the specified shell, to
                                  copy it or customize the installation.
  --help                          Show this message and exit.

Commands:
  apply           Apply a trained pipeline to documents to get predictions.
  assemble        Assemble a spaCy pipeline from a config file.
  benchmark       Commands for benchmarking pipelines.
  convert         Convert files into json or DocBin format for training.
  debug           Suite of helpful commands for debugging and profiling.
  download        Download compatible trained pipeline from the default...
  evaluate        Evaluate a trained pipeline.
 

In [23]:
!python3 -m spacy init config config_without_transformer.cfg --lang "en" --pipeline "textcat"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config_without_transformer.cfg
You can now add your data and train your pipeline:
python -m spacy train config_without_transformer.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [32]:
!python3 -m spacy train config_without_transformer.cfg --paths.train "data/train.spacy" --paths.dev "data/dev.spacy" --output pipeline_without_transformer/ 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Saving to output directory: pipeline_without_transformer[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       43.34    0.43
  0     200         42.36       43.34    0.43
  0     400         35.47       46.51    0.47
  0     600         31.62       57.25    0.57
  0     800         31.34       64.85    0.65
  0    1000         30.58       65.83    0.66
  0    1200         29.32       65.01    0.65
  0    1400         29.71       72.12    0.72
  0    1600         25.19       73.64    0.74
  1    1800          9.54       73.06    0.73
  1    2000         11.48       70.14    0.70
  1    2200         12.42       71.74    0.72
  2    2400          4.45       73.80    0.74
  2    2600          4.42  

In [34]:
import spacy  

nlp = spacy.load("pipeline_without_transformer/model-best")  

text = "Smoke Paprika My mother uses it for allot of dishes, but this particular one, doesn't compare to anything she had.  It is now being used for a decoration on the spice shelf and I will never use it and ruin a dish again. I have tried using just a little bit, thinking it was stronger than her's. And I am a decent cook. But this does not taste like the smoke paprika that I have had in the past.  Sorry I don't recommend this product at all." 
doc = nlp(text) 
print("Example 1", doc.cats) 

text = "Terrible Tasting for me The Teechino Caffeine-Free Herbal Coffee, Mediterranean Vanilla Nut tasted undrinkable to me. It lacked a deep, full-bodied flavor, which Cafix and Pero coffee-like substitute products have. I wanted to try something new, and for me, this substitute coffee drink wasn't my favorite." 
doc = nlp(text) 
print("Example 2", doc.cats) 

text = "Dishwater If I had a choice of THIS or nothing, I'd go with nothing. Of all the K-cups I've tasted - this is the worst. Very weak and if you close your eyes and think really hard about it, maybe you can almost taste cinnamon. Blech." 
doc = nlp(text) 
print("Example 3", doc.cats) 

Example 1 {'positive': 0.6157549023628235, 'negative': 0.3842450678348541}
Example 2 {'positive': 0.659024715423584, 'negative': 0.34097525477409363}
Example 3 {'positive': 0.27787405252456665, 'negative': 0.7221259474754333}


In [36]:
!python3 -m spacy init config config_transformer.cfg --lang "en" --pipeline "textcat" --optimize "accuracy" --gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat
- Optimize for: accuracy
- Hardware: GPU
- Transformer: roberta-base
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config_transformer.cfg
You can now add your data and train your pipeline:
python -m spacy train config_transformer.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [38]:
!python3 -m spacy train config_transformer.cfg --paths.train "data/train.spacy" --paths.dev "data/dev.spacy" --output pipeline_transformer/ --gpu-id 0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;2m✔ Created output directory: pipeline_transformer[0m
[38;5;4mℹ Saving to output directory: pipeline_transformer[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  -------------  ------------  ----------  ------
  0       0           0.00          0.25       43.34    0.43
  1     200           0.89        113.61       82.90    0.83
  2     400          15.00         78.00       85.15    0.85
  3     600          21.33         71.08       83.78    0.84
  4     800           5.65         69.04       78.33

In [39]:
import spacy  

nlp = spacy.load("pipeline_transformer/model-best")  

text = "Smoke Paprika My mother uses it for allot of dishes, but this particular one, doesn't compare to anything she had.  It is now being used for a decoration on the spice shelf and I will never use it and ruin a dish again. I have tried using just a little bit, thinking it was stronger than her's. And I am a decent cook. But this does not taste like the smoke paprika that I have had in the past.  Sorry I don't recommend this product at all." 
doc = nlp(text) 
print("Example 1", doc.cats) 

text = "Terrible Tasting for me The Teechino Caffeine-Free Herbal Coffee, Mediterranean Vanilla Nut tasted undrinkable to me. It lacked a deep, full-bodied flavor, which Cafix and Pero coffee-like substitute products have. I wanted to try something new, and for me, this substitute coffee drink wasn't my favorite." 
doc = nlp(text) 
print("Example 2", doc.cats) 

text = "Dishwater If I had a choice of THIS or nothing, I'd go with nothing. Of all the K-cups I've tasted - this is the worst. Very weak and if you close your eyes and think really hard about it, maybe you can almost taste cinnamon. Blech." 
doc = nlp(text) 
print("Example 3", doc.cats) 

Example 1 {'positive': 0.10354223847389221, 'negative': 0.8964577913284302}
Example 2 {'positive': 0.1040550246834755, 'negative': 0.8959449529647827}
Example 3 {'positive': 0.103188157081604, 'negative': 0.8968119025230408}
