In [20]:
!pip install spacy==3.6.1
!pip install ml-datasets
!python -m spacy download en_core_web_md

[0mCollecting spacy==3.6.1
  Downloading spacy-3.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting pathy>=0.10.0
  Downloading pathy-0.10.2-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.6/181.6 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (931 kB)
[2

In [1]:
import spacy
from tqdm.auto import tqdm
from spacy.tokens import DocBin
from ml_datasets import imdb
train_data, valid_data = imdb()
nlp = spacy.load("en_core_web_md")

In [2]:
valid_data[0]

("This film was reeeeeeallyyyy bad! Was it meant to be a comedy as I couldn't help laughing the whole way through it? what a waste of two hours! Donald Sutherland was wooden not that he was alone, everyone else was just as bad...and how miscast was linda hamilton???",
 'neg')

In [4]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
nlp.pipe(train_data[0], as_tuples=True)

<generator object Language.pipe at 0x7f297c592f90>

In [7]:
def make_docs(data):
    """
    this will take a list of texts and labels
    and transform them in spacy documents
    data: list(tuple(text, label))
    returns: List(spacy.Doc.doc)
    """
    docs = []
    # nlp.pipe([texts]) is way faster than running
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple,
    # the first one is treated as text
    # the second one will get returned as it is.
#     a = tqdm(nlp.pipe(data, as_tuples=True), total = len(data))
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        if label == 'neg':
            doc.cats["positive"] = 0
            doc.cats["negative"] = 1
        else:
            doc.cats["positive"] = 1
            doc.cats["negative"] = 0
        docs.append(doc)
    return docs

In [8]:
# это для примера, можно взять больше количество текстов
num_texts = 100
train_docs = make_docs(train_data[:num_texts])

  0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy")
# repeat for validation data
valid_docs = make_docs(valid_data[:num_texts])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("valid.spacy")

  0%|          | 0/100 [00:00<?, ?it/s]

на этом месте мы идем в https://spacy.io/usage/training#quickstart, там настраиваем под себя конфиг в разделе quickstart, копируем его руками(!) в base_config.cfg, указываем правильные пути до трейн и вэлид
но после этого некоторые поля возможно придется руками дописать, главное проверить что есть вот такие строчки:

[nlp] <br>
lang = "en" <br>
pipeline = ["textcat"] <br>
batch_size = 1000 <br>

[components] <br>

[components.textcat] <br>
factory = "textcat" <br>

или можно просто взять мой приложенный, он рабочий

In [10]:
# что здесь происходит: мы заполнили base_config, а эта команда на его основе прописывает config.cfg
! python -m spacy init fill-config base_config_361.cfg config_361.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config_361.cfg
You can now add your data and train your pipeline:
python -m spacy train config_361.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
! python -m spacy train config_361.cfg --output ./output # тренькаем модель

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       33.33    0.33
  2     200         34.82       72.92    0.73
  4     400          0.55       74.42    0.74
  6     600          0.11       70.83    0.71
  8     800          0.06       73.13    0.73
 10    1000          0.04       71.99    0.72
 12    1200          0.03       71.99    0.72
 14    1400          0.02       71.99    0.72
 16    1600          0.01       71.99    0.72
 18    1800          0.01       71.99    0.72
 20    2000          0.01       70.83    0.71
[38;5;2m✔ Saved pipeline to output directory[0m
output/model-last


In [12]:
# load thebest model from training
nlp = spacy.load("output/model-best")
text = ""
print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while text != "quit":
    text = input("Please enter example input: ")
    doc = nlp(text)
    print(doc.cats)
    if doc.cats['positive'] >.5:
        print(f"the sentiment is positive")
    else:
        print(f"the sentiment is negative")

type : ‘quit’ to exit
Please enter example input: nice
{'positive': 0.5137437582015991, 'negative': 0.4862562417984009}
the sentiment is positive
Please enter example input: bad review
{'positive': 0.43967217206954956, 'negative': 0.5603277683258057}
the sentiment is negative
Please enter example input: bad
{'positive': 0.460705429315567, 'negative': 0.5392945408821106}
the sentiment is negative
Please enter example input: quit
{'positive': 0.513037919998169, 'negative': 0.48696205019950867}
the sentiment is positive


{'positive': 0.02542225271463394, 'negative': 0.974577784538269}

In [13]:
nlp = spacy.load("output/model-best")

In [15]:
nlp('good review').cats

{'positive': 0.5100855231285095, 'negative': 0.4899144768714905}