In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 262.6 kB/s eta 0:00:49
     --------------------------------------- 0.0/12.8 MB 245.8 kB/s eta 0:00:52
     --------------------------------------- 0.1/12.8 MB 357.2 kB/s eta 0:00:36
     --------------------------------------- 0.2/12.8 MB 612.6 kB/s eta 0:00:21
      -------------------------------------- 0.3/12.8 MB 874.6 kB/s eta 0:00:15
     - -------------------------------------- 0.4/12.8 MB 1.2 MB/s eta 0:00:10
     - -------------------------------------- 0.6/12.8 MB 1.6 MB/s eta 0:00:08
     --- ------------------------------------ 1.0/12.8 MB 2.3 MB/s eta 0:00:06
     --- ----------------------------------

In [9]:
import spacy
import pandas as pd

from tqdm import tqdm
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")
data_processing = False

In [10]:
df = pd.read_csv('IMDB Dataset.csv', encoding="utf8")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [11]:
df = df.sample(frac=0.5)

In [12]:
df['sentiment'].value_counts()

sentiment
positive    12568
negative    12432
Name: count, dtype: int64

In [13]:
data = [tuple(df.iloc[i].values) for i in range(df.shape[0])]

In [14]:
df['sentiment'].value_counts()

sentiment
positive    12568
negative    12432
Name: count, dtype: int64

In [15]:
train_data = data[:20000]
valid_data = data[18000:]

In [16]:
def make_docs(data):
    """
    this will take a list of texts and labels
    and transform them in spacy documents
    data: list(tuple(text, label))
    returns: List(spacy.Doc.doc)
    """
    docs = []
    # nlp.pipe([texts]) is way faster than running
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple,
    # the first one is treated as text
    # the second one will get returned as it is.
    # a = tqdm(nlp.pipe(data, as_tuples=True), total = len(data))
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        if label == 'negative':
            doc.cats["positive"] = 0
            doc.cats["negative"] = 1
        else:
            doc.cats["positive"] = 1
            doc.cats["negative"] = 0
        # we need to set the (text)cat(egory) for each document
        #doc.cats["positive"] = label
        # put them into a nice list
        docs.append(doc)
    return docs

In [17]:
# we are so far only interested in the first 5000 reviews
# this will keep the training time short.
# In practice take as much data as you can get.
# you can always reduce it to make the script even faster.
num_texts = 5000
# first we need to transform all the training data
if data_processing:
    train_docs = make_docs(train_data[:num_texts])

In [18]:

# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy")
# repeat for validation data
valid_docs = make_docs(valid_data[:num_texts//2])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("valid.spacy")

NameError: name 'train_docs' is not defined

на этом месте мы идем в https://spacy.io/usage/training#quickstart, там настраиваем под себя конфиг (textcat), копируем его руками(!) в base_config.cfg, указываем правильные пути до трейн и вэлид


In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./valid.spacy --training.max_epochs 3 --output ./output

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       33.51    0.34
  0     200         52.92       51.47    0.51
  0     400         52.75       33.16    0.33
  0     600         52.05       34.10    0.34
  0     800         53.66       62.39    0.62
  0    1000         54.27       65.70    0.66
  0    1200         46.71       65.82    0.66
  0    1400         54.77       48.86    0.49
  0    1600         60.00       71.00    0.71
  0    1800         36.74       72.68    0.73
  0    2000         51.18       58.01    0.58
  0    2200         50.85       68.17    0.68
  0    2400         52.44       72.79    0.73
  0    2600         43.87       72.51    0.73
  0    2800         48.88       67.70    0.68
  0    3000        

In [4]:
import spacy
# load thebest model from training
nlp = spacy.load("output/model-best")
text = ""
print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while text != "quit":
    text = input("Please enter example input: ")
    doc = nlp(text)
    print(doc.cats)
    if doc.cats['positive'] >.5:
        print(f"the sentiment is positive")
    else:
        print(f"the sentiment is negative")

type : ‘quit’ to exit
{'positive': 0.9841685891151428, 'negative': 0.015831388533115387}
the sentiment is positive
{'positive': 0.023151319473981857, 'negative': 0.9768486618995667}
the sentiment is negative
{'positive': 0.924335777759552, 'negative': 0.075664222240448}
the sentiment is positive
