In [1]:
# Installing Spacy library

!pip install spacy
!pip install spacy-transformers



In [2]:
import spacy


In [3]:
# Download the large English pipeline; a later cell loads it with spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
     -------------------------------------- 777.4/777.4 MB 4.5 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [4]:
# Importing libraries

import pandas as pd
from datetime import datetime
import spacy
import spacy_transformers

# Storing docs in binary format
from spacy.tokens import DocBin

In [5]:
import torch

# Report whether PyTorch can see a CUDA device (the recorded output for this cell is False).
torch.cuda.is_available()

False

In [6]:
# Load the headerless CSV and give its first two positional columns readable names:
# column 0 holds the sentiment label, column 1 the text.
df = (
    pd.read_csv("all-data.csv", encoding='latin-1', header=None)
    .rename(columns={0: 'Sentiment', 1: 'Text'})
)

In [7]:
# Reproducible 80/20 split: sample 80% of the rows for training (fixed seed),
# then keep every row whose index was not sampled as the held-out set.
train = df.sample(frac=0.8, random_state=25)
test = df.drop(index=train.index)

In [8]:
# Load the pretrained large English pipeline; `document` reads this global `nlp` to build docs.
nlp=spacy.load("en_core_web_lg")


In [9]:
# Turn the training frame into a list of (text, label) tuples, the shape that
# nlp.pipe(..., as_tuples=True) consumes inside `document`.
# Zipping the two columns avoids a slow row-wise DataFrame.apply and a throwaway
# 'tuples' column; the resulting list is the same.
train = list(zip(train['Text'], train['Sentiment']))

In [10]:
# Turn the held-out frame into a list of (text, label) tuples, the shape that
# nlp.pipe(..., as_tuples=True) consumes inside `document`.
# Zipping the two columns avoids a slow row-wise DataFrame.apply and a throwaway
# 'tuples' column; the resulting list is the same.
test = list(zip(test['Text'], test['Sentiment']))

In [11]:
def document(data, pipeline=None):
  """Turn (text, label) pairs into docs carrying one-hot text-categorisation labels.

  Args:
    data: Iterable of ``(text, label)`` tuples; it is handed to
      ``pipeline.pipe(data, as_tuples=True)``.
    pipeline: Object whose ``pipe`` method yields ``(doc, label)`` pairs.
      When omitted, the notebook-level ``nlp`` is looked up at call time, so
      existing ``document(data)`` calls keep working. Passing it explicitly
      protects against ``nlp`` having been rebound by another cell.

  Returns:
    list: The processed docs in input order. Each doc's ``cats`` mapping gets
    the keys ``'positive'``, ``'negative'`` and ``'neutral'`` (in that order),
    exactly one of which is 1 and the others 0. Any label other than
    ``'positive'`` or ``'negative'`` is treated as ``'neutral'``.
  """
  if pipeline is None:
    # Fall back to the global pipeline so the original call style still works.
    pipeline = nlp

  categories = ('positive', 'negative', 'neutral')
  docs = []
  for doc, label in pipeline.pipe(data, as_tuples=True):
    # Everything that is not explicitly positive/negative counts as neutral.
    active = label if label in ('positive', 'negative') else 'neutral'
    for category in categories:
      doc.cats[category] = 1 if category == active else 0
    docs.append(doc)

  return docs

In [12]:
# Time the conversion of the training pairs into spaCy's binary format.
start_time = datetime.now()

# Build labelled docs from the (text, label) training pairs.
train_docs = document(train)

# Pack the docs into a DocBin and write it to train.spacy.
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy")

end_time = datetime.now()

# Report the elapsed wall-clock time for the training set.
elapsed = end_time - start_time
print('Duration: {}'.format(elapsed))

Duration: 0:00:08.911136


In [13]:
# Time the conversion of the held-out pairs into spaCy's binary format.
start_time = datetime.now()

# Build labelled docs from the (text, label) held-out pairs.
test_docs = document(test)

# Pack the docs into a DocBin and write it to valid.spacy.
# NOTE: the training log shows valid.spacy being loaded as a corpus, so this
# held-out split also serves as the validation data during training.
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("valid.spacy")

end_time = datetime.now()

# Report the elapsed wall-clock time for the held-out set.
elapsed = end_time - start_time
print('Duration: {}'.format(elapsed))

Duration: 0:00:02.109677


In [17]:
# Expand the partial ./base_config.cfg into a complete training config saved as ./config.cfg.
!python -m spacy init fill-config ./base_config.cfg ./config.cfg


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [18]:
# Time the full training run launched through the spaCy CLI.
start_time = datetime.now()

# Train using config.cfg and write the results under ./output_updated.
# No --paths overrides are given; the log shows train.spacy and valid.spacy being
# loaded, so those paths presumably come from config.cfg itself.
!python -m spacy train config.cfg --verbose --output ./output_updated

end_time = datetime.now()

# Report the elapsed wall-clock time of the training command.
print('Duration: {}'.format(end_time - start_time))

[i] Saving to output directory: output_updated
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'textcat']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.04        0.00    0.00
  0     200          1.45          8.46       24.52    0.25
  0     400          1.36          5.82       24.42    0.24
  0     600          1.17          2.94       41.54    0.42
  1     800          1.07          1.73       42.86    0.43
  1    1000          1.50          0.97       63.71    0.64
  2    1200          0.57          0.51       63.01    0.63
  3    1400          1.31          0.26       69.91    0.70
  4    1600          1.39          0.14       73.26    0.73
  5    1800          2.67          0.07       71.82    0.72
  6    2000          2.74          0.04       65.78    0.66
  8    2200          3.76          0.03       70.17    0.

[2022-04-10 00:01:23,168] [INFO] Set up nlp object from config
[2022-04-10 00:01:23,174] [DEBUG] Loading corpus from path: valid.spacy
[2022-04-10 00:01:23,175] [DEBUG] Loading corpus from path: train.spacy
[2022-04-10 00:01:23,175] [INFO] Pipeline: ['tok2vec', 'textcat']
[2022-04-10 00:01:23,178] [INFO] Created vocabulary
[2022-04-10 00:01:24,073] [INFO] Added vectors: en_core_web_lg
[2022-04-10 00:01:24,974] [INFO] Finished initializing nlp object
[2022-04-10 00:01:29,350] [INFO] Initialized pipeline components: ['tok2vec', 'textcat']
[2022-04-10 00:01:29,357] [DEBUG] Loading corpus from path: valid.spacy
[2022-04-10 00:01:29,358] [DEBUG] Loading corpus from path: train.spacy
[2022-04-10 00:01:29,403] [DEBUG] Removed existing output directory: output_updated\model-last


In [33]:
# Load the model-best checkpoint from the training output directory and score one sample sentence.
# NOTE: this rebinds the global `nlp` that `document` falls back on, so re-running the
# DocBin conversion cells after this one would use the trained pipeline instead of
# en_core_web_lg.
nlp = spacy.load("output_updated/model-best")
demo = nlp("Adjusted for changes in the Group structure , the Division 's net sales increased by 1.7 % .")
# demo.cats maps each label ('positive', 'negative', 'neutral') to its predicted score.
print(demo.cats)

{'positive': 0.977059543132782, 'negative': 0.0011022855760529637, 'neutral': 0.021838193759322166}


In [20]:
# Re-check CUDA availability; this repeats the check (and the torch import) from an earlier cell.
import torch

torch.cuda.is_available()

False