In [6]:
# Installing Spacy library

!pip install spacy
!pip install spacy-transformers



In [7]:
import spacy


In [8]:
!python -m spacy download en_core_web_trf


Collecting en-core-web-trf==3.2.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.2.0/en_core_web_trf-3.2.0-py3-none-any.whl (460.2 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_trf')


In [10]:
import pandas as pd
from datetime import datetime
import spacy
import spacy_transformers
from spacy.tokens import DocBin

In [11]:
import torch

torch.cuda.is_available()

True

In [12]:
# Reading the dataset
df = pd.read_csv("all-data.csv", encoding='latin-1', header = None)
df = df.rename(columns = {0:'Sentiment', 1:'Text'})

In [13]:
train = df.sample(frac = 0.8, random_state = 25)
test = df.drop(train.index)

In [14]:
nlp=spacy.load("en_core_web_trf")


In [15]:
train['tuples'] = train.apply(lambda row: (row['Text'],row['Sentiment']), axis=1)
train = train['tuples'].tolist()

In [16]:
test['tuples'] = test.apply(lambda row: (row['Text'],row['Sentiment']), axis=1)
test = test['tuples'].tolist()

In [17]:
def document(data):
  text = []
  for doc, label in nlp.pipe(data, as_tuples = True):
    if (label=='positive'):
      doc.cats['positive'] = 1
      doc.cats['negative'] = 0
      doc.cats['neutral']  = 0
    elif (label=='negative'):
      doc.cats['positive'] = 0
      doc.cats['negative'] = 1
      doc.cats['neutral']  = 0
    else:
      doc.cats['positive'] = 0
      doc.cats['negative'] = 0
      doc.cats['neutral']  = 1
    text.append(doc)
  
  return(text)

In [18]:
start_time = datetime.now()

#passing the train dataset into function 'document'
train_docs = document(train)

#Creating binary document using DocBin function in spaCy
doc_bin = DocBin(docs = train_docs)

#Saving the binary document as train.spacy
doc_bin.to_disk("train.spacy")
end_time = datetime.now()

#Printing the time duration for train dataset
print('Duration: {}'.format(end_time - start_time))

Duration: 0:04:40.480871


In [23]:
spacy.prefer_gpu()

True

In [19]:
# Calculate the time for converting into binary document for test dataset

start_time = datetime.now()

#passing the test dataset into function 'document'
test_docs = document(test)
doc_bin = DocBin(docs = test_docs)
doc_bin.to_disk("valid.spacy")
end_time = datetime.now()

#Printing the time duration for test dataset
print('Duration: {}'.format(end_time - start_time))

Duration: 0:01:05.209662


In [20]:
!python -m spacy init fill-config ./base_config.cfg ./config.cfg


[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [21]:
start_time = datetime.now()

!python -m spacy train config.cfg --verbose  --gpu-id 0 --output ./output_updated

end_time = datetime.now()

print('Duration: {}'.format(end_time - start_time))

[i] Saving to output directory: output_updated
[i] Using GPU: 0
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['transformer', 'textcat']
[i] Initial learn rate: 0.0
E    #       LOSS TRANS...  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  -------------  ------------  ----------  ------
  0       0           0.00          0.03        0.00    0.00
  3     200           0.05         39.36       79.30    0.79
  6     400           0.11         10.30       86.13    0.86
 10     600           0.05          2.45       86.79    0.87
 13     800           0.03          0.17       84.96    0.85
 17    1000           0.02          0.04       85.46    0.85
 20    1200           0.05          0.32       85.46    0.85
 24    1400           0.04          0.19       85.31    0.85
 27    1600           0.03          0.28       85.34    0.85
 31    1800           0.05          0.64       84.32    0.84
 34    2000           0.03          1.51       84.13    0.84
 38    2200           0.05          1.9

[2022-04-10 01:31:29,200] [INFO] Set up nlp object from config
[2022-04-10 01:31:29,205] [DEBUG] Loading corpus from path: valid.spacy
[2022-04-10 01:31:29,206] [DEBUG] Loading corpus from path: train.spacy
[2022-04-10 01:31:29,206] [INFO] Pipeline: ['transformer', 'textcat']
[2022-04-10 01:31:29,208] [INFO] Created vocabulary
[2022-04-10 01:31:29,209] [INFO] Finished initializing nlp object

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]
Downloading: 100%|##########| 481/481 [00:00<00:00, 485kB/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]
Downloading:   3%|3         | 28.0k/878k [00:00<00:05, 161kB/s]
Downloading:  24%|##3       | 210k/878k [00:00<00:01, 680kB/s] 
Downloading:  66%|######5   | 578k/878k [00:00<00:00, 1.66MB/s]
Downloading: 100%|##########| 878k/878k [00:00<00:00, 1.86MB/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]
Downloading:   1%|          | 4.00k/446k [00:00<00:28, 16.1kB/s]
Downloading:   8%|8         | 36.0k/446k [00:00<00:05

In [22]:
#Test the data from the best model
nlp = spacy.load("output_updated/model-best")
demo = nlp("Adjusted for changes in the Group structure , the Division 's net sales increased by 1.7 % .")
print(demo.cats)

{'positive': 0.9920485615730286, 'negative': 0.0018726777052506804, 'neutral': 0.006078697275370359}
