# **Sentiment Analysis using spacy 3.0**

**1. Installing Libraries**

In [None]:
# Installing Spacy library

!pip install spacy==3.1.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==3.1.1
  Downloading spacy-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 26.1 MB/s 
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp38-cp38-manylinux2014_x86_64.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 70.1 MB/s 
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting thinc<8.1.0,>=8.0.8
  Downloading thinc-8.0.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (671 kB)
[K     |████████████████████████████████| 671 kB 71.2 MB/s 
Installing collected packages: typer, pydantic, thinc, spacy
  Attempting uninstall: typer
    Found existing installation: typer 0.7.0
    Uninstalling typer-0.7.0:
      Successfully uninstalled typer-0.7.0
  Attempting uninstall: pydantic
    Found existing installation

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-transformers
  Downloading spacy_transformers-1.1.9-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.1 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 59.8 MB/s 
[?25hCollecting transformers<4.26.0,>=3.4.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 61.7 MB/s 
[?25hCollecting spacy<4.0.0,>=3.4.0
  Downloading spacy-3.4.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 75.7 MB/s 
Collecting thinc<8.2.0,>=8.1.0
  Downloading thinc-8.1.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (827 kB)
[K     |████████████████████████████████| 827 kB 59.0 MB/s

In [1]:
!pip install spacy-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-transformers
  Downloading spacy_transformers-1.1.9-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.9 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 12.9 MB/s 
[?25hCollecting transformers<4.26.0,>=3.4.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 48.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 64.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 72.6 MB/s 
Inst

In [2]:
# Downloading the spaCy Transformer model "en_core_web_trf"
!python -m spacy download en_core_web_trf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-trf==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.1/en_core_web_trf-3.4.1-py3-none-any.whl (460.3 MB)
[K     |████████████████████████████████| 460.3 MB 27 kB/s 
Installing collected packages: en-core-web-trf
Successfully installed en-core-web-trf-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


**2. Importing Libraries and creating Classes**

In [47]:
# Importing libraries
import pandas as pd
from datetime import datetime
import spacy
import spacy_transformers

# Storing docs in binary format
from spacy.tokens import DocBin


class AirlineDataset():
  def __init__(self):
    self.df = None
    self.path_to_dataset = None
    self.train = None
    self.test = None

  def read_dataset(self,path_to_dataset: str):
    self.path_to_dataset = path_to_dataset
    # Reading the dataset
    self.df = pd.read_csv(self.path_to_dataset, index_col=0)
    print("Dataset Read Successfully!")
  
  def show_dataset(self):
    display(self.df.head(10))

  def extract_df(self):
    return self.df
  
  def preprocess_dataset(self):
    # Removing tags, links, hashtags 
    self.df['text'] = self.df['text'].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    # Making the size of positive and negative comments equal
    df_pos = self.df[self.df['airline_sentiment']=='positive']
    df_updated = self.df[self.df['airline_sentiment']=='negative'].sample(df_pos.shape[0]).append(df_pos)

    self.df = df_updated.copy()
    print("Preprocessing Completed!")
  
  def train_test_split(self):
    #Splitting the dataset into train and test
    self.train = self.df.sample(frac = 0.8, random_state = 25)
    self.test = self.df.drop(self.train.index)
    print("Train Test Split Completed!")
    return self.train, self.test


class My_Spacy_Model(AirlineDataset):
  def __init__(self):
    self.nlp = spacy.load("en_core_web_trf")
    self.path_to_base_config = None
    self.path_to_config = None
  
  def document(self, data):
    self.data = data
    #Creating empty list called "text"
    text = []
    for doc, label in self.nlp.pipe(self.data, as_tuples = True):
      if (label=='positive'):
        doc.cats['positive'] = 1
        doc.cats['negative'] = 0
      else:
        doc.cats['positive'] = 0
        doc.cats['negative'] = 1

      #Adding the doc into the list 'text'
      text.append(doc)
    return(text)

  def binary_document(self, train, test, path_to_train, path_to_test):

    self.train = train
    self.test = test
    self.path_to_train = path_to_train
    self.path_to_test = path_to_test

    #Creating tuples
    self.train['tuples'] = self.train.apply(lambda row: (row['text'],row['airline_sentiment']), axis=1)
    self.train = self.train['tuples'].tolist()
    self.test['tuples'] = self.test.apply(lambda row: (row['text'],row['airline_sentiment']), axis=1)
    self.test = self.test['tuples'].tolist()

    '''
    Calculate the time for converting into binary document for train dataset
    Currently path is set as /content/drive/MyDrive/TrueFoundry/train.spacy 
    This is because I performed training on Google Collab.

    If running locally set path as train.spacy
    '''

    start_time = datetime.now()
    train_docs = self.document(self.train)     #passing the train dataset into function 'document'
    doc_bin = DocBin(docs = train_docs)   #Creating binary document using DocBin function in spaCy
    doc_bin.to_disk(self.path_to_train)   #Saving the binary document as train.spacy
    end_time = datetime.now()

    #Printing the time duration for train dataset
    print('Duration for train: {}'.format(end_time - start_time))

    '''
    Calculate the time for converting into binary document for test dataset
    Currently path is set as /content/drive/MyDrive/TrueFoundry/valid.spacy 
    This is because I performed training on Google Collab.

    If running locally set path as valid.spacy
    '''

    start_time = datetime.now()
    #passing the test dataset into function 'document'
    test_docs = self.document(self.test)
    print(len(test_docs))
    doc_bin = DocBin(docs = test_docs)
    doc_bin.to_disk(self.path_to_test)
    end_time = datetime.now()

    #Printing the time duration for test dataset
    print('Duration for test: {}'.format(end_time - start_time))

    print("Binary Documentation Completed!")

  
  def fill_config_from_base_config(self, path_to_base_config: str ,path_to_config: str):
    '''
    Converting base configuration into full config file
    Currently path is set as /content/drive/MyDrive/TrueFoundry/base_config.cfg and /content/drive/MyDrive/TrueFoundry/config.cfg 
    This is because I performed training on Google Collab.

    If running locally set path as base_config.cfg and config.cfg respectively.
    '''

    self.path_to_base_config = path_to_base_config
    self.path_to_config = path_to_config

    !python -m spacy init fill-config {self.path_to_base_config} {self.path_to_config}

  
  def train_model(self, path_to_output_model):
    '''
    Calculating the time for training the model
    Currently path is set as /content/drive/MyDrive/TrueFoundry/config.cfg &
    /content/drive/MyDrive/TrueFoundry/output_updated
    This is because I performed training on Google Collab.

    If running locally set path as config.cfg and output_updated respectively.
    '''

    self.path_to_output_model = path_to_output_model

    start_time = datetime.now()
    # To train the model. Enabled GPU and storing the model output in folder called output_updated
    !python -m spacy train {self.path_to_config} --verbose --gpu-id 0 --output {self.path_to_output_model}
    end_time = datetime.now()
    
    #Printing the time taken for training the model
    print('Duration: {}'.format(end_time - start_time))



**3. Reading Dataset and Pre-processing**

In [48]:
model = My_Spacy_Model()

In [49]:
path_to_dataset = "/content/drive/MyDrive/TrueFoundry/airline_sentiment_analysis.csv"

model.read_dataset(path_to_dataset=path_to_dataset)

Dataset Read Successfully!


In [50]:
model.show_dataset()

Unnamed: 0,airline_sentiment,text
1,positive,@VirginAmerica plus you've added commercials t...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
8,positive,"@virginamerica Well, I didn't…but NOW I DO! :-D"
9,positive,"@VirginAmerica it was amazing, and arrived an ..."
11,positive,@VirginAmerica I &lt;3 pretty graphics. so muc...
12,positive,@VirginAmerica This is such a great deal! Alre...
13,positive,@VirginAmerica @virginmedia I'm flying your #f...


In [51]:
df = model.extract_df()
df.head()

Unnamed: 0,airline_sentiment,text
1,positive,@VirginAmerica plus you've added commercials t...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [52]:
# Removing tags, links, hashtags etc.
model.preprocess_dataset()
model.show_dataset()

Preprocessing Completed!


  self.df['text'] = self.df['text'].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)


Unnamed: 0,airline_sentiment,text
2307,negative,u Cancelled Flighted my flight from IAD to JA...
11811,negative,issues are not with people who r nice or stor...
13952,negative,several hrs Late Flight and 140 characters wo...
7300,negative,are yall going bankrupt or is inflation just ...
11150,negative,there has been NO one at the Yuma ticket coun...
163,negative,is the website down?
1537,negative,good try but got her here safer and sooner
986,negative,really? Someone called in sick and then some...
5378,negative,too long to wait for bags when they could hav...
13943,negative,pathetic service


**4. Splitting the processed dataset**

In [53]:
# Splitting Dataset into train and test
train, test = model.train_test_split()

Train Test Split Completed!


In [54]:
print(train.shape, test.shape)


(3781, 2) (945, 2)


In [55]:
test['airline_sentiment'].value_counts()

positive    480
negative    465
Name: airline_sentiment, dtype: int64

In [56]:
train['airline_sentiment'].value_counts()

negative    1898
positive    1883
Name: airline_sentiment, dtype: int64

**5. Converting train and test dataset into binary format**

In [57]:
path_to_train = "/content/drive/MyDrive/TrueFoundry/train.spacy"
path_to_test = "/content/drive/MyDrive/TrueFoundry/valid.spacy"

model.binary_document(train,test, path_to_train=path_to_train, path_to_test=path_to_test)

Duration for train: 0:08:05.422927
945
Duration for test: 0:01:48.504463
Binary Documentation Completed!


**6. Coverting the base config file**

*   Download the base configuration file from [here](https://spacy.io/usage/training#quickstart).

  Before downloading, we need to select **textcat** under components as this is a 
  classification problem. I selected hardware **GPU** as I used google colab and 
  selected **accuracy**.
*   We can open the base configuration file in the notepad and needs to specify the path for `train = “train.spacy"` and `dev = "test.spacy"`.



  

In [41]:
path_to_base_config = '/content/drive/MyDrive/TrueFoundry/base_config.cfg'
path_to_config = '/content/drive/MyDrive/TrueFoundry/config.cfg'

model.fill_config_from_base_config(path_to_base_config, path_to_config)

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


**6. Train the model**

In [None]:
path_to_output_model = '/content/drive/MyDrive/TrueFoundry/output_updated'

model.train_model(path_to_output_model)

[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/TrueFoundry/output_updated[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-12-27 09:01:21,589] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-12-27 09:01:21,599] [DEBUG] Loading corpus from path: /content/drive/MyDrive/TrueFoundry/valid.spacy
DEBUG:spacy:Loading corpus from path: /content/drive/MyDrive/TrueFoundry/valid.spacy
[2022-12-27 09:01:21,600] [DEBUG] Loading corpus from path: /content/drive/MyDrive/TrueFoundry/train.spacy
DEBUG:spacy:Loading corpus from path: /content/drive/MyDrive/TrueFoundry/train.spacy
[2022-12-27 09:01:21,600] [INFO] Pipeline: ['transformer', 'textcat']
INFO:spacy:Pipeline: ['transformer', 'textcat']
[2022-12-27 09:01:21,603] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-12-27 09:01:21,604] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
Downloading: 100% 481/481 [00:00<00:00, 498kB/s]
Downloading: 100% 899k/899k

**7. Testing the model**

In [None]:
# Loading the best model from output_updated folder
# Currently path is set as /content/drive/MyDrive/TrueFoundry/output_updated/model-best because I performed training on Google Collab.
# If running locally set path as output_updated/model-best.

nlp = spacy.load("/content/drive/MyDrive/TrueFoundry/output_updated/model-best")

In [None]:
text = "Australia’s largest airline temporarily lays off 2,500 employees"
demo = nlp(text)
print(demo.cats)

{'positive': 0.0026326205115765333, 'negative': 0.9973674416542053}


In [None]:
text = "@VirginAmerica plus you've added commercials to the experience... tacky."
demo = nlp(text)
print(demo.cats)
sent = demo.cats
max(sent, key=sent.get)

{'positive': 0.9922516942024231, 'negative': 0.007748330477625132}


'positive'

In [None]:
text = "flight was good. Descent staff!"
demo = nlp(text)
print(demo.cats)
sent = demo.cats
max(sent, key=sent.get)

{'positive': 0.9237127900123596, 'negative': 0.07628725469112396}


'positive'