### Custom SpaCy Sentiment Model

In [24]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the local `input` directory and print the full path of every file found.
for dirname, _, filenames in os.walk('input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

### 1. Reading Data

In [25]:
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch
import warnings
warnings.filterwarnings("ignore")

In [37]:
# Root folder holding the CSV splits; the trailing slash lets file names be appended directly.
BASE_PATH = 'C:/Users/naras/OneDrive/Documents/'

# `train_df` is the training split, `test_df` is read from the validation file.
train_df, test_df = (
    pd.read_csv(BASE_PATH + csv_name) for csv_name in ('train_data.csv', 'val_data.csv')
)

In [39]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [40]:
# Class balance: number of rows per sentiment label (at most the ten most frequent).
train_df['sentiment'].value_counts().head(10)

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [29]:
# Discard every row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how='any')

### 2. Training Model

In [43]:
def save_model(output_dir, nlp, new_model_name, base_path='C:/Users/naras/OneDrive/Documents/'):
    """Write a trained pipeline to disk underneath ``base_path``.

    Args:
        output_dir: Folder for the model, relative to ``base_path``. When it is
            ``None`` nothing is written.
        nlp: Pipeline object exposing a ``meta`` mapping and a ``to_disk(path)``
            method.
        new_model_name: Value stored in ``nlp.meta["name"]`` before saving.
        base_path: Root folder that ``output_dir`` is resolved against. The
            default is the folder this notebook has always written to.
    """
    # The None check has to come before the path is built: formatting None into
    # the path yields a real-looking ".../None" folder and the check never fires.
    if output_dir is None:
        return

    target_dir = os.path.join(base_path, output_dir)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(target_dir, exist_ok=True)
    nlp.meta["name"] = new_model_name
    nlp.to_disk(target_dir)
    print("Saved model to", target_dir)

In [31]:
# pass model = nlp if you want to train on top of existing model 

def train(train_data, output_dir, n_iter=20, model=None):
    """Train a spaCy NER component on ``train_data`` and save the pipeline.

    Written against the spaCy v2 training API (``create_pipe``,
    ``begin_training`` / ``resume_training`` and
    ``nlp.update(texts, annotations, ...)``).

    Args:
        train_data: Sequence of ``(text, {"entities": [[start, end, label], ...]})``
            pairs. The caller's sequence is left untouched.
        output_dir: Folder handed to ``save_model`` for the trained pipeline.
        n_iter: Number of passes over ``train_data``.
        model: ``None`` to start from a blank English pipeline. Otherwise either
            an already loaded pipeline object or a name/path accepted by
            ``spacy.load``; training then resumes from that pipeline.

    Raises:
        ValueError: If ``train_data`` is empty, because saving the result would
            silently produce an untrained model.
    """
    # Work on a private copy: random.shuffle below reorders the list in place.
    train_data = list(train_data)
    if not train_data:
        raise ValueError("train_data is empty; refusing to save an untrained model")

    if model is None:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    else:
        # Load the model that was asked for. This used to load `output_dir`,
        # which is where the result is written, not where the base model lives.
        if isinstance(model, (str, os.PathLike)):
            nlp = spacy.load(model)
        else:
            nlp = model
        print("Loaded model '%s'" % model)

    # Create the NER component if the pipeline does not have one yet,
    # otherwise fetch the existing one so labels can be added to it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations.
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Disable every other pipe so that only NER is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            # The batch-size schedule is rebuilt each epoch, so it restarts at 4.
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )

            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')

In [32]:
def get_model_out_path(sentiment):
    """Return the relative output folder for the model of one sentiment.

    The comparison ignores case and surrounding whitespace, so the lowercase
    labels used in the dataset ('positive', 'negative', 'neutral') and their
    capitalised spellings resolve to the same folder. Any other value falls
    back to the neutral model folder.

    The folders live under ``models/``, which is the directory the prediction
    cells load the trained models from.
    """
    label = str(sentiment).strip().lower()
    if label == 'positive':
        return 'models/model_pos'
    if label == 'negative':
        return 'models/model_neg'
    return 'models/model_neu'
    

In [33]:
def get_training_data(sentiment, df=None):
    """Build NER training pairs for all rows of one sentiment.

    Args:
        sentiment: Sentiment label to select; matched case-insensitively
            against the ``sentiment`` column.
        df: DataFrame with ``text``, ``selected_text`` and ``sentiment``
            columns. Defaults to the notebook-level ``train_df``.

    Returns:
        A list of ``(text, {"entities": [[start, end, 'selected_text']]})``
        tuples, where ``start``/``end`` are the character offsets of
        ``selected_text`` inside ``text``. Rows whose text or selection is not
        a non-empty string, or whose selection does not occur in the text, are
        skipped.
    """
    if df is None:
        df = train_df

    wanted = str(sentiment).lower()
    rows = df[df['sentiment'].astype(str).str.lower() == wanted]

    train_data = []
    for text, selected_text in zip(rows['text'], rows['selected_text']):
        # Missing values arrive as NaN/None rather than strings.
        if not isinstance(text, str) or not isinstance(selected_text, str):
            continue
        if not selected_text:
            continue
        start = text.find(selected_text)
        # find() returns -1 when the selection is absent; that would otherwise
        # turn into a bogus [-1, len - 1] span.
        if start < 0:
            continue
        end = start + len(selected_text)
        train_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return train_data

### 3. Training for Positive Sentiment

In [46]:
# The sentiment labels in train_df are lowercase, so the training rows have to
# be selected with 'positive'; the capitalised spelling matches no rows at all.
sentiment = 'positive'

train_data = get_training_data(sentiment)
# The output folder is looked up with the capitalised spelling so that a
# case-sensitive lookup still resolves to the positive model folder.
model_path = get_model_out_path(sentiment.capitalize())
train(train_data, model_path, n_iter=30, model=None)

### 4. Training for Negative Sentiment

In [27]:
# Rows are selected with the lowercase label used in train_df.
sentiment = 'negative'

train_data = get_training_data(sentiment)
# The output folder is looked up with the capitalised spelling so that a
# case-sensitive lookup resolves to the negative model folder instead of
# falling through to the neutral one.
model_path = get_model_out_path(sentiment.capitalize())
train(train_data, model_path, n_iter=30, model=None)

Created blank 'en' model



  0%|          | 0/30 [00:00<?, ?it/s][A
  3%|▎         | 1/30 [01:03<30:45, 63.65s/it][A

Losses {'ner': 32448.104163273005}



  7%|▋         | 2/30 [02:05<29:24, 63.03s/it][A

Losses {'ner': 28607.9315576206}



 10%|█         | 3/30 [03:11<28:46, 63.93s/it][A

Losses {'ner': 27474.385467059656}



 13%|█▎        | 4/30 [04:21<28:32, 65.85s/it][A

Losses {'ner': 26524.334456737342}



 17%|█▋        | 5/30 [05:36<28:33, 68.53s/it][A

Losses {'ner': 25111.258534084744}



 20%|██        | 6/30 [06:52<28:18, 70.75s/it][A

Losses {'ner': 24586.790584234863}



 23%|██▎       | 7/30 [08:03<27:07, 70.77s/it][A

Losses {'ner': 23387.781632473416}



 27%|██▋       | 8/30 [09:17<26:22, 71.93s/it][A

Losses {'ner': 22646.530660375927}



 30%|███       | 9/30 [10:40<26:20, 75.28s/it][A

Losses {'ner': 22305.08168228498}



 33%|███▎      | 10/30 [12:04<25:56, 77.81s/it][A

Losses {'ner': 21823.755979928992}



 37%|███▋      | 11/30 [13:27<25:08, 79.40s/it][A

Losses {'ner': 21927.88091885123}



 40%|████      | 12/30 [14:51<24:12, 80.67s/it][A

Losses {'ner': 21536.340820972422}



 43%|████▎     | 13/30 [16:14<23:05, 81.48s/it][A

Losses {'ner': 20526.807224312095}



 47%|████▋     | 14/30 [17:37<21:50, 81.93s/it][A

Losses {'ner': 20443.79155534393}



 50%|█████     | 15/30 [19:05<20:56, 83.77s/it][A

Losses {'ner': 20248.692721637206}



 53%|█████▎    | 16/30 [20:34<19:54, 85.33s/it][A

Losses {'ner': 19645.345982838495}



 57%|█████▋    | 17/30 [22:04<18:45, 86.60s/it][A

Losses {'ner': 19539.19847100303}



 60%|██████    | 18/30 [23:32<17:26, 87.17s/it][A

Losses {'ner': 19249.05934585487}



 63%|██████▎   | 19/30 [25:01<16:03, 87.58s/it][A

Losses {'ner': 18981.656083014263}



 67%|██████▋   | 20/30 [26:30<14:40, 88.06s/it][A

Losses {'ner': 18849.23885227106}



 70%|███████   | 21/30 [27:59<13:14, 88.33s/it][A

Losses {'ner': 18914.11495771508}



 73%|███████▎  | 22/30 [29:22<11:35, 86.88s/it][A

Losses {'ner': 18415.536159851585}



 77%|███████▋  | 23/30 [30:46<10:02, 86.04s/it][A

Losses {'ner': 18237.536580319847}



 80%|████████  | 24/30 [32:11<08:33, 85.50s/it][A

Losses {'ner': 18216.70284592732}



 83%|████████▎ | 25/30 [33:36<07:06, 85.36s/it][A

Losses {'ner': 17520.463836810886}



 87%|████████▋ | 26/30 [34:59<05:38, 84.72s/it][A

Losses {'ner': 17384.685942739416}



 90%|█████████ | 27/30 [36:27<04:17, 85.84s/it][A

Losses {'ner': 17937.176857673243}



 93%|█████████▎| 28/30 [37:52<02:50, 85.35s/it][A

Losses {'ner': 17761.682398309917}



 97%|█████████▋| 29/30 [39:15<01:24, 84.72s/it][A

Losses {'ner': 17306.96567333045}



100%|██████████| 30/30 [40:41<00:00, 81.38s/it][A

Losses {'ner': 17339.981604679495}
Saved model to C:/Users/naras/OneDrive/Documents/models/model_neg





### 5. Training for Neutral Sentiment

In [29]:
# Neutral tweets: resolve the output folder, build the training pairs, then train.
sentiment = 'neutral'

model_path = get_model_out_path(sentiment)
train_data = get_training_data(sentiment)
train(train_data=train_data, output_dir=model_path, n_iter=30, model=None)



  0%|          | 0/30 [00:00<?, ?it/s][A[A

Created blank 'en' model




  3%|▎         | 1/30 [01:37<46:54, 97.04s/it][A[A

Losses {'ner': 7195.225666267608}




  7%|▋         | 2/30 [03:07<44:19, 94.97s/it][A[A

Losses {'ner': 5499.689893892797}




 10%|█         | 3/30 [04:40<42:33, 94.58s/it][A[A

Losses {'ner': 5253.180450727642}




 13%|█▎        | 4/30 [06:19<41:27, 95.66s/it][A[A

Losses {'ner': 4840.117943400269}




 17%|█▋        | 5/30 [07:58<40:20, 96.82s/it][A[A

Losses {'ner': 4771.529730584881}




 20%|██        | 6/30 [09:39<39:10, 97.96s/it][A[A

Losses {'ner': 4937.671657114642}




 23%|██▎       | 7/30 [11:23<38:18, 99.92s/it][A[A

Losses {'ner': 4553.898479257698}




 27%|██▋       | 8/30 [13:03<36:40, 100.03s/it][A[A

Losses {'ner': 4433.269229723165}




 30%|███       | 9/30 [14:42<34:52, 99.65s/it] [A[A

Losses {'ner': 4175.732149091246}




 33%|███▎      | 10/30 [16:21<33:04, 99.25s/it][A[A

Losses {'ner': 4319.977042627618}




 37%|███▋      | 11/30 [17:59<31:23, 99.14s/it][A[A

Losses {'ner': 4262.5512088743435}




 40%|████      | 12/30 [19:47<30:28, 101.60s/it][A[A

Losses {'ner': 3962.243113312233}




 43%|████▎     | 13/30 [21:26<28:34, 100.85s/it][A[A

Losses {'ner': 4010.4470430220126}




 47%|████▋     | 14/30 [23:05<26:44, 100.30s/it][A[A

Losses {'ner': 3953.1776786612486}




 50%|█████     | 15/30 [24:44<24:59, 99.97s/it] [A[A

Losses {'ner': 3880.9906393024717}




 53%|█████▎    | 16/30 [26:24<23:19, 99.94s/it][A[A

Losses {'ner': 3985.996803130422}




 57%|█████▋    | 17/30 [28:04<21:39, 99.95s/it][A[A

Losses {'ner': 3809.7943711952857}




 60%|██████    | 18/30 [29:43<19:56, 99.68s/it][A[A

Losses {'ner': 3853.6725030516095}




 63%|██████▎   | 19/30 [31:22<18:15, 99.62s/it][A[A

Losses {'ner': 3615.8065603175373}




 67%|██████▋   | 20/30 [33:02<16:36, 99.70s/it][A[A

Losses {'ner': 3562.1061863886384}




 70%|███████   | 21/30 [34:43<15:00, 100.05s/it][A[A

Losses {'ner': 3692.2904690680853}




 73%|███████▎  | 22/30 [36:23<13:19, 99.96s/it] [A[A

Losses {'ner': 3524.601827045743}




 77%|███████▋  | 23/30 [38:04<11:41, 100.15s/it][A[A

Losses {'ner': 3516.887666672171}




 80%|████████  | 24/30 [39:43<10:00, 100.07s/it][A[A

Losses {'ner': 3649.5509815319983}




 83%|████████▎ | 25/30 [41:23<08:19, 99.91s/it] [A[A

Losses {'ner': 3519.925278164429}




 87%|████████▋ | 26/30 [43:03<06:39, 99.95s/it][A[A

Losses {'ner': 3447.797180514221}




 90%|█████████ | 27/30 [44:43<05:00, 100.02s/it][A[A

Losses {'ner': 3389.810766968707}




 93%|█████████▎| 28/30 [46:23<03:19, 99.84s/it] [A[A

Losses {'ner': 3446.2343476172073}




 97%|█████████▋| 29/30 [48:03<01:40, 100.13s/it][A[A

Losses {'ner': 3322.996241622846}




100%|██████████| 30/30 [49:44<00:00, 99.49s/it] [A[A

Losses {'ner': 3391.9902497792405}
Saved model to C:/Users/naras/OneDrive/Documents/models/model_neu





In [33]:
# Folder from which the evaluation cell loads the trained per-sentiment models
# (`model_pos`, `model_neg`, `model_neu`).
TRAINED_MODELS_BASE_PATH = 'C:/Users/naras/OneDrive/Documents/models/'

In [34]:
def predict_entities(text, model):
    """Return the part of ``text`` that ``model`` selects.

    Args:
        text: Raw input text.
        model: Callable pipeline; ``model(text)`` must return an object whose
            ``ents`` sequence holds entities with a ``text`` attribute.

    Returns:
        The text of the first predicted entity, or ``text`` unchanged when the
        model predicts no entity.
    """
    doc = model(text)
    # Only the first entity is ever used. Re-locating it with str.find and
    # slicing the input reproduced exactly the entity's own text, so the
    # intermediate de-duplicated span list is not needed.
    if doc.ents:
        return doc.ents[0].text
    return text

### 6. Jaccard score on train data

In [35]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings.

    Both strings are lowercased and split on whitespace; the score is the
    number of shared distinct words divided by the number of distinct words
    in either string.

    Returns:
        A float in ``[0.0, 1.0]``. Two strings without any words are treated
        as identical and score ``1.0``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    # Without this guard the division below is 0 / 0 when neither string
    # contains a word.
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))


if TRAINED_MODELS_BASE_PATH is not None:
    print("Loading Models  from ", TRAINED_MODELS_BASE_PATH)
    # Load the three per-sentiment models in the order positive, negative, neutral.
    model_pos, model_neg, model_neu = (
        spacy.load(TRAINED_MODELS_BASE_PATH + folder)
        for folder in ('model_pos', 'model_neg', 'model_neu')
    )

    # Any sentiment other than 'neutral' / 'positive' is scored with the negative model.
    model_by_sentiment = {'neutral': model_neu, 'positive': model_pos}

    # Sum the per-row Jaccard scores between prediction and labelled selection.
    jaccard_score = 0
    for index, row in tqdm(train_df.iterrows(), total=train_df.shape[0]):
        text = row.text
        chosen_model = model_by_sentiment.get(row.sentiment, model_neg)
        jaccard_score += jaccard(predict_entities(text, chosen_model), row.selected_text)

    print(f'Average Jaccard Score is {jaccard_score / train_df.shape[0]}')

Loading Models  from  C:/Users/naras/OneDrive/Documents/models/




  0%|          | 0/27480 [00:00<?, ?it/s][A[A

  0%|          | 33/27480 [00:00<01:24, 326.70it/s][A[A

  0%|          | 66/27480 [00:00<01:23, 326.71it/s][A[A

  0%|          | 100/27480 [00:00<01:23, 329.62it/s][A[A

  0%|          | 134/27480 [00:00<01:22, 331.74it/s][A[A

  1%|          | 168/27480 [00:00<01:21, 334.14it/s][A[A

  1%|          | 204/27480 [00:00<01:20, 339.56it/s][A[A

  1%|          | 239/27480 [00:00<01:19, 341.62it/s][A[A

  1%|          | 276/27480 [00:00<01:18, 348.68it/s][A[A

  1%|          | 312/27480 [00:00<01:17, 349.95it/s][A[A

  1%|▏         | 346/27480 [00:01<01:20, 336.61it/s][A[A

  1%|▏         | 379/27480 [00:01<01:24, 320.03it/s][A[A

  1%|▏         | 411/27480 [00:01<01:24, 320.02it/s][A[A

  2%|▏         | 445/27480 [00:01<01:22, 325.73it/s][A[A

  2%|▏         | 479/27480 [00:01<01:22, 327.06it/s][A[A

  2%|▏         | 514/27480 [00:01<01:20, 333.58it/s][A[A

  2%|▏         | 548/27480 [00:01<01:20, 332.53it/s

 33%|███▎      | 9073/27480 [00:28<01:06, 276.35it/s][A[A

 33%|███▎      | 9105/27480 [00:28<01:03, 287.37it/s][A[A

 33%|███▎      | 9137/27480 [00:28<01:02, 295.61it/s][A[A

 33%|███▎      | 9170/27480 [00:28<01:00, 304.28it/s][A[A

 33%|███▎      | 9204/27480 [00:28<00:58, 313.31it/s][A[A

 34%|███▎      | 9236/27480 [00:28<00:58, 312.52it/s][A[A

 34%|███▎      | 9269/27480 [00:28<00:57, 316.68it/s][A[A

 34%|███▍      | 9305/27480 [00:28<00:55, 326.72it/s][A[A

 34%|███▍      | 9341/27480 [00:29<00:54, 334.17it/s][A[A

 34%|███▍      | 9378/27480 [00:29<00:52, 343.21it/s][A[A

 34%|███▍      | 9413/27480 [00:29<00:56, 317.09it/s][A[A

 34%|███▍      | 9446/27480 [00:29<00:57, 311.77it/s][A[A

 34%|███▍      | 9478/27480 [00:29<00:58, 306.96it/s][A[A

 35%|███▍      | 9510/27480 [00:29<00:58, 308.93it/s][A[A

 35%|███▍      | 9544/27480 [00:29<00:56, 316.75it/s][A[A

 35%|███▍      | 9577/27480 [00:29<00:56, 318.76it/s][A[A

 35%|███▍      | 9610/27

 65%|██████▍   | 17861/27480 [00:55<00:33, 289.78it/s][A[A

 65%|██████▌   | 17892/27480 [00:55<00:32, 293.85it/s][A[A

 65%|██████▌   | 17923/27480 [00:56<00:32, 297.67it/s][A[A

 65%|██████▌   | 17954/27480 [00:56<00:31, 300.37it/s][A[A

 65%|██████▌   | 17985/27480 [00:56<00:31, 300.55it/s][A[A

 66%|██████▌   | 18018/27480 [00:56<00:30, 307.96it/s][A[A

 66%|██████▌   | 18049/27480 [00:56<00:31, 299.65it/s][A[A

 66%|██████▌   | 18080/27480 [00:56<00:32, 290.76it/s][A[A

 66%|██████▌   | 18110/27480 [00:56<00:32, 287.56it/s][A[A

 66%|██████▌   | 18139/27480 [00:56<00:32, 284.86it/s][A[A

 66%|██████▌   | 18171/27480 [00:56<00:32, 290.58it/s][A[A

 66%|██████▌   | 18203/27480 [00:57<00:31, 297.99it/s][A[A

 66%|██████▋   | 18234/27480 [00:57<00:30, 301.49it/s][A[A

 66%|██████▋   | 18265/27480 [00:57<00:30, 301.31it/s][A[A

 67%|██████▋   | 18298/27480 [00:57<00:29, 307.65it/s][A[A

 67%|██████▋   | 18330/27480 [00:57<00:29, 308.58it/s][A[A

 67%|███

 96%|█████████▌| 26407/27480 [01:23<00:03, 306.40it/s][A[A

 96%|█████████▌| 26440/27480 [01:23<00:03, 307.88it/s][A[A

 96%|█████████▋| 26471/27480 [01:23<00:03, 303.98it/s][A[A

 96%|█████████▋| 26503/27480 [01:23<00:03, 306.84it/s][A[A

 97%|█████████▋| 26535/27480 [01:23<00:03, 309.74it/s][A[A

 97%|█████████▋| 26567/27480 [01:23<00:02, 304.72it/s][A[A

 97%|█████████▋| 26600/27480 [01:24<00:02, 310.11it/s][A[A

 97%|█████████▋| 26635/27480 [01:24<00:02, 318.46it/s][A[A

 97%|█████████▋| 26668/27480 [01:24<00:02, 321.84it/s][A[A

 97%|█████████▋| 26701/27480 [01:24<00:02, 323.29it/s][A[A

 97%|█████████▋| 26734/27480 [01:24<00:02, 319.64it/s][A[A

 97%|█████████▋| 26769/27480 [01:24<00:02, 326.31it/s][A[A

 98%|█████████▊| 26802/27480 [01:24<00:02, 324.51it/s][A[A

 98%|█████████▊| 26838/27480 [01:24<00:01, 332.58it/s][A[A

 98%|█████████▊| 26872/27480 [01:24<00:01, 327.04it/s][A[A

 98%|█████████▊| 26905/27480 [01:24<00:01, 321.22it/s][A[A

 98%|███

Average Jaccard Score is 0.7238812488363049





In [35]:
# Data folder and the folder holding the trained models. Both model-path
# constants point at the same directory.
BASE_PATH = 'C:/Users/naras/OneDrive/Documents/'
MODELS_BASE_PATH = MODELS_BASE_PATH2 = 'C:/Users/naras/OneDrive/Documents/models/'

# `test_df` is read from the validation file, `submission_df` from the test file.
test_df, submission_df = (
    pd.read_csv(BASE_PATH + csv_name) for csv_name in ('val_data.csv', 'test_data.csv')
)

### 7. Load the Trained Models

In [36]:
def predict_entities(text, model):
    """Return the part of ``text`` that ``model`` selects.

    Args:
        text: Raw input text.
        model: Callable pipeline; ``model(text)`` must return an object whose
            ``ents`` sequence holds entities with a ``text`` attribute.

    Returns:
        The text of the first predicted entity, or ``text`` unchanged when the
        model predicts no entity.
    """
    doc = model(text)
    # Only the first entity is ever used. Re-locating it with str.find and
    # slicing the input reproduced exactly the entity's own text, so the
    # intermediate de-duplicated span list is not needed.
    if doc.ents:
        return doc.ents[0].text
    return text

### 8. Predict on the test dataset

In [41]:
selected_texts = []

# Texts with fewer words than this are returned whole instead of being sent
# through a model.
MIN_WORDS_FOR_MODEL = 4

if MODELS_BASE_PATH is not None:
    print("Loading Models  from ", MODELS_BASE_PATH)
    model_pos = spacy.load(MODELS_BASE_PATH + 'model_pos')
    model_neg = spacy.load(MODELS_BASE_PATH2 + 'model_neg')
    # Loaded for completeness; neutral rows are currently returned unchanged.
    model_neu = spacy.load(MODELS_BASE_PATH + 'model_neu')

    for index, row in test_df.iterrows():
        text = row.text
        if not isinstance(text, str):
            # Missing text (NaN) has no words to split; pass it through so the
            # result list stays aligned with the rows of test_df.
            selected_texts.append(text)
        elif row.sentiment == 'neutral' or len(text.split()) < MIN_WORDS_FOR_MODEL:
            selected_texts.append(text)
        elif row.sentiment == 'positive':
            selected_texts.append(predict_entities(text, model_pos))
        else:
            selected_texts.append(predict_entities(text, model_neg))

    # Assign inside the guard: the list is only filled when the models were
    # loaded, and an empty list would not match the length of test_df.
    test_df['selected_text'] = selected_texts

Loading Models  from  C:/Users/naras/OneDrive/Documents/models/


In [42]:
# Show the first ten rows together with the predicted `selected_text` column.
test_df.head(n=10)

Unnamed: 0,textID,text,sentiment,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,exciting
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,Recession
3,01082688c6,happy bday!,positive,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,like
5,726e501993,that`s great!! weee!! visitors!,positive,that`s great!!
6,261932614e,I THINK EVERYONE HATES ME ON HERE lol,negative,I THINK EVERYONE HATES ME ON HERE lol
7,afa11da83f,"soooooo wish i could, but im in school and my...",negative,completely blocked
8,e64208b4ef,and within a short time of the last clue all ...,neutral,and within a short time of the last clue all ...
9,37bcad24ca,What did you get? My day is alright.. haven`...,neutral,What did you get? My day is alright.. haven`...
