In [56]:
%load_ext autoreload
%autoreload 2


import pandas as pd
import spacy
import random
import numpy as np
import time
import json

from spacy.training import Example
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL


from os import getcwd
from os.path import join, dirname

PATH_REPO = dirname(getcwd())
PATH_UTILS = join(PATH_REPO, 'utils')
PATH_DATA = join(PATH_REPO, 'data')
PATH_MODELS = join(PATH_REPO, 'models')


import sys
sys.path.append(PATH_UTILS)

pd.set_option('max_colwidth', None)

from training import create_textcat_dataset, make_textcat_predictions
from metrics import pipeline_report
from sklearn.metrics import precision_recall_fscore_support as cat_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
# Load the labelled tweets. The training rows are shuffled with a fixed seed so
# that a fresh kernel reproduces the same row order.
SHUFFLE_SEED = 42

train = (
    pd.read_csv(join(PATH_DATA, "training_set.csv"), engine='python')
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
    # NOTE(review): fillna(0) applies to every column, so a missing tweet becomes
    # the text "0" after the str cast below -- confirm that this is intended.
    .fillna(0)
    .astype({'label': int, 'text': str})
)
# NOTE(review): the recorded output of this cell shows mojibake in some training
# tweets (e.g. "sisâ€¦"), so the CSV text was probably mis-decoded somewhere
# upstream -- verify the file's encoding.

# The test set stores labels as strings; map 'Gratitude' to 1 and anything else to 0.
test = pd.read_csv(join(PATH_DATA, 'test_set.csv'), engine="python")
test['label'] = np.where(test['label'] == 'Gratitude', 1, 0)
train

Unnamed: 0,text,label
0,USER I feel for the caretakers at my college who spent the Christmas holidays converting their office into our COVID testing site and laying a new floor so it is washable and cutting out screens etc and the SMT planning it all. Did the government not check mass test would be ok first?,0
1,"Everything and More, Right Here Daniel trains to maintain his fitness and prevent Injuries. He like the convenient location of Honiton LED and finds the staff very friendly. Daniel also felt safe returning after the COVID closure as the gym is clean and tidy. Good work Daniel",1
2,My dad had the vaccination. He wasn't sure. The practice called and doctor turned up at their place today and did it. Impressive! Thank you NHS,1
3,Sign up to receive Coronavirus (COVID19) email updates from Johns Hopkins Medicine. You'll receive the latest news and resources from their experts,0
4,USER USER USER READ THIS BMJ. Studies show your friend will not be able to go no were because vaccinated will spread delta just as unvaccinated. Yes vaccinated might not end up in hospital but unvaccinated who have built up immunity are in the same position.,0
...,...,...
16762,I realize you have nothing to do when the day come. You have more motion than the ocean. Give it ah rest sisâ€¦ðŸ˜‚ðŸ˜‚ðŸ˜‚ TrinidadandTobago Trinidad politics comedian,0
16763,It would be good to share this around Twitter as much as we can folks. Thank you. COVIDãƒ¼19 CovidVaccine mythbusting BBC News - Covid-19: Breaking down Asian vaccine myths in Lancashire,1
16764,Entry Level - Clinical Research Physician - NHS Doctors Needed! - London (Greater) (GB) - Barrington James,0
16765,I remember a surgical reg during the first wave who was redeployed to a covid ward. When asked about the GSF status for a patient - he confidently blurted out 13 ðŸ˜‚ðŸ˜‚,0


In [58]:
# Row-aligned plain Python lists: tweet texts and their integer labels.
tweets = train['text'].tolist()
labels = train['label'].tolist()
len(tweets)

16767

In [59]:
def train_classifier(examples, nlp, n):
    """Add a single-label ``textcat`` pipe to ``nlp`` and train it on ``examples``.

    Parameters
    ----------
    examples : list
        ``(text, annotations)`` pairs. Each pair is turned into a training
        example with ``Example.from_dict(nlp.make_doc(text), annotations)``.
        The list is only read; its order is left untouched.
    nlp
        Pipeline object that receives the new ``textcat`` component. It is
        modified in place and also returned.
    n
        Not used by this function; kept so existing call sites keep working.
        NOTE(review): intent unclear (epoch count? sample size?) -- confirm
        against the callers before wiring it in.

    Returns
    -------
    The same ``nlp`` object after 64 shuffled passes over the examples, with
    one ``nlp.update`` call per example.
    """
    config = {
        "threshold": 0.7,
        "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
    }
    textcat = nlp.add_pipe("textcat", config=config)
    textcat.add_label("gratitude")
    textcat.add_label("not_gratitude")

    # Tokenise once and reuse the Example objects on every pass; rebuilding a
    # Doc per example per pass repeats identical work 64 times.
    train_examples = [
        Example.from_dict(nlp.make_doc(text), label) for text, label in examples
    ]
    textcat.initialize(lambda: train_examples, nlp=nlp)

    # Shuffle a private list so the caller's `examples` keeps its order.
    shuffled = list(train_examples)
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.resume_training()
        for _ in range(64):
            random.shuffle(shuffled)
            for example in shuffled:
                nlp.update([example], sgd=optimizer)

    return nlp

def test_classifier(nlp, test):
    
    scores = make_textcat_predictions(test, nlp)
    predictions = []
    for score in scores:
        if score > 0.7:
            predictions.append(1)
        else:
            predictions.append(0)
    true = test['label'].values.tolist()
    precision, recall, fscore, support = cat_score(true, predictions, average='macro')
    
    return precision, recall, fscore

In [60]:
# Build the pipeline: pretrained English model plus a fresh single-label textcat.

# Seed the random number generators before the new component's weights are
# initialised, so a fresh-kernel run of the notebook trains the same way.
TRAINING_SEED = 0
spacy.util.fix_random_seed(TRAINING_SEED)

nlp = spacy.load("en_core_web_lg")

config = {
    "threshold": 0.8,
    "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
}
textcat = nlp.add_pipe("textcat", config=config)
textcat.add_label("gratitude")
textcat.add_label("not_gratitude")

# (text, annotations) pairs produced by the project helper, then wrapped as
# spaCy Example objects for initialisation.
TRAIN_DATA = create_textcat_dataset(tweets, labels)
train_examples = [
    Example.from_dict(nlp.make_doc(text), label) for text, label in TRAIN_DATA
]

textcat.initialize(lambda: train_examples, nlp=nlp)

In [61]:
# Train only the textcat component: `epochs` shuffled passes over the data,
# one optimizer step per example.
epochs = 64

# Tokenise once up front; the Example objects are reused on every pass instead
# of rebuilding a Doc for each tweet in each epoch.
epoch_examples = [
    Example.from_dict(nlp.make_doc(text), label) for text, label in TRAIN_DATA
]

with nlp.select_pipes(enable="textcat"):
    optimizer = nlp.resume_training()
    for _ in range(epochs):
        # Shuffle the local list so TRAIN_DATA itself keeps its order.
        random.shuffle(epoch_examples)
        for example in epoch_examples:
            nlp.update([example], sgd=optimizer)

In [62]:
# Score every test row with the trained pipeline and show the scores next to
# the true labels.
scores = make_textcat_predictions(test, nlp)
test = test.assign(score=scores)
test

Unnamed: 0.1,Unnamed: 0,text,label,score
0,0,USER Thanks lmao.,1,0.999852
1,1,USER : G’day from Down Under USER Thanks for your interest in the Territory. I’m the Chief Minister. Below are a few facts abo…,1,0.999833
2,2,USER thanks!,1,0.999908
3,3,USER Thanks. Was a 45 minute wait. Pretty frustrating especially on the back of Friday’s difficulties.,1,0.999897
4,4,"Thanks, Siri. Nailed it…",1,0.999890
...,...,...,...,...
516,520,USER : Thanks to USER For their countless efforts for humanity MillionSmiles_Balochistan,1,0.999910
517,521,video Brazil: pomegranate juice volcano 🌋 thanks miss osint USER for the info ifb travel fruitycommunity,1,0.999872
518,522,USER : Thanks USER ! Looking forward to presenting our Caregiver-Centered Care education for the health workforce who in…,1,0.999917
519,523,"USER Thanks Abigail. My comments are said in the spirit of hope that people realize how this platform can distort so much. That cult or mob mentality is the root of all evil, IMO. Thanks for always being a true friend and having my back. You are an amazing friend here. 💙",1,0.999854


In [63]:
# Binarise the scores: 1 when the score is strictly above 0.6, otherwise 0.
# NOTE(review): this cut-off differs from the 0.7 used in test_classifier and
# the 0.8 in the textcat config -- confirm which value is intended.
predictions = [1 if score > 0.6 else 0 for score in scores]
test['prediction'] = predictions

In [64]:
# Persist the trained pipeline under the repository's models directory.
model_dir = join(PATH_MODELS, 'gratitude_categorizer_3')
nlp.to_disk(model_dir)

In [65]:
# Compare the thresholded predictions with the true test labels.
# NOTE(review): in the recorded run the reported 'accuracy' (0.4646) equals
# TP / (TP + FP + FN) = 184 / 396, not (TP + TN) / total = 309 / 521 (about 0.593)
# -- check how pipeline_report computes it.
true = test['label'].tolist()
pipeline_report(true, predictions)

{'true positives': 184,
 'true negatives': 125,
 'false positives': 196,
 'false negatives': 16,
 'recall': 0.92,
 'precision': 0.4842105263157895,
 'f1': 0.6344827586206897,
 'accuracy': 0.46464646464646464}