In [1]:
import pandas as pd
import os
import csv
import logging
import numpy as np
import datetime as datetime
import pickle
from tqdm import tqdm

from happytransformer import HappyTextClassification # https://happytransformer.com/
from sklearn.metrics import confusion_matrix
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Load the article frame that the sentiment models below will label, then
# show how many rows it holds.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle('./data/df_processed.pickle')
df.shape[0]

365200

In [3]:
# Preview the first and last rows of the frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same two-row frame.
pd.concat([df.head(1), df.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...


# Distilbert
https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english

In [4]:
# Binary sentiment classifier (num_labels=2) loaded through happytransformer.
distilbert = HappyTextClassification("DISTILBERT", "distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)

# Label every headline. Results are collected in a list and written to the
# frame in one assignment, which avoids one DataFrame write per row.
#   ""         -> title is missing
#   "TOO_LONG" -> title has 512 or more space-separated words
# NOTE(review): the 512 guard counts whitespace-separated words, while the
# model's limit is presumably expressed in sub-word tokens — confirm the guard
# is strict enough for long titles.
distilbert_labels = []
# Passing total= lets tqdm show a percentage and ETA instead of a bare counter.
for title in tqdm(df['title'], total=len(df)):
    if pd.isnull(title):
        distilbert_labels.append("")
    elif len(title.split(" ")) < 512:
        distilbert_labels.append(distilbert.classify_text(title).label)
    else:
        distilbert_labels.append("TOO_LONG")

df['distilbert'] = distilbert_labels

08/27/2021 20:29:36 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [5]:
# Quick sanity check: classify a few hand-written sentences and print the
# full result objects (label and score).
for sample_text in ("I am blue", "I love candy", "I hate candy"):
    print(distilbert.classify_text(sample_text))

TextClassificationResult(label='POSITIVE', score=0.9037488102912903)
TextClassificationResult(label='POSITIVE', score=0.9998459815979004)
TextClassificationResult(label='NEGATIVE', score=0.9976422190666199)


# Finbert
https://huggingface.co/ProsusAI/finbert

In [10]:
# Three-class sentiment classifier (num_labels=3) loaded through happytransformer.
finbert = HappyTextClassification("FINBERT", "ProsusAI/finbert", num_labels=3)

# Label every headline. Results are collected in a list and written to the
# frame in one assignment, which avoids one DataFrame write per row.
#   ""         -> title is missing
#   "TOO_LONG" -> title has 512 or more space-separated words
# NOTE(review): the 512 guard counts whitespace-separated words, while the
# model's limit is presumably expressed in sub-word tokens — confirm the guard
# is strict enough for long titles.
finbert_labels = []
# Passing total= lets tqdm show a percentage and ETA instead of a bare counter.
for title in tqdm(df['title'], total=len(df)):
    if pd.isnull(title):
        finbert_labels.append("")
    elif len(title.split(" ")) < 512:
        finbert_labels.append(finbert.classify_text(title).label)
    else:
        finbert_labels.append("TOO_LONG")

df['finbert'] = finbert_labels

08/27/2021 21:16:51 - INFO - happytransformer.happy_transformer -   Using model: cuda
365200it [1:06:07, 92.05it/s] 


# Roberta
https://huggingface.co/siebert/sentiment-roberta-large-english

In [9]:
# Binary sentiment classifier (num_labels=2) loaded through happytransformer.
roberta = HappyTextClassification("ROBERTA", "siebert/sentiment-roberta-large-english", num_labels=2)

# Label every headline. Results are collected in a list and written to the
# frame in one assignment, which avoids one DataFrame write per row.
#   ""         -> title is missing
#   "TOO_LONG" -> title has 512 or more space-separated words
# NOTE(review): the 512 guard counts whitespace-separated words, while the
# model's limit is presumably expressed in sub-word tokens — confirm the guard
# is strict enough for long titles.
roberta_labels = []
# Passing total= lets tqdm show a percentage and ETA instead of a bare counter.
for title in tqdm(df['title'], total=len(df)):
    if pd.isnull(title):
        roberta_labels.append("")
    elif len(title.split(" ")) < 512:
        roberta_labels.append(roberta.classify_text(title).label)
    else:
        roberta_labels.append("TOO_LONG")

df['roberta'] = roberta_labels

08/28/2021 16:56:33 - INFO - happytransformer.happy_transformer -   Using model: cuda
365200it [1:54:58, 52.94it/s]


### Save/Load Sentiment Rankings

In [97]:
# Set to True right after running the three labelling cells above to persist
# their results; leave False to reload previously saved labels instead of
# re-running the models.
process_sentiment = False

if process_sentiment:
    df.to_pickle('./data/df_sentiment.pickle')
else:
    # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
    df = pd.read_pickle('./data/df_sentiment.pickle')
    print(len(df))

# Upper-case the FinBERT labels on BOTH paths. Previously this ran only after
# loading, so a fresh labelling run left the column un-normalized and it would
# not line up with the upper-case label lists used by the confusion matrices
# and the cross-model agreement check further down. The operation is
# idempotent, so re-applying it to already upper-cased labels is harmless.
df['finbert'] = df['finbert'].str.upper()

365200


In [98]:
# Preview the first and last rows, now including the three sentiment columns.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same two-row frame.
pd.concat([df.head(1), df.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,distilbert,finbert,roberta
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...,POSITIVE,NEUTRAL,POSITIVE
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...,NEGATIVE,NEGATIVE,NEGATIVE


In [102]:
# Keep only rows that received a real DistilBERT label, dropping the
# placeholder values for missing titles ("") and over-long titles.
unlabelled_markers = ["", "TOO_LONG", "too_long"]
df_cm = df[~df['distilbert'].isin(unlabelled_markers)]

In [103]:
# Label distribution per model; a blank line separates the first two tables.
for model_col in ('distilbert', 'finbert'):
    print(df_cm[model_col].value_counts(), '\n')
print(df_cm['roberta'].value_counts())

NEGATIVE    233689
POSITIVE    131425
Name: distilbert, dtype: int64 

NEUTRAL     198364
NEGATIVE    105979
POSITIVE     60771
Name: finbert, dtype: int64 

POSITIVE    202051
NEGATIVE    163063
Name: roberta, dtype: int64


In [105]:
# Cross-tabulate FinBERT (rows) against DistilBERT (columns), then add row and
# column totals. DistilBERT is a two-class model, so its NEUTRAL column is 0.
cm_labels = ['NEGATIVE', 'POSITIVE', 'NEUTRAL']
cm_counts = confusion_matrix(df_cm['finbert'], df_cm['distilbert'], labels=cm_labels)
cm = pd.DataFrame(cm_counts, index=cm_labels, columns=cm_labels)
cm['TOTAL'] = cm.sum(axis='columns')   # per-row totals
cm.loc['TOTAL'] = cm.sum(axis='index')  # per-column totals (including TOTAL)
cm

Unnamed: 0,NEGATIVE,POSITIVE,NEUTRAL,TOTAL
NEGATIVE,98357,7622,0,105979
POSITIVE,24353,36418,0,60771
NEUTRAL,110979,87385,0,198364
TOTAL,233689,131425,0,365114


In [106]:
# Cross-tabulate FinBERT (rows) against RoBERTa (columns), then add row and
# column totals. RoBERTa is a two-class model, so its NEUTRAL column is 0.
cm_labels = ['NEGATIVE', 'POSITIVE', 'NEUTRAL']
cm_counts = confusion_matrix(df_cm['finbert'], df_cm['roberta'], labels=cm_labels)
cm = pd.DataFrame(cm_counts, index=cm_labels, columns=cm_labels)
cm['TOTAL'] = cm.sum(axis='columns')   # per-row totals
cm.loc['TOTAL'] = cm.sum(axis='index')  # per-column totals (including TOTAL)
cm

Unnamed: 0,NEGATIVE,POSITIVE,NEUTRAL,TOTAL
NEGATIVE,90854,15125,0,105979
POSITIVE,8011,52760,0,60771
NEUTRAL,64198,134166,0,198364
TOTAL,163063,202051,0,365114


In [107]:
# Cross-tabulate DistilBERT (rows) against RoBERTa (columns), then add row and
# column totals. Both models are two-class, so only two labels are used.
cm_labels = ['NEGATIVE', 'POSITIVE']
cm_counts = confusion_matrix(df_cm['distilbert'], df_cm['roberta'], labels=cm_labels)
cm = pd.DataFrame(cm_counts, index=cm_labels, columns=cm_labels)
cm['TOTAL'] = cm.sum(axis='columns')   # per-row totals
cm.loc['TOTAL'] = cm.sum(axis='index')  # per-column totals (including TOTAL)
cm

Unnamed: 0,NEGATIVE,POSITIVE,TOTAL
NEGATIVE,145922,87767,233689
POSITIVE,17141,114284,131425
TOTAL,163063,202051,365114


In [108]:
# Load the frame carrying topic assignments and report its row count.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_topics = pd.read_pickle("./data/df_with_topics")
print(df_topics.shape[0])

365200


In [113]:
# Attach topic id/label to each article by index. Rows whose topic_label is an
# empty string are excluded from the right-hand side, so after the left join
# they (and any index missing from df_topics) carry NaN topic columns.
topics_with_label = df_topics[df_topics['topic_label'] != ""]
topic_columns = topics_with_label[['topic_id', 'topic_label']]
df_combined = df.merge(topic_columns, how='left', left_index=True, right_index=True)

In [114]:
# Flag rows where all three models produced the same label by comparing every
# column against the first one (finbert).
model_labels = df_combined[['finbert', 'distilbert', 'roberta']].to_numpy()
reference_label = model_labels[:, [0]]  # finbert column, kept 2-D for broadcasting
sentiment_match = (model_labels == reference_label).all(axis=1)
df_combined['sentiment_match'] = sentiment_match

In [116]:
# Keep only the articles on which all three sentiment models agree.
agreement_mask = df_combined['sentiment_match'] == True
df_sent_match = df_combined.loc[agreement_mask]

In [150]:
# Spot-check one article that has both an agreed sentiment and a topic label.
idx = 400  # arbitrary position among the topic-labelled, agreed-sentiment rows
labelled_matches = df_sent_match[df_sent_match['topic_label'].notna()]
example_row = labelled_matches.iloc[idx]
for field in ('title', 'distilbert', 'topic_label', 'content'):
    print(example_row[field])

Burberry Closes US Warehouse After Workers Test Positive
NEGATIVE
['employees', 'amazon', 'warehouse', 'unsafe']
News & Analysis A total of three employees tested positive for the virus, and the facility will be closed indefinitely. VINELAND, New Jersey — The luxury brand Burberry is famed for plaid scarves and $1,800 trench coats. But for employees at its only warehouse in the US — operating as an “essential business” during the pandemic — daily standard issue in the time of Covid-19 is this: one surgical mask, one disinfectant wipe. For weeks, employees have been growing increasingly anxious about working conditions at the New Jersey facility, the British company’s gateway to the US market. This past weekend, they found out that a total of three workers had tested positive for the virus, according to a memo sent to employees and viewed by Bloomberg News. The facility will be shut indefinitely. The 300 employees at the warehouse in Vineland are among the many thousands of mostly low-p

In [140]:
#df_ner = pd.read_pickle('./data/df_ner_210717.pickle')

In [139]:
#df_ner