In [1]:
import pandas as pd
import os
import csv
import logging
import numpy as np
import datetime as datetime
import pickle
from tqdm import tqdm

from happytransformer import HappyTextClassification # https://happytransformer.com/
from sklearn.metrics import confusion_matrix
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Load the pre-processed article frame and show how many rows it holds.
# NOTE(review): pickle files can execute code on load; only read trusted files.
df = pd.read_pickle('./data/df_processed.pickle')
df.shape[0]

365200

In [3]:
# Show the first and the last row side by side as a quick sanity check.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same two-row frame on every pandas version.
pd.concat([df.head(1), df.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...


# Distilbert
https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english

In [4]:
distilbert = HappyTextClassification("DISTILBERT", "distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)

# Classify every title once and collect the labels in row order:
#   - missing title          -> ""
#   - 512+ space-split words -> "TOO_LONG" (title is not sent to the model)
#   - otherwise              -> label returned by the classifier
# NOTE(review): the 512 threshold counts whitespace-separated words, presumably
# as a proxy for the model's input-length limit; words are not tokens — confirm.
distilbert_labels = []
# Iterating the Series directly (with an explicit total) lets tqdm show a real
# progress bar and ETA; wrapping enumerate() hides the length from tqdm.
for title in tqdm(df['title'], total=len(df), desc="distilbert"):
    if pd.isnull(title):
        distilbert_labels.append("")
    elif len(title.split(" ")) < 512:
        distilbert_labels.append(distilbert.classify_text(title).label)
    else:
        distilbert_labels.append("TOO_LONG")

# One positional column assignment instead of one df.iloc write per row.
df['distilbert'] = distilbert_labels

08/27/2021 20:29:36 - INFO - happytransformer.happy_transformer -   Using model: cuda


In [5]:
# Spot-check the distilbert classifier on three short hand-written sentences.
for sample_text in ("I am blue", "I love candy", "I hate candy"):
    print(distilbert.classify_text(sample_text))

TextClassificationResult(label='POSITIVE', score=0.9037488102912903)
TextClassificationResult(label='POSITIVE', score=0.9998459815979004)
TextClassificationResult(label='NEGATIVE', score=0.9976422190666199)


# Finbert
https://huggingface.co/ProsusAI/finbert

In [10]:
finbert = HappyTextClassification("FINBERT", "ProsusAI/finbert", num_labels=3)

# Classify every title once and collect the labels in row order:
#   - missing title          -> ""
#   - 512+ space-split words -> "TOO_LONG" (title is not sent to the model)
#   - otherwise              -> label returned by the classifier
# NOTE(review): the 512 threshold counts whitespace-separated words, presumably
# as a proxy for the model's input-length limit; words are not tokens — confirm.
finbert_labels = []
# Iterating the Series directly (with an explicit total) lets tqdm show a real
# progress bar and ETA; wrapping enumerate() hides the length from tqdm.
for title in tqdm(df['title'], total=len(df), desc="finbert"):
    if pd.isnull(title):
        finbert_labels.append("")
    elif len(title.split(" ")) < 512:
        finbert_labels.append(finbert.classify_text(title).label)
    else:
        finbert_labels.append("TOO_LONG")

# One positional column assignment instead of one df.iloc write per row.
df['finbert'] = finbert_labels

08/27/2021 21:16:51 - INFO - happytransformer.happy_transformer -   Using model: cuda
365200it [1:06:07, 92.05it/s] 


# Roberta
https://huggingface.co/siebert/sentiment-roberta-large-english

In [9]:
roberta = HappyTextClassification("ROBERTA", "siebert/sentiment-roberta-large-english", num_labels=2)

# Classify every title once and collect the labels in row order:
#   - missing title          -> ""
#   - 512+ space-split words -> "TOO_LONG" (title is not sent to the model)
#   - otherwise              -> label returned by the classifier
# NOTE(review): the 512 threshold counts whitespace-separated words, presumably
# as a proxy for the model's input-length limit; words are not tokens — confirm.
roberta_labels = []
# Iterating the Series directly (with an explicit total) lets tqdm show a real
# progress bar and ETA; wrapping enumerate() hides the length from tqdm.
for title in tqdm(df['title'], total=len(df), desc="roberta"):
    if pd.isnull(title):
        roberta_labels.append("")
    elif len(title.split(" ")) < 512:
        roberta_labels.append(roberta.classify_text(title).label)
    else:
        roberta_labels.append("TOO_LONG")

# One positional column assignment instead of one df.iloc write per row.
df['roberta'] = roberta_labels

08/28/2021 16:56:33 - INFO - happytransformer.happy_transformer -   Using model: cuda
365200it [1:54:58, 52.94it/s]


### Save/Load Sentiment Rankings

In [2]:
# Toggle: True  -> persist the freshly computed sentiment columns to disk;
#         False -> skip the (hours-long) model runs and reload the cached result.
process_sentiment = False

if process_sentiment:
    df.to_pickle('./data/df_sentiment.pickle')
else:
    # NOTE(review): pickle files can execute code on load; only read trusted files.
    df = pd.read_pickle('./data/df_sentiment.pickle')
    print(len(df))

# Normalise finbert labels to upper case on BOTH paths so the in-memory frame
# matches the upper-case label lists used by the confusion matrices below,
# whether it was just computed or reloaded. Re-running this is harmless.
df['finbert'] = df['finbert'].str.upper()

365200


In [3]:
# Show the first and the last row, now including the three sentiment columns.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same two-row frame on every pandas version.
pd.concat([df.head(1), df.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,distilbert,finbert,roberta
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...,POSITIVE,NEUTRAL,POSITIVE
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...,NEGATIVE,NEGATIVE,NEGATIVE


In [102]:
# Keep only rows whose distilbert value is not one of the placeholder markers
# (empty string for a missing title, or either spelling of the too-long marker).
placeholder_labels = ["", "TOO_LONG", "too_long"]
df_cm = df[~df['distilbert'].isin(placeholder_labels)]

In [103]:
# Label distribution per model; the first two are followed by a blank separator.
for model_name in ('distilbert', 'finbert'):
    print(df_cm[model_name].value_counts(), '\n')
print(df_cm['roberta'].value_counts())

NEGATIVE    233689
POSITIVE    131425
Name: distilbert, dtype: int64 

NEUTRAL     198364
NEGATIVE    105979
POSITIVE     60771
Name: finbert, dtype: int64 

POSITIVE    202051
NEGATIVE    163063
Name: roberta, dtype: int64


In [105]:
# Cross-tabulate finbert (rows) against distilbert (columns), then add
# row/column totals. distilbert was loaded with two labels, so its NEUTRAL
# column stays at zero.
sentiment_labels = ['NEGATIVE', 'POSITIVE', 'NEUTRAL']
cm = pd.DataFrame(
    confusion_matrix(df_cm['finbert'], df_cm['distilbert'], labels=sentiment_labels),
    index=sentiment_labels,
    columns=sentiment_labels,
)
cm = cm.assign(TOTAL=cm.sum(axis=1))
cm.loc['TOTAL'] = cm.sum(axis=0)
cm

Unnamed: 0,NEGATIVE,POSITIVE,NEUTRAL,TOTAL
NEGATIVE,98357,7622,0,105979
POSITIVE,24353,36418,0,60771
NEUTRAL,110979,87385,0,198364
TOTAL,233689,131425,0,365114


In [106]:
# Cross-tabulate finbert (rows) against roberta (columns), then add
# row/column totals. roberta was loaded with two labels, so its NEUTRAL
# column stays at zero.
sentiment_labels = ['NEGATIVE', 'POSITIVE', 'NEUTRAL']
cm = pd.DataFrame(
    confusion_matrix(df_cm['finbert'], df_cm['roberta'], labels=sentiment_labels),
    index=sentiment_labels,
    columns=sentiment_labels,
)
cm = cm.assign(TOTAL=cm.sum(axis=1))
cm.loc['TOTAL'] = cm.sum(axis=0)
cm

Unnamed: 0,NEGATIVE,POSITIVE,NEUTRAL,TOTAL
NEGATIVE,90854,15125,0,105979
POSITIVE,8011,52760,0,60771
NEUTRAL,64198,134166,0,198364
TOTAL,163063,202051,0,365114


In [107]:
# Cross-tabulate distilbert (rows) against roberta (columns), restricted to
# the two labels passed below, then add row/column totals.
binary_labels = ['NEGATIVE', 'POSITIVE']
cm = pd.DataFrame(
    confusion_matrix(df_cm['distilbert'], df_cm['roberta'], labels=binary_labels),
    index=binary_labels,
    columns=binary_labels,
)
cm = cm.assign(TOTAL=cm.sum(axis=1))
cm.loc['TOTAL'] = cm.sum(axis=0)
cm

Unnamed: 0,NEGATIVE,POSITIVE,TOTAL
NEGATIVE,145922,87767,233689
POSITIVE,17141,114284,131425
TOTAL,163063,202051,365114


In [4]:
# Load the frame that carries the topic assignments and report its row count.
# NOTE(review): pickle files can execute code on load; only read trusted files.
df_topics = pd.read_pickle("./data/df_with_topics")
print(df_topics.shape[0])

365200


In [8]:
# Attach the topic columns to the sentiment frame, aligning rows on the index
# and keeping every row of `df` (left join).
# An earlier variant merged only the df_topics rows with a non-empty
# topic_label; the current one merges all rows.
topic_columns = df_topics[['topic_id', 'topic_label']]
df_combined = df.merge(topic_columns, how='left', left_index=True, right_index=True)

In [9]:
# Display the merged frame (sentiment columns plus topic_id / topic_label).
df_combined

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,distilbert,finbert,roberta,topic_id,topic_label
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...,POSITIVE,NEUTRAL,POSITIVE,,
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business,labor markets closely watched segment economy ...,NEGATIVE,NEGATIVE,POSITIVE,,
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business,forgiven think tesla nasdaq tsla little big pi...,POSITIVE,POSITIVE,POSITIVE,,
3,Roberto Torres,2020-01-03,ciodive,"On the road to AI adoption, execs grapple with...",https://www.ciodive.com/news/ai-adoption-execs...,CIOs kicked off 2019 with AI as an item to wat...,tech,cios kicked ai item watch competition agenda a...,POSITIVE,NEUTRAL,POSITIVE,,
4,Alden Wicker,2020-01-06,instyle,Red Carpet Sustainability After Coronavirus Sh...,https://www.instyle.com/fashion/red-carpet-cor...,When the coronavirus pandemic is over and life...,consumer,coronavirus pandemic life returns normal celeb...,POSITIVE,NEUTRAL,POSITIVE,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369042,Polly Harrison,2020-12-31,thefintechtimes,A Human Touch Will Be a Competitive Edge After...,https://thefintechtimes.com/53867-2/,Niels Pedersen is a Chartered Accountant and S...,finance,niels pedersen chartered accountant senior lec...,POSITIVE,NEUTRAL,POSITIVE,,
369043,,2020-12-31,marketscreener,Datametrex AI : Announces Deploying NexaSecuri...,https://www.marketscreener.com/quote/stock/DAT...,Datametrex AI Limited (TSXV: DM) (FSE: D4G) (O...,business,datametrex ai limited tsxv dm fse d4g otc pink...,POSITIVE,NEUTRAL,POSITIVE,,
369044,Polly Harrison,2020-12-31,thefintechtimes,"US Payments: Smart Pension, Episode Six, PAAY ...",https://thefintechtimes.com/us-payments-smart-...,"This December, The Fintech Times is asking ind...",finance,december fintech times asking industry leaders...,POSITIVE,NEUTRAL,POSITIVE,,
369045,,2020-12-31,marketscreener,"WESTWATER RESOURCES, INC. : Entry into a Mater...",https://www.marketscreener.com/quote/stock/WES...,Item 1.01Entry into a Material Definitive Agre...,business,item 101entry material definitive agreement de...,POSITIVE,NEUTRAL,POSITIVE,,


In [10]:
# Show only the rows that actually carry a topic. A left merge fills unmatched
# rows with NaN, and NaN != "" evaluates to True, so missing values are
# excluded explicitly alongside the empty-string marker.
has_topic_id = df_combined['topic_id'].notna() & (df_combined['topic_id'] != "")
df_combined[has_topic_id]

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,distilbert,finbert,roberta,topic_id,topic_label
372,Sam Byford,2020-01-23,theverge,Huawei developer conference postponed due to W...,https://www.theverge.com/2020/1/23/21078258/hu...,Huawei has announced the postponement of a maj...,tech,huawei announced postponement major developers...,NEGATIVE,NEGATIVE,NEGATIVE,635,"[smartphone, semiconductor, supply_chain, auto..."
544,,2020-01-24,marketscreener,Today's Logistics Report: Cutting Rail Jobs; W...,https://www.marketscreener.com/UNION-PACIFIC-1...,"By Paul Page Sign up: With one click, get this...",business,paul page sign click newsletter delivered inbo...,POSITIVE,NEGATIVE,NEGATIVE,1481,"[shipping, logistics, supply_chain]"
879,Stan Schroeder,2020-01-28,mashable,Coronavirus might put a wrench in Apple's iPho...,https://mashable.com/article/iphone-coronavirus/,Apple may not be able to produce as many iPhon...,tech,apple able produce iphones planned coronavirus...,NEGATIVE,NEGATIVE,NEGATIVE,635,"[smartphone, semiconductor, supply_chain, auto..."
960,Bloomberg,2020-01-28,scmp,Apple supply chain braces for disruption from ...,https://www.scmp.com/tech/big-tech/article/304...,Apple’s China-centric manufacturing base is at...,general,apple chinacentric manufacturing base risk dis...,NEGATIVE,NEGATIVE,NEGATIVE,635,"[smartphone, semiconductor, supply_chain, auto..."
1005,Nick Statt,2020-01-28,theverge,iPhone maker Foxconn says coronavirus outbreak...,https://www.theverge.com/2020/1/28/21112288/co...,"Taiwanese electronics giant Foxconn, which man...",tech,taiwanese electronics giant foxconn manufactur...,NEGATIVE,NEUTRAL,POSITIVE,635,"[smartphone, semiconductor, supply_chain, auto..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
367797,,2020-12-29,marketscreener,Exclusive: Hedge fund Third Point urges Intel ...,https://www.marketscreener.com/quote/index/NAS...,"Were it to gain traction, Third Point's push f...",business,gain traction point push changes lead major sh...,POSITIVE,POSITIVE,POSITIVE,1314,"[semiconductor, tech_company, china, supply_ch..."
367943,nature,2020-12-29,nature,Coronavirus diaries: an unexpected career expe...,http://www.nature.com/articles/d41586-020-03627-0,"Credit: Adapted from Getty In March, every sin...",science,credit adapted getty march single aspect scien...,POSITIVE,NEUTRAL,POSITIVE,45,"[employees, wellbeing, remote_working]"
368074,Lila MacLellan,2020-12-29,qz,How to help working parents and not pit them a...,https://www.qz.com/work/1946450/how-to-help-wo...,Hit “play” on UNESCO’s interactive world map t...,tech,hit play unesco interactive world map tracing ...,POSITIVE,NEUTRAL,POSITIVE,45,"[employees, wellbeing, remote_working]"
368077,Sarah Todd,2020-12-29,qz,Covid-19 changed the way we talk about emotion...,https://www.qz.com/work/1950430/covid-19-chang...,One of my favorite things about work used to b...,tech,favorite things work gave chance compartmental...,POSITIVE,NEUTRAL,POSITIVE,45,"[employees, wellbeing, remote_working]"


In [6]:
# Flag rows on which all three models produced the same label. Each row is
# compared against its finbert value (the first of the three columns).
model_columns = ['finbert', 'distilbert', 'roberta']
model_labels = df_combined[model_columns].to_numpy()
reference_labels = model_labels[:, [0]]  # kept 2-D so it broadcasts per row
sentiment_match = (model_labels == reference_labels).all(axis=1)
df_combined['sentiment_match'] = sentiment_match

In [116]:
# Rows where the boolean sentiment_match flag is set.
df_sent_match = df_combined.loc[df_combined['sentiment_match']]

In [150]:
# Inspect one article on which all models agree and that has a topic label.
idx = 400

# Build the filter and pick the row once instead of repeating it per field.
# Both missing-value forms are excluded: NaN, and the empty string that the
# topic frame is compared against in the merge cell's commented-out variant.
has_topic_label = df_sent_match['topic_label'].notna() & (df_sent_match['topic_label'] != "")
example_row = df_sent_match[has_topic_label].iloc[idx]

for field in ('title', 'distilbert', 'topic_label', 'content'):
    print(example_row[field])

Burberry Closes US Warehouse After Workers Test Positive
NEGATIVE
['employees', 'amazon', 'warehouse', 'unsafe']
News & Analysis A total of three employees tested positive for the virus, and the facility will be closed indefinitely. VINELAND, New Jersey — The luxury brand Burberry is famed for plaid scarves and $1,800 trench coats. But for employees at its only warehouse in the US — operating as an “essential business” during the pandemic — daily standard issue in the time of Covid-19 is this: one surgical mask, one disinfectant wipe. For weeks, employees have been growing increasingly anxious about working conditions at the New Jersey facility, the British company’s gateway to the US market. This past weekend, they found out that a total of three workers had tested positive for the virus, according to a memo sent to employees and viewed by Bloomberg News. The facility will be shut indefinitely. The 300 employees at the warehouse in Vineland are among the many thousands of mostly low-p

In [140]:
#df_ner = pd.read_pickle('./data/df_ner_210717.pickle')

In [139]:
#df_ner