# Sentiment Analysis for Named Entities

In [1]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd

## SA model

In [2]:
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

In [3]:
# Hugging Face checkpoint used for sentence-level sentiment classification.
SA_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# The config carries the `id2label` mapping used when scores are turned into a dict.
sa_config = AutoConfig.from_pretrained(SA_MODEL)
sa_tokenizer = AutoTokenizer.from_pretrained(SA_MODEL)
sa_model = AutoModelForSequenceClassification.from_pretrained(SA_MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def se_eval(text : str):
    """Run the sentiment model on one piece of text and return its raw logits.

    Parameters
    ----------
    text : str
        The text (normally a single sentence) to classify.

    Returns
    -------
    The first row of the model's first output, i.e. the un-normalised
    class scores for `text`. Pass it to `se_score` to get probabilities.
    """
    # Local import: the notebook imports torch only in a later cell.
    import torch

    # Inputs longer than the model's window make the forward pass fail, so
    # truncate instead of crashing on badly split, very long "sentences".
    # NOTE(review): 512 is the usual RoBERTa-base limit — confirm for this checkpoint.
    encoded_input = sa_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        output = sa_model(**encoded_input)
    return output[0][0]

def se_score(output : "torch.Tensor"):
    """Convert raw model scores into a `{label: probability}` dictionary.

    Parameters
    ----------
    output : torch.Tensor
        One-dimensional tensor of class scores, as returned by `se_eval`.

    Returns
    -------
    dict
        Maps each label name from `sa_config.id2label` to its softmax
        probability as a plain Python float.
    """
    # `.cpu()` is a no-op for CPU tensors and keeps this working if the model
    # is ever moved to a GPU.
    scores = softmax(output.detach().cpu().numpy())

    # Iterate over however many classes the model produced instead of assuming three.
    return {
        sa_config.id2label[i] : scores[i].item()
        for i in range(len(scores))
    }

def sa_sentence(title: str):
    """Return the `{label: probability}` sentiment scores for one piece of text.

    Chains `se_eval` (raw model scores) and `se_score` (softmax + labels).
    """
    raw_scores = se_eval(title)
    return se_score(raw_scores)

In [5]:
# Numeric polarity assigned to each sentiment label.
sa_dict = dict(negative=-1, neutral=0, positive=1)

def get_sentiment(score: dict):
    """Return the label with the highest score.

    `score` maps label names to numeric scores; on ties the label that
    appears first in the dictionary wins.
    """
    best_label, _ = max(score.items(), key=lambda pair: pair[1])
    return best_label

def the_most(key: str, scores: list):
    """Find the sentence whose score for `key` is the largest.

    `scores` is a list of per-sentence score dictionaries. The result holds
    the position of the winning sentence and its `key` score; on ties the
    earliest sentence wins.
    """
    position = max(range(len(scores)), key=lambda idx: scores[idx][key])
    return {
        "sentence_number": position,
        f"{key} score": scores[position][key]
    }

## NER model

In [6]:
import flair, torch
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter
from flair.data import Sentence

In [7]:
# Check whether PyTorch can see a CUDA device before flair is pointed at the GPU.
torch.cuda.is_available()

True

In [8]:
# Shell call: print the driver / GPU status reported by `nvidia-smi`.
!nvidia-smi

Wed Feb 22 18:36:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   54C    P8    N/A /  N/A |      6MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
# Run flair on the first GPU when CUDA is available; otherwise fall back to
# the CPU so the notebook still runs on machines without a GPU.
flair.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
# Sentence splitter used to cut news text into flair `Sentence` objects.
ner_splitter = SegtokSentenceSplitter()
# Alternative tagger, currently disabled:
# ner_tagger = SequenceTagger.load('ner')
# Active tagger; its load log below lists PER / LOC / ORG / MISC tags.
ner_tagger = SequenceTagger.load('ner-fast')



2023-02-22 18:36:14,632 loading file /home/scurrra/.flair/models/ner-english-fast/4c58e7191ff952c030b82db25b3694b58800b0e722ff15427f527e1631ed6142.e13c7c4664ffe2bbfa8f1f5375bd0dced866b8c1dd7ff89a6d705518abf0a611
2023-02-22 18:36:16,603 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [11]:
def split_news(news: dict):
    """Split a news document into flair sentences, with the title last.

    Order of the returned list: sentences of `content`, then sentences of
    `description`, then the title as ONE sentence at the very end.

    The title must be the last element because `sa4ner_eval` reads the
    title's sentiment and entities from the last position and treats every
    earlier element as body text. Appending the description after the title
    would make the last description sentence pass for the title.

    Parameters
    ----------
    news : dict
        Document with a `title` key and, optionally, `content` and
        `description`. Body fields that are missing or not strings are skipped.

    Returns
    -------
    list
        The flair sentences in the order described above.
    """
    splitted = []
    for field in ("content", "description"):
        text = news.get(field)
        # Skip absent or non-string fields instead of failing in the splitter.
        if isinstance(text, str):
            splitted.extend(ner_splitter.split(text))

    # Title goes last, as a single un-split sentence.
    splitted.append(Sentence(news["title"]))
    return splitted

## Data

In [12]:
import pymongo
from pprint import pprint

In [13]:
# Connect to the MongoDB server on localhost:27017 and select the `data`
# collection of the `news` database.
client = pymongo.MongoClient('localhost', 27017)
db = client["news"]
data = db["data"]

In [14]:
# Number of random documents drawn from the collection by the `$sample` stage below.
FULL_TEXT_SA_N_SAMPLES = 1

In [15]:
# Aggregation pipeline: draw `FULL_TEXT_SA_N_SAMPLES` random documents,
# then order them by their `index` field.
sample_stage = {"$sample": {"size": FULL_TEXT_SA_N_SAMPLES}}
sort_stage = {"$sort": {"index": 1}}
news = list(data.aggregate([sample_stage, sort_stage]))

# Show which documents were drawn.
[document["index"] for document in news]

[21942]

In [22]:
# Alternative (disabled): draw one fresh random document from the collection.
# sample = list(
#     data.aggregate([
#         {"$sample": {"size": 1}}
#     ])
# )[0]

# Alternative (disabled): fetch one specific document by its `index`.
# sample = data.find_one({"index": 1984})

# Active choice: the first document of the batch sampled above.
sample = news[0]

## Evaluating

In [23]:
# Accumulator for the whole run: the list of processed sample indices plus,
# for every NER tag, a mapping from entity text to its collected records.
stats = {
    "SAMPLES": [],
    **{tag: defaultdict(list) for tag in ("PER", "LOC", "ORG", "MISC")}
}

In [24]:
# Select the first sampled document and display it.
sample = news[0]
sample

{'_id': ObjectId('63e51459fb0fef334ccfa702'),
 'index': 21942,
 'source': 'CNN',
 'date': datetime.datetime(2014, 4, 11, 0, 0),
 'title': 'NATO chief urges Russia to pull its troops from Ukraine border - CNN',
 'category': 'news',
 'description': "NATO's chief urged Russia on Friday to pull back its troops from its border with Ukraine.",
 'content': 'Story highlightsNATO\'s secretary general says Russia should de-escalate border buildupNATO releases more photos that it says shows buildup to rebut Russia\'s denialsDestroyer USS Donald Cook enters the Black SeaRussia has no plans to annex southeastern Ukraine, the country\'s foreign minister saysNATO\'s chief urged Russia on Friday to pull back its troops from its border with Ukraine.Russia should contribute "to a de-escalation of the situation" and engage in a direct dialogue with the Ukrainian government, NATO Secretary General Anders Fogh Rasmussen said during a visit to Sofia, Bulgaria. NATO is "not discussing military actions" but i

In [25]:
def sa4ner_eval(stats: dict, news: dict):
    """Score every sentence of a news document and attach sentiment to its entities.

    The document is split with `split_news`, each sentence gets sentiment
    scores from `sa_sentence`, and `ner_tagger` annotates the sentences in
    place. Every recognised entity is then recorded together with the number
    of the sentence it occurs in and that sentence's polarity from `sa_dict`.

    The LAST sentence of the split is treated as the title and recorded with
    sentence number -1; all other sentences keep their list position.

    Parameters
    ----------
    stats : dict
        Run-wide accumulator, updated in place: the document index is appended
        to `stats["SAMPLES"]`, and for each tag/entity a pair
        `(document index, mentions)` is appended to `stats[tag][entity]`.
    news : dict
        The document; its `index` key is read here, the text fields are read
        by `split_news`.

    Returns
    -------
    dict
        Per-document result with keys `INDX`, `SENTENCES`, `SA` and `NER`.
    """
    sentences = split_news(news)

    sentiments = [sa_sentence(item.text) for item in sentences]
    ner_tagger.predict(sentences)

    per_sample = {
        tag: defaultdict(list)
        for tag in ("PER", "LOC", "ORG", "MISC")
    }

    # ---
    # the polarity is stored next to each mention for analysis on the full dataset
    # Visit the title first (position -1 addresses the last sentence), then
    # all remaining sentences in list order.
    visit_order = [-1, *range(len(sentences) - 1)]
    for position in visit_order:
        polarity = sa_dict[get_sentiment(sentiments[position])]
        for span in sentences[position].get_spans("ner"):
            per_sample[span.tag][span.text].append((position, polarity))
    # ---

    # write stats for full dataset
    stats["SAMPLES"].append(news["index"])
    for tag, entities in per_sample.items():
        for entity_text, mentions in entities.items():
            stats[tag][entity_text].append((news["index"], mentions))

    # but return stats for the sample
    return {
        "INDX": news["index"],
        "SENTENCES": sentences,
        "SA": sentiments,
        "NER": per_sample
    }

In [26]:
# Evaluate the selected document; this also appends its results to `stats`.
sample_stats = sa4ner_eval(
    stats,
    sample
)

In [28]:
# Display the per-document result returned by `sa4ner_eval`.
sample_stats

{'INDX': 21942,
 'SENTENCES': [Sentence: "Story highlightsNATO 's secretary general says Russia should de-escalate border buildupNATO releases more photos that it says shows buildup to rebut Russia 's denialsDestroyer USS Donald Cook enters the Black SeaRussia has no plans to annex southeastern Ukraine , the country 's foreign minister saysNATO 's chief urged Russia on Friday to pull back its troops from its border with Ukraine.Russia should contribute " to a de-escalation of the situation " and engage in a direct dialogue with the Ukrainian government , NATO Secretary General Anders Fogh Rasmussen said during a visit to Sofia , Bulgaria ." → ["highlightsNATO"/ORG, "Russia"/LOC, "Russia"/LOC, "USS"/MISC, "Donald Cook"/PER, "Black SeaRussia"/LOC, "Ukraine"/LOC, "saysNATO"/ORG, "Russia"/LOC, "Ukraine.Russia"/LOC, "Ukrainian"/MISC, "NATO"/ORG, "Anders Fogh Rasmussen"/PER, "Sofia"/LOC, "Bulgaria"/LOC],
  Sentence: "NATO is " not discussing military actions " but is focused on protecting it

## Analysis