# Sentiment Analysis

Для сентимент-анализа новостей предлагается использовать [TimeLMs](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest). Эта модель представляет собой RoBERTa, тюнингованную на датасете твитов (124M твитов с января 2018 по декабрь 2021, возможно позже). Благодаря тому, что RoBERTa была обучена на датасетах за большой период времени, содержащих тексты разных стилей. Тюнинг на твитах даёт обновление модели, параллельно давая некоторую временную деградацию, что не должно повлиять на задачу сентимент-анализа.

## Data

In [1]:
import pandas as pd
import numpy as np
import pymongo
from pprint import pprint

In [2]:
client = pymongo.MongoClient('localhost', 27017)
db = client["news"]
data = db["data"]

## Model

In [3]:
from nltk.tokenize import sent_tokenize

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def se_eval(text : str):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    return output[0][0]

In [6]:
def se_score(output : list):
    scores = output.detach().numpy()
    scores = softmax(scores)

    return {
        config.id2label[i] : scores[i].item()
        for i in range(3)
    }

### Test on one of the news

In [7]:
# new that has description (4712)
new_test = data.find_one({"index": 4712})
new_test

{'_id': ObjectId('63e51459fb0fef334ccf63b4'),
 'index': 4712,
 'source': 'CNN',
 'date': datetime.datetime(2021, 7, 15, 0, 0),
 'title': "There's a shortage of truckers, but TuSimple thinks it has a solution: no driver needed - CNN",
 'category': 'news',
 'description': 'The e-commerce boom has exacerbated a global truck driver shortage, but could autonomous trucks help fix the problem?',
 'content': '(CNN)Right now, there\'s a shortage of truck drivers in the US and worldwide, exacerbated by the e-commerce boom brought on by the pandemic. One solution to the problem is autonomous trucks, and several companies are in a race to be the first to launch one. Among them is San Diego-based TuSimple.Founded in 2015, TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the US, China and Europe. Although these are simply commercially available trucks retrofitted with its technology, TuSimple has deals in place with two of the world\'s largest truck manu

#### Title

In [7]:
def sa_title(title: str):
    return se_score(se_eval(title))

In [9]:
new_test["title"]

"There's a shortage of truckers, but TuSimple thinks it has a solution: no driver needed - CNN"

In [10]:
sa_title(new_test["title"])

{'negative': 0.2651931047439575,
 'neutral': 0.6773514747619629,
 'positive': 0.057455483824014664}

#### Description

In [8]:
def sa_description(desc):
    if isinstance(desc, str):
        desc = sent_tokenize(desc)
        
        return [
            se_score(se_eval(sent))
            for sent in desc
        ]
    else: # np.NaN
        return [{
            config.id2label[i] : np.nan
            for i in range(3)
        }]

In [12]:
new_test["description"]

'The e-commerce boom has exacerbated a global truck driver shortage, but could autonomous trucks help fix the problem?'

In [13]:
sa_description(new_test["description"])

[{'negative': 0.30483901500701904,
  'neutral': 0.6038489937782288,
  'positive': 0.09131209552288055}]

#### Content

In [9]:
def sa_content_full(content: str):
    content = sent_tokenize(content)

    return [
        se_score(se_eval(sent))
        for sent in content
    ]

In [10]:
def sa_content(content: str):
    content = sent_tokenize(content)[:10]

    return [
        se_score(se_eval(sent))
        for sent in content
    ]

In [16]:
new_test["content"];

In [17]:
sa_c = sa_content_full(new_test["content"])

#### Pipeline for the new

In [11]:
from collections import Counter
from operator import itemgetter

In [12]:
def get_sentiment(score: dict):
    return max(score, key=score.get)

In [13]:
def the_most(key: str, scores: list):
    st = max(enumerate(scores), key=lambda sa: sa[1][key])
    return {
        "sentence_number": st[0], 
        f"{key} score": st[1][key]
    }

In [14]:
def sa_report_full(new):
    title = sa_title(new["title"])
    description = sa_description(new["description"])
    content = sa_content_full(new["content"])

    report = {
        "title_sentiment": get_sentiment(title),
        "description_sentiment": Counter(get_sentiment(sa) for sa in description),
        "content_sentiment_full": Counter(get_sentiment(sa) for sa in content),
        "content_sentiment_f10": Counter(get_sentiment(sa) for sa in content[:10]),
        "most_positive_sentense": the_most("positive", content),
        "most_negative_sentense": the_most("negative", content)
    }

    return {
        "index": new["index"],
        "report": report,
        "details": {
            "title": title,
            "description": description,
            "content": content
        }
    }

In [15]:
def sa_report(new):
    title = sa_title(new["title"])
    description = sa_description(new["description"])
    content = sa_content(new["content"])

    report = {
        "title_sentiment": get_sentiment(title),
        "description_sentiment": Counter(get_sentiment(sa) for sa in description),
        "content_sentiment": Counter(get_sentiment(sa) for sa in content)
    }

    return {
        "index": new["index"],
        "report": report,
        "details": {
            "title": title,
            "description": description,
            "content": content
        }
    }

In [23]:
sa_report(new_test)

{'index': 4712,
 'report': {'title_sentiment': 'neutral',
  'description_sentiment': Counter({'neutral': 1}),
  'content_sentiment': Counter({'negative': 1, 'neutral': 6, 'positive': 3})},
 'details': {'title': {'negative': 0.2651931047439575,
   'neutral': 0.6773514747619629,
   'positive': 0.057455483824014664},
  'description': [{'negative': 0.30483901500701904,
    'neutral': 0.6038489937782288,
    'positive': 0.09131209552288055}],
  'content': [{'negative': 0.6836929321289062,
    'neutral': 0.29760292172431946,
    'positive': 0.018704118207097054},
   {'negative': 0.03147675096988678,
    'neutral': 0.6021249294281006,
    'positive': 0.3663983941078186},
   {'negative': 0.005951882340013981,
    'neutral': 0.7542198300361633,
    'positive': 0.23982815444469452},
   {'negative': 0.009524310007691383,
    'neutral': 0.6008063554763794,
    'positive': 0.389669269323349},
   {'negative': 0.0052651711739599705,
    'neutral': 0.5081960558891296,
    'positive': 0.486538827419281

In [24]:
sa_report_full(new_test)

{'index': 4712,
 'report': {'title_sentiment': 'neutral',
  'description_sentiment': Counter({'neutral': 1}),
  'content_sentiment_full': Counter({'negative': 4,
           'neutral': 60,
           'positive': 14}),
  'content_sentiment_f10': Counter({'negative': 1,
           'neutral': 6,
           'positive': 3}),
  'most_positive_sentense': {'sentence_number': 9,
   'positive score': 0.9015398025512695},
  'most_negative_sentense': {'sentence_number': 45,
   'negative score': 0.8231138586997986}},
 'details': {'title': {'negative': 0.2651931047439575,
   'neutral': 0.6773514747619629,
   'positive': 0.057455483824014664},
  'description': [{'negative': 0.30483901500701904,
    'neutral': 0.6038489937782288,
    'positive': 0.09131209552288055}],
  'content': [{'negative': 0.6836929321289062,
    'neutral': 0.29760292172431946,
    'positive': 0.018704118207097054},
   {'negative': 0.03147675096988678,
    'neutral': 0.6021249294281006,
    'positive': 0.3663983941078186},
   {'ne

## Evaluating

In [16]:
from tqdm import tqdm

### Full text

In [26]:
FULL_TEXT_SA_N_SAMPLES = 100

In [27]:
sa_full = db["sa_full"]

In [28]:
sa_full_report_data = []
sa_full_report_indices = []
sa_full_report_failed = []
for new in tqdm(data.aggregate([{"$sample": {"size": FULL_TEXT_SA_N_SAMPLES}}]), total=FULL_TEXT_SA_N_SAMPLES):
    try:
        report = sa_report_full(new)
        sa_full_report_data.append(report)
        sa_full_report_indices.append(new["index"])
    except RuntimeError:
        sa_full_report_failed.append(new["index"])
print("News failed: {}".format(sa_full_report_failed))
print("News reported: {}".format(sa_full_report_indices))

100%|██████████| 100/100 [05:15<00:00,  3.15s/it]

News faild: [21744, 17840, 7281]
News reported: [29414, 4820, 9094, 11401, 36626, 17819, 16669, 5312, 5346, 20062, 32465, 6550, 27956, 29595, 22301, 24623, 9259, 29425, 36863, 9292, 24153, 2520, 6662, 14245, 39356, 8482, 23585, 3360, 10790, 30223, 9997, 38124, 11708, 14211, 42525, 22383, 38975, 14444, 29438, 39222, 37516, 8029, 26606, 17396, 12658, 27818, 19484, 18200, 30764, 14534, 32857, 38530, 22327, 32351, 33321, 6589, 17578, 22902, 7214, 7368, 21724, 30647, 41448, 10178, 17765, 36117, 12506, 38183, 34449, 42591, 25902, 30698, 33197, 26557, 33074, 27396, 13433, 29753, 30787, 29740, 20585, 12801, 24916, 39635, 17088, 27065, 42539, 16187, 39318, 38824, 9139, 11232, 17624, 42341, 19544, 25320, 24559]





In [29]:
sa_full.insert_many(sa_full_report_data)

<pymongo.results.InsertManyResult at 0x7f520bc01700>

### First 10 sentences

In [17]:
F10_TEXT_SA_N_SAMPLES = db.command("collstats", "data")["count"]
F10_TEXT_SA_N_SAMPLES

42635

In [29]:
F10_TEXT_SA_N_SAMPLES_MAX = 10000 #F10_TEXT_SA_N_SAMPLES

In [19]:
sa_first10 = db["sa_first10"]

In [30]:
F10_TEXT_SA_IND_LAST = 9000 #[sa["index"] for sa in sa_first10.find()][-1] + 1 #0
F10_TEXT_SA_IND_LAST

9000

In [31]:
sa_first10_report_data = []
sa_first10_report_indices = []
sa_first10_report_failed = []
for new in tqdm(data.find({"index": {"$gte": F10_TEXT_SA_IND_LAST, "$lt": F10_TEXT_SA_N_SAMPLES_MAX}}), total=F10_TEXT_SA_N_SAMPLES_MAX-F10_TEXT_SA_IND_LAST):
    try:
        report = sa_report(new)
        sa_first10_report_data.append(report)
        sa_first10_report_indices.append(new["index"])
    except:
        sa_first10_report_failed.append(new["index"])
print("News failed: {}".format(sa_first10_report_failed))
print("News reported: {}".format(sa_first10_report_indices))

100%|██████████| 1000/1000 [15:33<00:00,  1.07it/s]

News failed: []
News reported: [9000, 9001, 9002, 9003, 9004, 9005, 9006, 9007, 9008, 9009, 9010, 9011, 9012, 9013, 9014, 9015, 9016, 9017, 9018, 9019, 9020, 9021, 9022, 9023, 9024, 9025, 9026, 9027, 9028, 9029, 9030, 9031, 9032, 9033, 9034, 9035, 9036, 9037, 9038, 9039, 9040, 9041, 9042, 9043, 9044, 9045, 9046, 9047, 9048, 9049, 9050, 9051, 9052, 9053, 9054, 9055, 9056, 9057, 9058, 9059, 9060, 9061, 9062, 9063, 9064, 9065, 9066, 9067, 9068, 9069, 9070, 9071, 9072, 9073, 9074, 9075, 9076, 9077, 9078, 9079, 9080, 9081, 9082, 9083, 9084, 9085, 9086, 9087, 9088, 9089, 9090, 9091, 9092, 9093, 9094, 9095, 9096, 9097, 9098, 9099, 9100, 9101, 9102, 9103, 9104, 9105, 9106, 9107, 9108, 9109, 9110, 9111, 9112, 9113, 9114, 9115, 9116, 9117, 9118, 9119, 9120, 9121, 9122, 9123, 9124, 9125, 9126, 9127, 9128, 9129, 9130, 9131, 9132, 9133, 9134, 9135, 9136, 9137, 9138, 9139, 9140, 9141, 9142, 9143, 9144, 9145, 9146, 9147, 9148, 9149, 9150, 9151, 9152, 9153, 9154, 9155, 9156, 9157, 9158, 9159, 9160, 91




In [32]:
sa_first10.insert_many(sa_first10_report_data)

<pymongo.results.InsertManyResult at 0x7f57474bdb40>

In [33]:
sa_first10_report_indices[-1]

9999

## Report

Сентимент-анализ проводился по заголовку, краткому описанию (где оно было), а также по самой статье, разбитым на предложения, и первым 10 её предложениям. 

In [1]:
from collections import OrderedDict
import pandas as pd
import numpy as np
import pymongo
from pprint import pprint
from nltk.tokenize import sent_tokenize

In [2]:
client = pymongo.MongoClient('localhost', 27017)
db = client["news"]

data = db["data"]
sa_report = {
    "full": db["sa_full"],
    "first10": db["sa_first10"]
}

In [3]:
sa_indices = {
    "full": sorted([sa["index"] for sa in sa_report["full"].find()]),
    "first10": sorted([sa["index"] for sa in sa_report["first10"].find()])
}

In [4]:
def report(index: int, kind="full"):
    sa = sa_report[kind].find_one({"index": index})
    if sa == None:
        raise Exception(f"No sentiment analysis for {index} of kind {kind}")
    new = data.find_one({"index": index})

    result = OrderedDict({
        "index": index,
        "category": new["category"],
        "title": OrderedDict({
            "text": new["title"],
            "sentiment": sa["report"]["title_sentiment"],
            "details": sa["details"]["title"]
        })
    })

    if not isinstance(new["description"], float):
        result["description"] = OrderedDict({
            "sentiment": sa["report"]["description_sentiment"],
            "text": new["description"],
            "details": sa["details"]["description"]
        })

    if kind == "full":
        result["content"] = OrderedDict({
            "sentiment": {
                "full": sa["report"]["content_sentiment_full"],
                "first10": sa["report"]["content_sentiment_f10"]
            },
            "most_positive_sentense": sa["report"]["most_positive_sentense"],
            "most_negative_sentense": sa["report"]["most_negative_sentense"],
            "text": new["content"],
        })
    elif kind == "first10":
        result["content"] = OrderedDict({
            "sentiment": sa["report"]["content_sentiment"],
            "text": new["content"],
        })
    
    return result

In [5]:
pprint(report(11401))

OrderedDict([('index', 11401),
             ('category', 'politics'),
             ('title',
              OrderedDict([('text',
                            'Trump announces new intel chief pick; No. 2 '
                            'intelligence official is leaving administration - '
                            'CNNPolitics'),
                           ('sentiment', 'neutral'),
                           ('details',
                            {'negative': 0.07668529450893402,
                             'neutral': 0.8802786469459534,
                             'positive': 0.043035950511693954})])),
             ('description',
              OrderedDict([('sentiment', {'neutral': 1}),
                           ('text',
                            'President Donald Trump announced Thursday night '
                            'that Joseph Maguire, the leader of the National '
                            'Counterterrorism Center, is his new pick to be '
                            't