# Sentiment Analysis

Для сентимент-анализа новостей предлагается использовать [TimeLMs](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest). Эта модель представляет собой RoBERTa, тюнингованную на датасете твитов (124M твитов с января 2018 по декабрь 2021, возможно, позже). Благодаря тому, что RoBERTa была обучена на датасетах за большой период времени, содержащих тексты разных стилей, модель применима и к новостным текстам. Тюнинг на твитах даёт обновление модели, параллельно давая некоторую временную деградацию, что не должно повлиять на задачу сентимент-анализа.

## Data

In [1]:
import pandas as pd
import numpy as np
import pymongo
from pprint import pprint

In [2]:
# Connect to the MongoDB server on localhost and select the "data"
# collection of the "news" database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client["news"]
data = db["data"]

## Model

In [3]:
from nltk.tokenize import sent_tokenize

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Hugging Face Hub id of the checkpoint shared by tokenizer, config and model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# The config provides id2label, used to name the classes in the score dicts.
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def se_eval(text: str, max_length: int = 512):
    """Run the sentiment model on one text and return its raw logits.

    Args:
        text: Text to classify (a title or a single sentence).
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated. The default matches the 512 positions
            available in RoBERTa-base, which cannot process longer sequences.

    Returns:
        1-D tensor of unnormalised class logits, indexed by class id.
    """
    # Local import: the notebook does not import torch at the top level.
    import torch

    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**encoded_input)
    # output[0] is the logits batch; a single text was passed, so take row 0.
    return output[0][0]

In [6]:
def se_score(output):
    """Convert raw model logits into a ``{label: probability}`` mapping.

    Args:
        output: 1-D tensor of class logits, as returned by ``se_eval``.

    Returns:
        Dict mapping each label name from ``config.id2label`` to its softmax
        probability, in class-id order.
    """
    # detach() drops any autograd graph so the tensor can be converted to NumPy.
    probabilities = softmax(output.detach().numpy())

    # Iterate over the scores themselves instead of assuming a class count.
    return {
        config.id2label[class_id]: probability
        for class_id, probability in enumerate(probabilities)
    }

### Test on a single news item

In [7]:
# Fetch a sample news item that has a description (index 4712).
new_test = data.find_one({"index": 4712})
new_test

{'_id': ObjectId('63e51459fb0fef334ccf63b4'),
 'index': 4712,
 'source': 'CNN',
 'date': datetime.datetime(2021, 7, 15, 0, 0),
 'title': "There's a shortage of truckers, but TuSimple thinks it has a solution: no driver needed - CNN",
 'category': 'news',
 'description': 'The e-commerce boom has exacerbated a global truck driver shortage, but could autonomous trucks help fix the problem?',
 'content': '(CNN)Right now, there\'s a shortage of truck drivers in the US and worldwide, exacerbated by the e-commerce boom brought on by the pandemic. One solution to the problem is autonomous trucks, and several companies are in a race to be the first to launch one. Among them is San Diego-based TuSimple.Founded in 2015, TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the US, China and Europe. Although these are simply commercially available trucks retrofitted with its technology, TuSimple has deals in place with two of the world\'s largest truck manu

#### Title

In [8]:
def sa_title(title: str):
    """Return the sentiment probabilities for a news title.

    The title is scored as one text, without sentence splitting.
    """
    logits = se_eval(title)
    return se_score(logits)

In [9]:
# Show the title of the sample news item.
new_test["title"]

"There's a shortage of truckers, but TuSimple thinks it has a solution: no driver needed - CNN"

In [10]:
# Sentiment probabilities for the sample title.
sa_title(new_test["title"])

{'negative': 0.2651931, 'neutral': 0.6773515, 'positive': 0.057455484}

#### Description

In [11]:
def sa_description(desc):
    """Score a news description sentence by sentence.

    Returns a list with one ``{label: probability}`` dict per sentence. When
    ``desc`` is not a string (e.g. NaN for a missing description), the list
    holds a single dict with every label mapped to NaN.
    """
    if not isinstance(desc, str):
        # Missing description: keep the output shape, fill it with NaN.
        placeholder = {}
        for i in range(3):
            placeholder[config.id2label[i]] = np.nan
        return [placeholder]

    sentence_scores = []
    for sentence in sent_tokenize(desc):
        sentence_scores.append(se_score(se_eval(sentence)))
    return sentence_scores

In [12]:
# Show the description of the sample news item.
new_test["description"]

'The e-commerce boom has exacerbated a global truck driver shortage, but could autonomous trucks help fix the problem?'

In [13]:
# Per-sentence sentiment probabilities for the sample description.
sa_description(new_test["description"])

[{'negative': 0.30483902, 'neutral': 0.603849, 'positive': 0.091312096}]

#### Content

In [14]:
def sa_content_full(content: str):
    """Score every sentence of an article body.

    Returns one ``{label: probability}`` dict per sentence, in text order.
    """
    sentence_scores = []
    for sentence in sent_tokenize(content):
        logits = se_eval(sentence)
        sentence_scores.append(se_score(logits))
    return sentence_scores

In [15]:
def sa_content(content: str, max_sentences: int = 10):
    """Score only the leading sentences of an article body.

    Args:
        content: Full article text.
        max_sentences: Number of sentences, counted from the start of the
            text, to score. Defaults to 10.

    Returns:
        List with one ``{label: probability}`` dict per scored sentence,
        in text order.

    Raises:
        ValueError: If ``max_sentences`` is negative.
    """
    if max_sentences < 0:
        # A negative slice bound would silently drop trailing sentences instead.
        raise ValueError("max_sentences must be non-negative")

    leading_sentences = sent_tokenize(content)[:max_sentences]

    return [
        se_score(se_eval(sent))
        for sent in leading_sentences
    ]

In [16]:
# Trailing semicolon suppresses the (long) cell output in Jupyter.
new_test["content"];

In [17]:
# Score every sentence of the sample article body.
sa_c = sa_content_full(new_test["content"])

#### Pipeline for a single news item

In [18]:
from collections import Counter
from operator import itemgetter

In [19]:
def get_sentiment(score: dict):
    """Return the label with the highest score.

    NaN scores are ignored: every comparison with NaN is false, so a plain
    ``max`` would return the first key of an all-NaN dict (the placeholder
    used for a missing description) as if it were a real prediction.

    Args:
        score: Mapping of label name to score.

    Returns:
        The label with the largest non-NaN score (the first one on ties),
        or ``None`` when ``score`` is empty or holds only NaN values.
    """
    # NaN is the only value that is not equal to itself.
    valid = {label: value for label, value in score.items() if value == value}
    if not valid:
        return None
    return max(valid, key=valid.get)

In [20]:
def the_most(key: str, scores: list):
    """Find the sentence with the highest score for one sentiment label.

    Args:
        key: Label to maximise, e.g. ``"positive"``.
        scores: Per-sentence score dicts, each containing ``key``.

    Returns:
        Dict with ``"sentence_number"`` (0-based position in ``scores``) and
        ``"<key> score"`` (the winning value). Both are ``None`` when
        ``scores`` is empty; the first sentence wins ties.
    """
    if not scores:
        # max() raises on an empty sequence; report "no sentence" instead.
        return {
            "sentence_number": None,
            f"{key} score": None
        }

    position, best = max(enumerate(scores), key=lambda pair: pair[1][key])
    return {
        "sentence_number": position,
        f"{key} score": best[key]
    }

In [21]:
def sa_report_full(new):
    """Build a sentiment report for a news item using its full content.

    ``new`` is a news document with "index", "title", "description" and
    "content" fields. The result holds the item's index, a summary
    ("report") and the raw per-text scores ("details").
    """
    title_scores = sa_title(new["title"])
    description_scores = sa_description(new["description"])
    content_scores = sa_content_full(new["content"])

    def tally(sentence_scores):
        # Count how many sentences fall under each winning label.
        return Counter(get_sentiment(entry) for entry in sentence_scores)

    summary = {}
    summary["title_sentiment"] = get_sentiment(title_scores)
    summary["description_sentiment"] = tally(description_scores)
    summary["content_sentiment_full"] = tally(content_scores)
    # Same tally restricted to the first 10 sentences of the content.
    summary["content_sentiment_f10"] = tally(content_scores[:10])
    summary["most_positive_sentense"] = the_most("positive", content_scores)
    summary["most_negative_sentense"] = the_most("negative", content_scores)

    details = {
        "title": title_scores,
        "description": description_scores,
        "content": content_scores
    }
    return {"index": new["index"], "report": summary, "details": details}

In [22]:
def sa_report(new):
    """Build a sentiment report for a news item.

    ``new`` is a news document with "index", "title", "description" and
    "content" fields. The content is scored through ``sa_content``. The
    result holds the item's index, a summary ("report") and the raw
    per-text scores ("details").
    """
    title_scores = sa_title(new["title"])
    description_scores = sa_description(new["description"])
    content_scores = sa_content(new["content"])

    def tally(sentence_scores):
        # Count how many sentences fall under each winning label.
        return Counter(get_sentiment(entry) for entry in sentence_scores)

    summary = {}
    summary["title_sentiment"] = get_sentiment(title_scores)
    summary["description_sentiment"] = tally(description_scores)
    summary["content_sentiment"] = tally(content_scores)

    details = {
        "title": title_scores,
        "description": description_scores,
        "content": content_scores
    }
    return {"index": new["index"], "report": summary, "details": details}

In [23]:
# Report for the sample item, with content scored via sa_content.
sa_report(new_test)

{'index': 4712,
 'report': {'title_sentiment': 'neutral',
  'description_sentiment': Counter({'neutral': 1}),
  'content_sentiment': Counter({'negative': 1, 'neutral': 6, 'positive': 3})},
 'details': {'title': {'negative': 0.2651931,
   'neutral': 0.6773515,
   'positive': 0.057455484},
  'description': [{'negative': 0.30483902,
    'neutral': 0.603849,
    'positive': 0.091312096}],
  'content': [{'negative': 0.68369293,
    'neutral': 0.29760292,
    'positive': 0.018704118},
   {'negative': 0.03147675, 'neutral': 0.6021249, 'positive': 0.3663984},
   {'negative': 0.0059518823, 'neutral': 0.75421983, 'positive': 0.23982815},
   {'negative': 0.00952431, 'neutral': 0.60080636, 'positive': 0.38966927},
   {'negative': 0.005265171, 'neutral': 0.50819606, 'positive': 0.48653883},
   {'negative': 0.007141564, 'neutral': 0.9138311, 'positive': 0.07902722},
   {'negative': 0.008209709, 'neutral': 0.480543, 'positive': 0.51124734},
   {'negative': 0.0072282725, 'neutral': 0.7601931, 'positiv

In [24]:
# Report for the sample item, with every content sentence scored.
sa_report_full(new_test)

{'index': 4712,
 'report': {'title_sentiment': 'neutral',
  'description_sentiment': Counter({'neutral': 1}),
  'content_sentiment_full': Counter({'negative': 4,
           'neutral': 60,
           'positive': 14}),
  'content_sentiment_f10': Counter({'negative': 1,
           'neutral': 6,
           'positive': 3}),
  'most_positive_sentense': {'sentence_number': 9,
   'positive score': 0.9015398},
  'most_negative_sentense': {'sentence_number': 45,
   'negative score': 0.82311386}},
 'details': {'title': {'negative': 0.2651931,
   'neutral': 0.6773515,
   'positive': 0.057455484},
  'description': [{'negative': 0.30483902,
    'neutral': 0.603849,
    'positive': 0.091312096}],
  'content': [{'negative': 0.68369293,
    'neutral': 0.29760292,
    'positive': 0.018704118},
   {'negative': 0.03147675, 'neutral': 0.6021249, 'positive': 0.3663984},
   {'negative': 0.0059518823, 'neutral': 0.75421983, 'positive': 0.23982815},
   {'negative': 0.00952431, 'neutral': 0.60080636, 'positive'