## Model

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from nltk.tokenize import sent_tokenize

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

from scipy.special import softmax

In [4]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [5]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"

In [6]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
import numpy as np
from collections import Counter

In [8]:
def se_eval(text : str):
    encoded_input = tokenizer(text, return_tensors='pt').to(device)
    output = model(**encoded_input)
    return output[0][0].cpu()

def se_score(output : list):
    scores = output.detach().numpy()
    scores = softmax(scores)

    return {
        config.id2label[i] : scores[i].item()
        for i in range(3)
    }

In [9]:
def sa_title(title: str):
    return se_score(se_eval(title))

def sa_description(desc):
    if isinstance(desc, str):
        desc = sent_tokenize(desc)
        
        return [
            se_score(se_eval(sent))
            for sent in desc
        ]
    else: # np.NaN
        return [{
            config.id2label[i] : np.nan
            for i in range(3)
        }]

def sa_content_full(content: str):
    content = sent_tokenize(content)

    return [
        se_score(se_eval(sent))
        for sent in content
    ]

def sa_content(content: str):
    content = sent_tokenize(content)[:10]

    return [
        se_score(se_eval(sent))
        for sent in content
    ]

In [10]:
def get_sentiment(score: dict):
    return max(score, key=score.get)

def the_most(key: str, scores: list):
    st = max(enumerate(scores), key=lambda sa: sa[1][key])
    return {
        "sentence_number": st[0], 
        f"{key} score": st[1][key]
    }

def sa_report_full(new):
    title = sa_title(new["title"])
    description = sa_description(new["description"])
    content = sa_content_full(new["content"])

    report = {
        "title_sentiment": get_sentiment(title),
        "description_sentiment": Counter(get_sentiment(sa) for sa in description),
        "content_sentiment_full": Counter(get_sentiment(sa) for sa in content),
        "content_sentiment_f10": Counter(get_sentiment(sa) for sa in content[:10]),
        "most_positive_sentense": the_most("positive", content),
        "most_negative_sentense": the_most("negative", content)
    }

    return {
        "index": new["index"],
        "report": report,
        "details": {
            "title": title,
            "description": description,
            "content": content
        }
    }

def sa_report(new):
    title = sa_title(new["title"])
    description = sa_description(new["description"])
    content = sa_content(new["content"])

    report = {
        "title_sentiment": get_sentiment(title),
        "description_sentiment": Counter(get_sentiment(sa) for sa in description),
        "content_sentiment": Counter(get_sentiment(sa) for sa in content)
    }

    return {
        "index": new["index"],
        "report": report,
        "details": {
            "title": title,
            "description": description,
            "content": content
        }
    }

## Data

In [11]:
from pprint import pprint
import json
from google.colab import drive

In [12]:
drive.mount('/content/up_nlp/', force_remount=True)

Mounted at /content/up_nlp/


In [13]:
with open("/content/up_nlp/MyDrive/up_nlp/data.json", "r") as data_file:
    data = json.load(data_file)

len(data)

42635

## Evaluating

In [14]:
from tqdm import tqdm

In [15]:
N_SAMPLES = 12635

In [16]:
sa_first10_report_data = []
sa_first10_report_indices = []
sa_first10_report_failed = []

In [17]:
sa_first10_report_failed = sorted(list(set(sa_first10_report_failed)))

In [18]:
for i in tqdm(range(38336, len(data)), total=len(data)-38336):
    try:
        report = sa_report(data[i])
        sa_first10_report_data.append(report)
        sa_first10_report_indices.append(data[i]["index"])
    except:
        sa_first10_report_failed.append(data[i]["index"])
print("News failed: {}".format(sa_first10_report_failed))
print("News reported: {}".format(sa_first10_report_indices))

100%|██████████| 4299/4299 [3:08:06<00:00,  2.63s/it]

News failed: [38336, 38573, 38584, 39008, 39549, 39679, 39887, 40045, 40073, 40076, 40110, 40153, 40343, 40356, 40412, 40428, 40476, 40483, 40525, 40603, 40653, 40750, 40766, 40772, 41214, 41522, 41786]
News reported: [38337, 38338, 38339, 38340, 38341, 38342, 38343, 38344, 38345, 38346, 38347, 38348, 38349, 38350, 38351, 38352, 38353, 38354, 38355, 38356, 38357, 38358, 38359, 38360, 38361, 38362, 38363, 38364, 38365, 38366, 38367, 38368, 38369, 38370, 38371, 38372, 38373, 38374, 38375, 38376, 38377, 38378, 38379, 38380, 38381, 38382, 38383, 38384, 38385, 38386, 38387, 38388, 38389, 38390, 38391, 38392, 38393, 38394, 38395, 38396, 38397, 38398, 38399, 38400, 38401, 38402, 38403, 38404, 38405, 38406, 38407, 38408, 38409, 38410, 38411, 38412, 38413, 38414, 38415, 38416, 38417, 38418, 38419, 38420, 38421, 38422, 38423, 38424, 38425, 38426, 38427, 38428, 38429, 38430, 38431, 38432, 38433, 38434, 38435, 38436, 38437, 38438, 38439, 38440, 38441, 38442, 38443, 38444, 38445, 38446, 38447, 3844




In [19]:
sa_first10_report_failed

[38336,
 38573,
 38584,
 39008,
 39549,
 39679,
 39887,
 40045,
 40073,
 40076,
 40110,
 40153,
 40343,
 40356,
 40412,
 40428,
 40476,
 40483,
 40525,
 40603,
 40653,
 40750,
 40766,
 40772,
 41214,
 41522,
 41786]

In [21]:
with open("/content/up_nlp/MyDrive/up_nlp/sa_report_f10_2.json", "w") as report_file:
    report_file.write(json.dumps(sa_first10_report_data))

In [None]:
len(sa_first10_report_failed)

4364

In [20]:
len(sa_first10_report_indices)

4272

In [None]:
data[38336]

{'index': 38336,
 'source': 'CNN',
 'date': {'$date': {'$numberLong': '1602460800000'}},
 'title': 'Belarus allows police to use lethal weapons at mass protests - CNN',
 'category': 'news',
 'description': 'Belarus police can now use lethal weapons in the streets if needed, the interior ministry said on Monday, due to what it described as the radicalization of mass anti-government protests.',
 'content': 'MoscowBelarus police can now use lethal weapons in the streets if needed, the interior ministry said on Monday, due to what it described as the radicalization of mass anti-government protests.Security forces detained dozens of protesters on Sunday and used water cannon and batons to break up crowds demanding a new presidential election.Footage published by local news outlets showed police officers wearing black balaclavas dragging protesters into unmarked black vans and beating protesters with their batons at a rally that drew thousands onto the streets of the capital, Minsk.One seque