In [1]:
import pandas as pd
from typing import Dict, List
from itertools import starmap
import re


In [2]:
df = pd.read_csv("szczepionka_since_2020-12-01.csv")[["created_at", "date", "tweet"]]

In [3]:
import torch
from os.path import join, dirname, realpath
from transformers import XLMTokenizer, RobertaModel
from typing import List

In [4]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def merge(sequences: List[torch.tensor]):
    """
    Given the list of sequences embeddings where every seq has a different num of tokens the method converts
    the list into a tensor of size (N, L, 768) where N is the num of sequences in list and L is the longest sequence
    :param sequences: List of tensors of embeddings for tokens in sentences
    :return: tensor of size (N, L, 768)
    """
    lengths = [len(seq) for seq in sequences]
    vector_size = 768
    padded_seqs = torch.zeros(len(sequences), max(lengths), vector_size)
    for i, seq in enumerate(sequences):
        end = lengths[i]
        padded_seqs[i, (max(lengths) - end) :] = seq[:end]
    return padded_seqs


@torch.no_grad()
def get_embedding_for_list_of_texts(
    list_of_texts: List[str],
) -> (torch.tensor, torch.tensor):
    """
    For a given list of sentences the function return embedding generated by BERT
    :param text: Sentence for which u want to get an embedding
    :return: (tensor of embeddings for each token in sentneces, average embedding of a sentences)
    """
    tokenizer = XLMTokenizer.from_pretrained('allegro/herbert-klej-cased-tokenizer-v1')
    bert_model = RobertaModel.from_pretrained('allegro/herbert-klej-cased-v1').to(DEVICE)
    
    list_of_sentence_embeddings = []
    list_of_sequence_embeddings = []

    for text in list_of_texts:
        encoded_input = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        outputs = bert_model(encoded_input)

        sequence_tokens_embedding = outputs[0].squeeze(dim=0)
        sentence_embedding = outputs[1].squeeze(dim=0)

        list_of_sequence_embeddings.append(sequence_tokens_embedding)
        list_of_sentence_embeddings.append(sentence_embedding)

    seq_embeddings_tensor = merge(list_of_sequence_embeddings)
    sentence_embeddings_tensor = torch.stack(list_of_sentence_embeddings, dim=0)

    return seq_embeddings_tensor, sentence_embeddings_tensor


In [5]:
def _replace_emotes_with_text(
        text: str,
        emote_to_text: Dict[str, str]
    ) -> str:
        for emote, emote_text in emote_to_text.items():
            text = text.replace(emote, emote_text)

        return text

def _remove_urls_from_text(text: str) -> str:
    text = re.sub(r'\S+://\S+', '', text, flags=re.MULTILINE)
    return text

In [6]:
from pathlib import Path
import json
import pickle
from pathlib import Path
import pandas as pd
from os.path import join


EMOTE_TO_TEXT_PATH = Path('../../data/emote_to_text.json')

with EMOTE_TO_TEXT_PATH.open(encoding='utf8') as file:
    emote_to_text = json.load(file)

In [7]:
texts = starmap(
            _replace_emotes_with_text,
            zip(df.tweet.tolist(), [emote_to_text] * len(df))
        )

texts = map(_remove_urls_from_text, texts)

texts = list(texts)



df["processed_tweets"] = texts

In [8]:
seq_embeddings_tensor, sentence_embeddings_tensor = get_embedding_for_list_of_texts(df.processed_tweets)

In [9]:
from seq_model import HerbertEmotionSequenceClassifier
from torch.nn import Softmax

In [10]:
classification_model = HerbertEmotionSequenceClassifier()

state_dict_path = "../wust_seq_model.pth"

classification_model.load_state_dict(torch.load(state_dict_path), strict=False)
classification_model = classification_model.eval().to(DEVICE)

softmax = Softmax(dim=1)

In [11]:
torch.cuda.empty_cache()

In [34]:
with torch.no_grad():
    predictions_1 = softmax(classification_model(seq_embeddings_tensor[:3000]).to(DEVICE)).cpu().numpy()
    predictions_2 = softmax(classification_model(seq_embeddings_tensor[3000:]).to(DEVICE)).cpu().numpy()


RuntimeError: Input and parameter tensors are not at the same device, found input tensor at cpu and parameter tensor at cuda:0

In [18]:
import numpy as np

predictions = np.concatenate((predictions_1, predictions_2), axis=0)


In [19]:
predictions.shape

(6624, 9)

In [20]:
emotion_dict = {
    "oczekiwanie": 0,
    "podziw": 1,
    "radosc": 2,
    "smutek": 3,
    "strach": 4,
    "wstret": 5,
    "zaskoczenie": 6,
    "zlosc": 7,
    "neutralny": 8,
    "neutralne": 8,
}


In [25]:
emotion_list = {}

for emotion, column in emotion_dict.items():
    emotion_list[emotion] = predictions[:, column].tolist()

In [30]:
# emotion_list

In [32]:
for emotion in emotion_dict.keys():
    df[emotion] = emotion_list[emotion]

In [33]:
df

Unnamed: 0,created_at,date,tweet,processed_tweets,oczekiwanie,podziw,radosc,smutek,strach,wstret,zaskoczenie,zlosc,neutralny,neutralne
0,2021-01-21 17:14:32 CET,2021-01-21,Jakie skutki niepożądane zaobserwowano w czasi...,Jakie skutki niepożądane zaobserwowano w czasi...,-0.078405,-2.752152,-7.049420,-5.944355,3.317139,-4.097007,-3.332653,-2.177593,4.081343,4.081343
1,2021-01-21 16:55:48 CET,2021-01-21,#SzczepimySię #COVID19 #Covid_19 #koronawirusp...,#SzczepimySię #COVID19 #Covid_19 #koronawirusp...,0.247204,-4.286077,-5.076561,-2.129335,0.056217,-2.789803,-6.996193,-3.944554,6.387649,6.387649
2,2021-01-21 16:22:05 CET,2021-01-21,(3/3) Po wypowiedzi posła Lewicy opublikowany...,(3/3) Po wypowiedzi posła Lewicy opublikowany...,0.969343,-4.663278,-6.611484,-2.024936,-0.959466,2.139658,-4.985923,2.744113,2.261405,2.261405
3,2021-01-21 16:22:04 CET,2021-01-21,(2/3) O deklaracji chęci zaszczepienia na COV...,(2/3) O deklaracji chęci zaszczepienia na COV...,0.872439,-4.842559,-7.198940,-2.226286,-0.684579,2.450838,-4.415974,3.407382,1.715589,1.715589
4,2021-01-21 16:22:02 CET,2021-01-21,(1/3) Zgodnie z wynikami sondażu United Surve...,(1/3) Zgodnie z wynikami sondażu United Surve...,1.059799,-3.817860,-5.587823,-3.458345,-0.424636,-2.024174,-7.031165,-1.749990,5.740261,5.740261
5,2021-01-21 16:12:42 CET,2021-01-21,"Rozmawiają dwa szczury: - Heniu, a Ty się zasz...","Rozmawiają dwa szczury: - Heniu, a Ty się zasz...",-0.354499,-4.014688,-4.824597,-1.360549,1.778537,-1.828506,0.107076,-1.225100,0.827278,0.827278
6,2021-01-21 16:09:07 CET,2021-01-21,(Jeszcze) nie zamknęli nas w obozach. #COVID19...,(Jeszcze) nie zamknęli nas w obozach. #COVID19...,0.459875,-4.181236,-4.210942,-0.682834,0.019458,-0.865167,-3.210000,-1.002657,2.639622,2.639622
7,2021-01-21 16:01:00 CET,2021-01-21,Do końca pierwszego kwartału 2021 roku #BioNTe...,Do końca pierwszego kwartału 2021 roku #BioNTe...,1.795166,-1.964428,-3.483556,-4.148592,-0.535214,-4.553160,-5.494053,-2.749705,4.583469,4.583469
8,2021-01-21 15:43:33 CET,2021-01-21,Już ponad 600 tys. osób podpisało apel @Yunus_...,Już ponad 600 tys. osób podpisało apel @Yunus_...,2.281098,1.546575,-1.936649,-6.320982,-2.117420,-4.783503,-4.909160,-0.558561,1.725296,1.725296
9,2021-01-21 15:43:21 CET,2021-01-21,Chciał bym przypomnieć że #Polacy grają z #Uru...,Chciał bym przypomnieć że #Polacy grają z #Uru...,2.284878,-4.644465,0.090779,1.531593,-5.123172,-3.253448,-4.503701,-3.311231,2.330388,2.330388
