In [9]:
import torch
from transformers import XLMTokenizer, RobertaModel
from herbert_emotion_classifier.model import HerbertEmotionClassifier
from herbert_emotion_classifier.seq_model import HerbertEmotionSequenceClassifier

In [3]:
# Load the pretrained tokenizer and encoder identified by the names passed to
# from_pretrained(). Both are kept as module-level objects and are used by the
# single-sentence embedding cell further down.
tokenizer = XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
bert_model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

In [21]:
# Checkpoint path read as a global by `predict_emotion`.
# NOTE(review): another cell in this notebook rebinds this name to
# "wust_seq_model.pth", so the value `predict_emotion` sees depends on the order
# in which cells were executed.
state_dict_path= "dense_model.pth"

# Disabled: eager loading of the dense classifier.
# NOTE(review): the last commented line calls `model.eval()`, but no `model`
# variable is defined in the visible cells — it presumably should be
# `classification_model.eval()`.
# classification_model = HerbertEmotionClassifier()
# classification_model.load_state_dict(torch.load(PATH), strict=False)
# classification_model = model.eval()

In [10]:
# Checkpoint with the weights of the sequence-based emotion classifier.
PATH = "wust_seq_model.pth"

new_model = HerbertEmotionSequenceClassifier()
# map_location="cpu" allows a checkpoint that was saved from a GPU to be restored
# on a CPU-only machine; this notebook runs everything on the CPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
# NOTE(review): strict=False ignores missing/unexpected keys, so a checkpoint that
# does not match the architecture loads without raising.
new_model.load_state_dict(torch.load(PATH, map_location="cpu"), strict=False)
# Switch to inference mode.
new_model = new_model.eval()

In [16]:
# Sample sentence used to try out the encoder on a single input.
text = "Jest w pytę"

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Token ids as a PyTorch tensor (return_tensors='pt'), placed on the CPU.
    encoded_input = tokenizer.encode(text, return_tensors='pt').to("cpu")
    outputs = bert_model(encoded_input)
    # Second element of the model output with its leading dimension squeezed out
    # (presumably the pooled sentence representation — TODO confirm).
    sentence_embedding = outputs[1].squeeze(dim=0)
# Last expression of the cell, so its value is displayed; the recorded output
# is torch.Size([768]).
sentence_embedding.size()

torch.Size([768])

In [40]:
# Index -> emotion label. The prediction functions pair the values of this dict,
# in insertion order, with the entries of each probability row.
# English glosses of the (Polish) labels: anticipation, admiration, joy, sadness,
# fear, disgust, surprise, anger, neutral.
emotion_dict = {
    0: "oczekiwanie",
    1: "podziw",
    2: "radosc",
    3: "smutek",
    4: "strach",
    5: "wstret",
    6: "zaskoczenie",
    7: "zlosc",
    8: "neutralny",
}

In [37]:
from torch.nn import Softmax

# Softmax over dimension 1 of its input.
softmax = Softmax(dim=1)

# Disabled single-embedding draft of `predict_emotion`; it relies on a global
# `classification_model`, whose creation is also commented out in this notebook.
# def predict_emotion(embeddng: torch.tensor):
#     predictions = classification_model(embeddng.unsqueeze(0))
#     predictions = softmax(predictions).squeeze(0)
#     predictions = predictions.tolist()
    
#     predicted_emotions = {}
    
#     for pred, emotion in zip(predictions, emotion_dict.values()):
#         predicted_emotions[emotion] = pred
    
#     return predicted_emotions
    
# predict_emotion(sentence_embedding)

In [29]:
from typing import List, Dict

# Device string passed to `.to(...)` for input tensors and classifier models in the
# functions defined in this notebook.
DEVICE = "cpu"


def merge(sequences: List[torch.tensor], vector_size: int = 768):
    """
    Stack variable-length embedding sequences into one zero-padded tensor.

    Padding is added on the LEFT: each sequence is right-aligned inside its row and
    the leading positions are filled with zeros.

    :param sequences: list of 2-D tensors, each of shape (seq_len_i, vector_size)
    :param vector_size: size of the last (embedding) dimension; defaults to 768
    :return: tensor of shape (len(sequences), max_seq_len, vector_size); for an empty
        list an empty tensor of shape (0, 0, vector_size) is returned
    """
    if not sequences:
        # max() of an empty list would raise; an empty batch maps to an empty tensor.
        return torch.zeros(0, 0, vector_size)

    lengths = [len(seq) for seq in sequences]
    # Computed once instead of on every loop iteration.
    max_length = max(lengths)

    padded_seqs = torch.zeros(len(sequences), max_length, vector_size)
    for i, (seq, length) in enumerate(zip(sequences, lengths)):
        # Right-align the sequence; the first (max_length - length) rows stay zero.
        padded_seqs[i, max_length - length:] = seq
    return padded_seqs

@torch.no_grad()
def get_embedding_for_list_of_texts(
    list_of_texts: List[str],
    tokenizer=None,
    bert_model=None,
) -> "Tuple[torch.Tensor, torch.Tensor]":
    """
    Encode every text with the BERT encoder and return token-level and sentence-level embeddings.

    :param list_of_texts: non-empty list of sentences to embed
    :param tokenizer: optional already-loaded tokenizer; when None the
        "allegro/herbert-klej-cased-tokenizer-v1" tokenizer is loaded
    :param bert_model: optional already-loaded encoder; when None the
        "allegro/herbert-klej-cased-v1" model is loaded. The model is switched to
        eval mode and moved to DEVICE.
    :return: (token embeddings of all sentences, zero-padded by `merge` to a common
        length; tensor with one sentence-level embedding per text, stacked along dim 0).
        The sentence-level embedding is the second element of the model output
        (the pooled output, not an average over tokens — see the transformers docs).
    :raises ValueError: if list_of_texts is empty
    """
    if not list_of_texts:
        # Fail early and clearly, before any model is loaded.
        raise ValueError("list_of_texts must contain at least one text")

    # Loading is expensive; callers that embed repeatedly can pass loaded objects in.
    if tokenizer is None:
        tokenizer = XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
    if bert_model is None:
        bert_model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
    # Inputs are moved to DEVICE below, so the model has to live there as well.
    bert_model = bert_model.eval().to(DEVICE)

    list_of_sentence_embeddings = []
    list_of_sequence_embeddings = []

    for text in list_of_texts:
        encoded_input = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        outputs = bert_model(encoded_input)

        # Drop the batch dimension of size 1 from both outputs.
        list_of_sequence_embeddings.append(outputs[0].squeeze(dim=0))
        list_of_sentence_embeddings.append(outputs[1].squeeze(dim=0))

    seq_embeddings_tensor = merge(list_of_sequence_embeddings)
    sentence_embeddings_tensor = torch.stack(list_of_sentence_embeddings, dim=0)

    return seq_embeddings_tensor, sentence_embeddings_tensor

In [30]:
# Sample inputs for the batch embedding function (the last two texts are identical).
list_of_texts = ["Jebać pis", "Kurwa jest w pytę !!!! serio jest zajebiscie", "Kurwa jest w pytę !!!!", "Kurwa jest w pytę !!!!"]

# NOTE(review): the second returned value is the sentence-level tensor, but it is
# bound to the name `sequence_embeddings_tensor`; `predict_emotion` reads a
# variable called `sentence_embeddings_tensor`, which this cell does not define.
seq_embeddings_tensor, sequence_embeddings_tensor = get_embedding_for_list_of_texts(list_of_texts)

In [31]:
# Shape check of the sequence classifier output; the recorded result is
# torch.Size([4, 9]) for the 4 input texts.
new_model(seq_embeddings_tensor).size()

torch.Size([4, 9])

In [55]:
def predict_emotion(seq_embeddings_tensor: torch.tensor) -> List[Dict[str, float]]:
    """
    Apply the dense emotion classifier (HerbertEmotionClassifier) to a batch of embeddings.

    The classifier weights are loaded from the module-level `state_dict_path` on every call.

    :param seq_embeddings_tensor: batch of embeddings fed to the classifier; despite the
        parameter name this is presumably the sentence-level tensor of size (N, 768) —
        confirm against the HerbertEmotionClassifier definition
    :return: list of len N with one dictionary per input row, mapping each emotion label
        from `emotion_dict` to its softmax probability,
        e.g. [{"radosc": 0.2137, "smutek": 0.01, ...}, ...]
    """
    classification_model = HerbertEmotionClassifier()
    # map_location keeps loading working when the checkpoint was saved on another device.
    # NOTE(review): strict=False ignores missing/unexpected keys, so a checkpoint for a
    # different architecture loads without raising — make sure `state_dict_path` points
    # at the dense model when this runs.
    classification_model.load_state_dict(
        torch.load(state_dict_path, map_location=DEVICE), strict=False
    )
    classification_model = classification_model.eval().to(DEVICE)

    with torch.no_grad():
        # Use the function argument; the previous body read an unrelated global name.
        logits = classification_model(seq_embeddings_tensor.to(DEVICE))
        probabilities = Softmax(dim=1)(logits)

    labels = list(emotion_dict.values())
    # One {label: probability} dict per row of the batch.
    return [dict(zip(labels, row.tolist())) for row in probabilities]


In [41]:
# Checkpoint of the sequence classifier, read as a global by the function below.
# NOTE(review): this rebinds the same `state_dict_path` name that `predict_emotion`
# reads for the dense model, so results depend on cell execution order.
state_dict_path = "wust_seq_model.pth"

def predict_emotions_with_seq_model(sentence_embeddings_tensor: torch.tensor) -> List[Dict[str, float]]:
    """
    Apply the sequence emotion classifier (HerbertEmotionSequenceClassifier) to a batch of embeddings.

    The classifier weights are loaded from the module-level `state_dict_path` on every call.

    :param sentence_embeddings_tensor: batch fed to the sequence classifier. Despite the
        parameter name, this notebook calls the function with the padded token-level
        tensor returned by `get_embedding_for_list_of_texts` (its first return value)
    :return: list of len N with one dictionary per input row, mapping each emotion label
        from `emotion_dict` to its softmax probability,
        e.g. [{"radosc": 0.2137, "smutek": 0.01, ...}, ...]
    """
    seq_classification_model = HerbertEmotionSequenceClassifier()
    # map_location keeps loading working when the checkpoint was saved on another device.
    # NOTE(review): strict=False ignores missing/unexpected keys in the checkpoint.
    seq_classification_model.load_state_dict(
        torch.load(state_dict_path, map_location=DEVICE), strict=False
    )
    seq_classification_model = seq_classification_model.eval().to(DEVICE)

    with torch.no_grad():
        # The model lives on DEVICE, so the input is moved there too.
        logits = seq_classification_model(sentence_embeddings_tensor.to(DEVICE))
        probabilities = Softmax(dim=1)(logits)

    labels = list(emotion_dict.values())
    # One {label: probability} dict per row of the batch.
    return [dict(zip(labels, row.tolist())) for row in probabilities]

In [42]:
# Classify the padded token-level embeddings of the sample texts with the sequence model.
predictions = predict_emotions_with_seq_model(seq_embeddings_tensor)

In [43]:
# Print one {emotion: probability} dictionary per input text.
for pred in predictions:
    print(pred)

{'oczekiwanie': 0.014855558052659035, 'podziw': 0.002206078264862299, 'radosc': 0.0004334972763899714, 'smutek': 0.015141263604164124, 'strach': 1.0492610272194725e-05, 'wstret': 0.19942021369934082, 'zaskoczenie': 0.00010597382788546383, 'zlosc': 0.7676884531974792, 'neutralny': 0.0001384594797855243}
{'oczekiwanie': 0.06689931452274323, 'podziw': 0.056297726929187775, 'radosc': 0.018150344491004944, 'smutek': 0.13493552803993225, 'strach': 0.00028694135835394263, 'wstret': 0.10945132374763489, 'zaskoczenie': 0.0015101212775334716, 'zlosc': 0.6113536953926086, 'neutralny': 0.0011150126811116934}
{'oczekiwanie': 0.053864799439907074, 'podziw': 0.0026839866768568754, 'radosc': 0.0010273511288687587, 'smutek': 0.14232219755649567, 'strach': 0.0003101979673374444, 'wstret': 0.08498845249414444, 'zaskoczenie': 0.0005375996115617454, 'zlosc': 0.7137088775634766, 'neutralny': 0.0005565434694290161}
{'oczekiwanie': 0.053864799439907074, 'podziw': 0.0026839866768568754, 'radosc': 0.00102735112