### Classifier
- similarity score 1 [0, 0.5]: similarity between the diary textual embedding and the lyrics textual embedding
- similarity score 2 [0, 0.5]: s2 main [0, 0.375] + s2 sub [0, 0.125]
- s2 main [0, 0.375]: similarity between the diary emotion embedding and the lyrics emotion embedding
- s2 sub [0, 0.125]: indicator that the music feature label (0, 1, 2) equals the query emotion label (0, 1, 2)
- total score [0, 1] = similarity score 1 [0, 0.5] + similarity score 2 [0, 0.5]

<수정할 사항>
- encoder 추가: textual encoder
- pinecone db 3개: lyrics textual embedding / lyrics emotional embedding / music feature scores
- s2 수정: compute s2, compute ss, compute score method

In [1]:
import pinecone, torch, torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig, pipeline, RobertaForSequenceClassification
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm


In [6]:
# Smoke test for the sentence encoder: embed one sentence and print the
# shape of the resulting vector.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# A single string (not a list) is passed, so a single vector comes back.
sentences = "This is an example sentence"
embeddings = model.encode(sentences)

print(embeddings.shape)

(384,)


In [40]:
# Sanity check: cosine similarity between one query vector and a small
# batch of candidate vectors, computed with scikit-learn.
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import numpy as np
import torch

# One query vector and a batch of two candidates.
vector1 = torch.Tensor([1, 2, 3, 1, 2])
vector2 = torch.Tensor([[4, 5, 6, 8, 9], [7, 8, 9, 1, 2]])
print(vector2.shape)

# cosine_similarity works on 2-D inputs, so the query becomes a one-row matrix.
vector1 = vector1.reshape(1, -1)
print(vector1)
print(vector2)

# Keep one row per candidate, flattening everything else into the columns.
vector2 = vector2.reshape(vector2.shape[0], -1)
print(vector2)

# Pairwise similarity of the query against every candidate row.
# (The cosine-distance variant is intentionally not computed here.)
similarity = cosine_similarity(vector1, vector2)

print("Cosine similarity:", similarity.squeeze())

torch.Size([2, 5])
tensor([[1., 2., 3., 1., 2.]])
tensor([[4., 5., 6., 8., 9.],
        [7., 8., 9., 1., 2.]])
tensor([[4., 5., 6., 8., 9.],
        [7., 8., 9., 1., 2.]])
Cosine similarity: [0.8930478  0.89445674]


In [9]:
# Redefine classifier layer
# source code: https://github.com/huggingface/transformers/blob/84ea427f460ffc8d2ddc08a341ccda076c24fc1f/src/transformers/models/roberta/modeling_roberta.py#L1443
# config: https://huggingface.co/michellejieli/emotion_text_classifier/blob/main/config.json
# not used

class RobertaClassificationHead(nn.Module):
    """Classification head applied directly to a sentence-level feature vector.

    Unlike the upstream Hugging Face head, ``forward`` does not slice out the
    ``<s>`` token; whatever is passed in is used as-is. The output layer is
    fixed at 7 units rather than read from ``config.num_labels``.
    """

    def __init__(self, config):
        """Build the dense -> dropout -> projection stack.

        Args:
            config: object exposing ``hidden_size``, ``classifier_dropout``
                and ``hidden_dropout_prob``.
        """
        super().__init__()
        hidden = config.hidden_size
        self.dense = nn.Linear(hidden, hidden)

        # Fall back to the generic hidden dropout when no classifier-specific
        # dropout is configured.
        if config.classifier_dropout is not None:
            drop_prob = config.classifier_dropout
        else:
            drop_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(drop_prob)

        self.out_proj = nn.Linear(hidden, 7)  # config.num_labels

    def forward(self, features, **kwargs):
        """Return the 7 output logits for ``features``.

        Args:
            features: tensor whose last dimension is ``hidden_size``.
            **kwargs: accepted and ignored.
        """
        projected = self.dense(self.dropout(features))
        activated = torch.tanh(projected)
        return self.out_proj(self.dropout(activated))

In [1]:
class Classifier:
    """Rank songs for a diary entry by combining textual and emotional similarity.

    Usage: construct, call the instance with the diary text (this encodes the
    diary), then call ``compute_scores`` and ``return_topk_music``.

    Scoring::

        total = 0.5 * s1 + 0.5 * s2
        s2    = 0.75 * s2_main + 0.25 * s2_sub

    * ``s1``      - similarity score reported by the lyrics textual index.
    * ``s2_main`` - cosine similarity between the diary emotion embedding and
      the stored lyrics emotion embedding.
    * ``s2_sub``  - 1 when the stored music feature label equals the diary
      emotion label (0, 1 or 2), else 0.

    NOTE(review): the documented ranges ([0, 0.5] per half) only hold when the
    similarities are non-negative; cosine similarity can be negative.
    """

    TEXT_ENCODER_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
    EMOTION_MODEL_NAME = "j-hartmann/emotion-english-distilroberta-base"

    # Class ids produced by the emotion model, grouped into the three coarse
    # labels used for matching against music features.
    _NEGATIVE_LABELS = (0, 1, 2, 5)  # anger, disgust, fear, sadness
    _JOY_LABEL = 3                   # joy; everything else: neutral, surprise

    # Ids are fetched in chunks so a large ``max_k`` does not produce one
    # oversized fetch request.
    _FETCH_BATCH_SIZE = 100

    def __init__(self, lyrics_text_db_index_name: str, lyrics_emo_db_index_name: str,
                 features_db_index_name: str, *, api_key=None, environment=None):
        """Connect to the three Pinecone indexes.

        Args:
            lyrics_text_db_index_name: index holding lyrics textual embeddings.
            lyrics_emo_db_index_name: index holding lyrics emotion embeddings.
            features_db_index_name: index holding music feature labels.
            api_key: Pinecone API key. Defaults to the ``PINECONE_API_KEY``
                environment variable; credentials are deliberately not kept
                in source code.
            environment: Pinecone environment. Defaults to the
                ``PINECONE_ENVIRONMENT`` environment variable, then
                ``"gcp-starter"``.

        Raises:
            ValueError: if no API key is supplied or found in the environment.
        """
        import os

        api_key = api_key or os.environ.get("PINECONE_API_KEY")
        if not api_key:
            raise ValueError(
                "Pinecone API key missing: pass api_key=... or set PINECONE_API_KEY."
            )
        environment = environment or os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

        # db
        pinecone.init(api_key=api_key, environment=environment)
        self.pinecone_db_lyrics_text = pinecone.Index(lyrics_text_db_index_name)
        self.pinecone_db_lyrics_emo = pinecone.Index(lyrics_emo_db_index_name)
        self.pinecone_db_features = pinecone.Index(features_db_index_name)

        # vars
        self.total_score = 0.0
        self.diary_text_embedding = None
        self.diary_emo_embedding = None
        self.diary_emotion_scores = None

    def _load_encoders(self):
        """Load the text encoder and the emotion model once and cache them."""
        if getattr(self, "_emo_model", None) is not None:
            return
        self._text_encoder = SentenceTransformer(self.TEXT_ENCODER_NAME)
        self._emo_tokenizer = RobertaTokenizer.from_pretrained(self.EMOTION_MODEL_NAME)
        emo_model = RobertaForSequenceClassification.from_pretrained(self.EMOTION_MODEL_NAME)
        emo_model.eval()  # inference only: disable dropout
        self._emo_model = emo_model
        self.classifier = emo_model.classifier

    def __call__(self, diary_text: str):
        """Encode a diary entry and store its embeddings and emotion label.

        Sets ``diary_text_embedding`` (1-D float tensor from the sentence
        encoder), ``diary_emo_embedding`` (1-D float tensor: final-layer
        ``<s>`` token state of the emotion model) and ``diary_emotion_scores``
        (coarse label 0, 1 or 2).

        NOTE(review): the emotion embedding is the ``<s>`` hidden state, which
        is what the classification head consumes. The checkpoint ships no
        pooler weights, so a pooler output would be randomly initialised on
        every load. Confirm the lyrics emotion index was built the same way.

        Raises:
            ValueError: if ``diary_text`` is not a non-empty string.
        """
        if not isinstance(diary_text, str) or not diary_text.strip():
            raise ValueError("diary_text must be a non-empty string.")

        self._load_encoders()

        ## textual embedding (the sentence encoder takes the raw string)
        text_vec = self._text_encoder.encode(diary_text)
        self.diary_text_embedding = torch.as_tensor(text_vec, dtype=torch.float32)

        ## emotional embedding + emotion logits from a single forward pass
        encoded = self._emo_tokenizer(diary_text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self._emo_model(**encoded, output_hidden_states=True)
        self.diary_emo_embedding = outputs.hidden_states[-1][:, 0, :].squeeze(0)

        # get emotion score and classify
        label = int(outputs.logits.argmax(dim=-1).item())
        if label in self._NEGATIVE_LABELS:
            self.diary_emotion_scores = 0
        elif label == self._JOY_LABEL:
            self.diary_emotion_scores = 1
        else:  # neutral, surprise
            self.diary_emotion_scores = 2

    def compute_scores(self, max_k=100, include_values = True):
        """Return ``(total_score, selected_ids)`` for the encoded diary.

        Args:
            max_k: number of candidates retrieved from the textual index.
            include_values: forwarded to the textual index query.

        Returns:
            ``total_score`` is a ``(1, k)`` tensor aligned with the list
            ``selected_ids`` (``k <= max_k``).
        """
        # get score
        s1_scores, selected_ids = self.compute_s1(max_k, include_values)
        s2_scores = self.compute_s2(selected_ids)

        # scaling: each half contributes 50% of the total
        total_score = s1_scores * 0.5 + s2_scores * 0.5

        return total_score, selected_ids

    def compute_s1(self, max_k = 100, include_values = True):
        '''
        compute similarity score (1) between diary textual embedding and lyrics textual embedding.
        The dimension of both embeddings is 384.

        Returns ``(s1_scores, selected_ids)``: a ``(1, k)`` tensor holding the
        scores reported by the index and the matching ids in the same order.

        Raises RuntimeError if no diary has been encoded yet.
        '''
        if getattr(self, "diary_text_embedding", None) is None:
            raise RuntimeError("Call the classifier with a diary text before computing scores.")

        # The index expects a plain list of floats.
        query_vec = self.diary_text_embedding
        if hasattr(query_vec, "tolist"):
            query_vec = query_vec.tolist()

        # Retrieve
        matches = self.pinecone_db_lyrics_text.query(
            vector=query_vec,
            top_k=max_k,
            include_values=include_values,
        )['matches']  # [{id, score, values}, ...]

        selected_ids = [match['id'] for match in matches]
        scores = [float(match['score']) for match in matches]
        s1_scores = torch.tensor(scores, dtype=torch.float32).reshape(1, len(scores))

        return s1_scores, selected_ids

    def compute_s2(self, selected_ids):
        '''
        compute similarity score (2), the weighted sum of s2 main and s2 sub.
        After the final 0.5 scaling they cover [0, 0.375] and [0, 0.125].
        '''
        main = self.compute_s2_main(selected_ids=selected_ids)
        sub = self.compute_s2_sub(selected_ids=selected_ids)
        return main * 0.75 + sub * 0.25

    def _fetch_values(self, index, selected_ids):
        """Return the stored vector values for each id, in ``selected_ids`` order.

        Raises:
            KeyError: if any id is absent from ``index``.
        """
        wanted = list(selected_ids)
        fetched = {}
        for start in range(0, len(wanted), self._FETCH_BATCH_SIZE):
            batch = wanted[start:start + self._FETCH_BATCH_SIZE]
            fetched.update(index.fetch(ids=batch)['vectors'])

        missing = [vec_id for vec_id in wanted if vec_id not in fetched]
        if missing:
            raise KeyError(f"ids not found in index: {missing}")
        return [list(fetched[vec_id]['values']) for vec_id in wanted]

    def compute_s2_main(self, selected_ids, include_values = True):
        '''
        compute similarity score between diary emotion embedding and lyrics emotion embedding.
        The dimension of both embeddings is 768.
        It scales to [0, 0.375] in the total score.

        ``include_values`` is kept for backward compatibility and ignored:
        vectors are fetched by id, which always returns their values.
        Returns a ``(1, len(selected_ids))`` float tensor.
        '''
        if not selected_ids:
            return torch.zeros((1, 0), dtype=torch.float32)
        if getattr(self, "diary_emo_embedding", None) is None:
            raise RuntimeError("Call the classifier with a diary text before computing scores.")

        # Retrieve: one row per selected id
        lyrics_emo = torch.tensor(
            self._fetch_values(self.pinecone_db_lyrics_emo, selected_ids),
            dtype=torch.float32,
        )
        diary_emo = torch.as_tensor(self.diary_emo_embedding, dtype=torch.float32).detach()

        # (1, d) broadcast against (n, d) -> n similarities
        sims = torch.nn.functional.cosine_similarity(diary_emo.reshape(1, -1), lyrics_emo, dim=1)
        return sims.reshape(1, len(selected_ids))

    def compute_s2_sub(self, selected_ids, include_values = True):
        '''
        indicator function for music feature.
        if the music feature label (0,1,2) matches the query emotion label (0,1,2), then 1 otherwise 0.
        It scales to [0, 0.125] in the total score.

        ``include_values`` is kept for backward compatibility and ignored.
        Each feature vector must hold exactly one value (the label).
        Returns a ``(1, len(selected_ids))`` float tensor of 0.0 / 1.0.
        '''
        if not selected_ids:
            return torch.zeros((1, 0), dtype=torch.float32)
        if getattr(self, "diary_emotion_scores", None) is None:
            raise RuntimeError("Call the classifier with a diary text before computing scores.")

        # Retrieve: the reshape fails loudly if a vector holds more than one value
        labels = torch.tensor(
            self._fetch_values(self.pinecone_db_features, selected_ids),
            dtype=torch.float32,
        ).reshape(1, len(selected_ids))

        return (labels == float(self.diary_emotion_scores)).to(torch.float32)

    def return_topk_music(self, scores, selected_ids, top_k = 5):
        """Return the ids of the ``top_k`` highest-scoring songs, best first.

        Args:
            scores: tensor of scores aligned with ``selected_ids``.
            selected_ids: candidate ids as returned by ``compute_scores``.
            top_k: number of ids wanted; fewer are returned when there are
                fewer candidates.
        """
        flat_scores = torch.as_tensor(scores).flatten()
        k = min(top_k, flat_scores.numel(), len(selected_ids))
        if k <= 0:
            return []

        best = torch.topk(flat_scores, k).indices.tolist()
        topk_music_ids = [selected_ids[pos] for pos in best]

        return topk_music_ids

In [None]:
# Example: rank songs for one diary entry.
# Classifier needs three index names: lyrics textual embeddings,
# lyrics emotion embeddings, and music feature labels.
lyrics_text_db_index_name = 'your lyrics textual-embedding db index name'
lyrics_emo_db_index_name = 'your lyrics emotion-embedding db index name'
features_db_index_name = 'your features db index name'

classifier = Classifier(lyrics_text_db_index_name, lyrics_emo_db_index_name, features_db_index_name)

# Calling the instance encodes the diary; scores are computed afterwards.
classifier('I mean I just needed a little sympathy, but no one gave me a single comfort word.')
scores, selected_ids = classifier.compute_scores()
topk_music_ids = classifier.return_topk_music(scores, selected_ids, top_k = 10)

# You can retrieve the music from the pinecone db by using index afterwards.